From 31e813bc15b794a7ed8886c860fd72e78af56a41 Mon Sep 17 00:00:00 2001 From: Stefano Pigozzi Date: Wed, 1 Feb 2023 02:33:42 +0100 Subject: [PATCH] First commit --- .github/dependabot.yml | 93 +++++++++++++++++++++ .gitignore | 169 ++++++++++++++++++++++++++++++++++++++ .idea/.gitignore | 8 ++ .idea/compiler.xml | 6 ++ .idea/discord.xml | 7 ++ .idea/markdown.xml | 10 +++ .idea/misc.xml | 10 +++ .idea/modules.xml | 8 ++ .idea/vcs.xml | 6 ++ .readthedocs.yml | 10 +++ .vscode/extensions.json | 5 ++ .vscode/settings.json | 13 +++ Dockerfile | 24 ++++++ README.md | 27 ++++++ pyproject.toml | 167 +++++++++++++++++++++++++++++++++++++ unimore-bda-6.iml | 8 ++ unimore_bda_6/__init__.py | 5 ++ unimore_bda_6/__main__.py | 8 ++ 18 files changed, 584 insertions(+) create mode 100644 .github/dependabot.yml create mode 100644 .gitignore create mode 100644 .idea/.gitignore create mode 100644 .idea/compiler.xml create mode 100644 .idea/discord.xml create mode 100644 .idea/markdown.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 .readthedocs.yml create mode 100644 .vscode/extensions.json create mode 100644 .vscode/settings.json create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 pyproject.toml create mode 100644 unimore-bda-6.iml create mode 100644 unimore_bda_6/__init__.py create mode 100644 unimore_bda_6/__main__.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..0918bb0 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,93 @@ +# Dependabot configuration file +# See: https://docs.github.com/en/code-security/supply-chain-security/keeping-your-dependencies-updated-automatically/configuration-options-for-dependency-updates#allow + +version: 2 + +updates: + + # We're using Poetry + - package-ecosystem: pip + + # The root directory of the project + directory: "/" + + # Check every day for updates at 08:00 UTC + schedule: + 
interval: "daily" + time: "08:00" + timezone: "UTC" + + # Do not alert for indirect dependencies, as there may be too many + allow: + - dependency-type: direct + + # Use Gitmoji in the commit message + commit-message: + prefix: "⬆️ " + include: "scope" + + # Set the pull request label + labels: + - "dependencies" + + # Stay updated to the main branch + rebase-strategy: auto + + # Pip does not support any other versioning strategy + versioning-strategy: lockfile-only + + # Additionally, keep GitHub Actions updated + - package-ecosystem: github-actions + + # The root directory of the actions + directory: "/" + + # Check every day for updates at 08:00 UTC + schedule: + interval: "daily" + time: "08:00" + timezone: "UTC" + + # GitHub Actions only has direct dependencies + allow: + - dependency-type: direct + + # Use Gitmoji in the commit message + commit-message: + prefix: "🔨️ " + include: "scope" + + # Set the pull request label + labels: + - "automation" + + # Stay updated to the main branch + rebase-strategy: auto + + # Finally, keep Git submodules updated + - package-ecosystem: gitsubmodule + + # The root directory of the repository + directory: "/" + + # Check every day for updates at 08:00 UTC + schedule: + interval: "daily" + time: "08:00" + timezone: "UTC" + + # Git submodules only have direct dependencies + allow: + - dependency-type: direct + + # Use Gitmoji in the commit message + commit-message: + prefix: "⬆️ " + include: "scope" + + # Set the pull request label + labels: + - "dependencies" + + # Stay updated to the main branch + rebase-strategy: auto diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a2c3170 --- /dev/null +++ b/.gitignore @@ -0,0 +1,169 @@ +# Gitignore file +# See https://git-scm.com/docs/gitignore for more details + +################### +# Project ignores # +################### + +# Add your own ignores here! 
+ + + +################## +# Python ignores # +################## + +# From https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 0000000..1a2fb33 --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/discord.xml b/.idea/discord.xml new file mode 100644 index 0000000..30bab2a --- /dev/null +++ b/.idea/discord.xml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/.idea/markdown.xml b/.idea/markdown.xml new file mode 100644 index 0000000..064f873 --- /dev/null +++ b/.idea/markdown.xml @@ -0,0 +1,10 @@ + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..2ca2389 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,10 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..07cb2d5 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..0190f6e --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,10 @@ +version: 2 + +python: + install: + - path: "." 
+ +build: + os: "ubuntu-20.04" + tools: + python: "3.10" \ No newline at end of file diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..3a390d4 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,5 @@ +{ + "recommendations": [ + "ms-python.python" + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..85b78f1 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,13 @@ +{ + "files.exclude": { + ".idea/": true, + "**/*.iml": true, + "**/.pytest_cache/": true, + "**/__pycache__/": true, + ".venv/": true, + "dist/": true, + "out/": true, + "poetry.lock": true, + }, + "python.analysis.extraPaths": ["./unimore_bda_6"], +} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..040d308 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,24 @@ +FROM python AS metadata +LABEL maintainer="Your Name " + +FROM metadata AS workdir +WORKDIR /usr/src/PACKAGE_NAME + +FROM workdir AS poetry +RUN pip install "poetry" + +FROM poetry AS dependencies +COPY pyproject.toml ./pyproject.toml +COPY poetry.lock ./poetry.lock +RUN poetry install --no-root --no-dev + +FROM dependencies AS package +COPY . . +RUN poetry install + +FROM package AS environment +ENV PYTHONUNBUFFERED=1 + +FROM environment AS entrypoint +ENTRYPOINT ["poetry", "run", "python", "-m", "PACKAGE_NAME"] +CMD [] diff --git a/README.md b/README.md new file mode 100644 index 0000000..8e03b3c --- /dev/null +++ b/README.md @@ -0,0 +1,27 @@ +[ Stefano Pigozzi | Tema Text Analytics | Big Data Analytics | A.A. 2022/2023 | Unimore ] + +# WIP + +> ### Sentiment analysis su recensioni Amazon +> +> Modificare l’esercizio di sentiment analysis sulle review Amazon svolto in classe e verificare l’efficacia del metodo effettuando queste varianti: +> +> 1. Utilizzare come tokenizer il “sentiment tokenizer” di Christopher Potts (link disponibile nelle slide del corso); +> 2. 
Modificare il dataset recuperando anche recensioni a 2, 3 e 4 stelle ed effettuare una classificazione a più classi (es. 5 classi di sentiment corrispondenti al numero di stelle delle recensioni). +> +> Effettuare quindi un confronto di efficacia tra queste varianti e la versione originale vista in classe. +> +> Valutare anche l’inclusione di altre feature estratte dai dati, con l’obiettivo di aumentare l’efficacia. +> +> * E’ necessario effettuare tutti i test su un numero significativo di run (es., almeno 50), scegliendo ogni volta in maniera casuale la composizione di test-set e training-set a partire dall’insieme di post estratti (è possibile utilizzare le feature automatiche di cross validation viste per scikit-learn) +> * E’ possibile (e gradito) estendere in ampiezza la propria analisi: +> * utilizzare e confrontare una o più delle librerie di ML viste a lezione (NLTK/scikitlearn/XGBoost/Tensorflow) (NOTA: per le tracce 2 e 3 è necessario sperimentare anche almeno una libreria diversa da NLTK) +> * utilizzare e confrontare diversi classificatori tra quelli offerti (es. quelli citati a lezione in scikit-learn) e una o più delle tecniche citate/viste a lezione (es. codifica del testo tramite TF-IDF, word embeddings per tensorflow, hyper-parameter tuning per scikit-learn, tecniche specifiche per sent. analysis, …) +> * utilizzare librerie per l’elaborazione del testo alternative (es. SpaCy https://spacy.io/ ) per estrarre feature aggiuntive, valutandone l’effetto sul modello +> * in generale: studiare, riassumere brevemente e applicare eventuali altre tecniche o strumenti ritenuti utili all’obiettivo (cioè, migliorare l’efficacia del modello proposto). 
+> +> Consegna: PDF commentato con discussione e codice Python (includere dati e codice anche in un file .txt per facilitarne il testing) +> +> Per quanto riguarda il codice Python, è possibile (e gradito) produrre e consegnare un notebook jupyter .ipynb +> (https://jupyter.org/) invece di codice .py e relativi commenti separati su PDF (per comodità di consultazione, +> consegnare comunque anche una stampa PDF del notebook oltre al notebook stesso). diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c29cc6b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,167 @@ +[build-system] +####################### +# Python build system # +####################### +# The build system to use when installing this package. +# Used when installing the package with `pip install .`. +# See also: https://www.python.org/dev/peps/pep-0517/ + +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + + + +[tool.poetry] +################### +# Poetry settings # +################### +# See https://python-poetry.org/docs/pyproject/ for more details! + +# The name of your project. +# Ensure that it is available on PyPI: https://pypi.org/ +name = "unimore_bda_6" + +# The version of the package. +version = "0.1.0" + +# A brief, one-sentence description about your project. +description = "Sesto progetto di Big Data Analytics" + +# A list of the authors of the project. +authors = [ + "Stefano Pigozzi ", +] + +# A list of maintainers of the project. +# Often, it is the same as the authors list. +maintainers = [ + "Stefano Pigozzi ", +] + +# The license of the package. +# Uses SPDX format: https://spdx.org/licenses/ +license = "" + +# The README file. +readme = "README.md" + +# The URL of the project website. +# Not the GitHub repository! +# homepage = "https://example.org/" + +# The URL of the project repository. +repository = "https://github.com/Steffo99/unimore-bda-6" + +# The URL of the project documentation location. 
+# documentation = "https://example.org/docs" + +# Up to five keywords related to your project. +# See also: https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/#keywords +keywords = [ + "unimore", + "big data analytics", + "sentiment analysis", + "amazon reviews", + "university project", +] + +# Any number of trove classifiers that apply to your project. +# See the list at: https://pypi.org/classifiers/ +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", +] + +# ADVANCED: specify the packages exported by your project +# See also: https://python-poetry.org/docs/pyproject/#packages +# packages = [ +# # Regular packages +# { include = "OTHER_PACKAGE_NAME" }, +# # Namespace packages +# { include = "EXTEND/OTHER/NAMESPACE/**/*.py" } +# ] + +# ADVANCED: include additional files in the build +# include = [ +# "FILENAME.txt" +# ] + +# ADVANCED: exclude files from the build +# exclude = [ +# "PACKAGE_NAME/test.py" +# ] + + + +[tool.poetry.urls] +################## +# URLs # +################## +# Additional project URLs in a name → link mapping. + + + +[tool.poetry.scripts] +##################### +# Scripts # +##################### +# Poetry can create "binaries" to add to the PATH when the project is installed. +# They are specified in the form of a mapping with the command name as key and the function to execute as value. + +# If you are building a library, comment this. +# If you are building an application, replace PACKAGE-NAME and PACKAGE_NAME with your package name in kebab-case and snake_case respectively. + +# PACKAGE-NAME = "PACKAGE_NAME.__main__:main" + + + +[tool.poetry.dependencies] +########################## +# Dependencies # +########################## +# A mapping of dependencies of the project +# It should be maintained by `poetry add` / `poetry remove`, but it currently adds things after all comments... 
+# You can manually specify allowed version numbers: +# * means "any release" +# * → any +# ^X.X.X means "newer releases with this major version" +# ^3.10.1 → == 3 && >= 3.10.1 +# ~X.X.X means "newer releases with this minor version" +# ~3.10.1 → == 3.10 && >= 3.10.1 +# nothing means "this specific release" +# 3.10.1 → == 3.10.1 + +python = "^3.10" + + + +[tool.poetry.dev-dependencies] +############################## +# Development dependencies # +############################## +# Same as above, but these dependencies are installed only if the project is being installed in development mode. +# They are excluded from the final build. + + +[tool.poetry.extras] +#################### +# Package extras # +#################### +# ADVANCED: specify optional dependency groups. +# See: https://python-poetry.org/docs/pyproject/#extras + + + +[tool.poetry.plugins] +##################### +# Poetry plugins # +##################### +# ADVANCED: extend Poetry's functionality. +# See: https://python-poetry.org/docs/pyproject/#plugins + + + +[tool.pytest.ini_options] +######################### +# Pytest configuration # +######################### diff --git a/unimore-bda-6.iml b/unimore-bda-6.iml new file mode 100644 index 0000000..9a5cfce --- /dev/null +++ b/unimore-bda-6.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/unimore_bda_6/__init__.py b/unimore_bda_6/__init__.py new file mode 100644 index 0000000..8c571c4 --- /dev/null +++ b/unimore_bda_6/__init__.py @@ -0,0 +1,5 @@ +# If you are building a **library**, use this file to export objects! + +__all__ = ( + # "", +) diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py new file mode 100644 index 0000000..a015126 --- /dev/null +++ b/unimore_bda_6/__main__.py @@ -0,0 +1,8 @@ +# If you are building an **application**, use this file to run code! + +def main(): + pass + + +if __name__ == "__main__": + main()