1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-25 09:14:19 +00:00

First commit

This commit is contained in:
Steffo 2023-02-01 02:33:42 +01:00
commit 31e813bc15
Signed by: steffo
GPG key ID: 2A24051445686895
18 changed files with 584 additions and 0 deletions

93
.github/dependabot.yml vendored Normal file
View file

@ -0,0 +1,93 @@
# Dependabot configuration file
# See: https://docs.github.com/en/code-security/supply-chain-security/keeping-your-dependencies-updated-automatically/configuration-options-for-dependency-updates#allow
version: 2
updates:
# We're using Poetry
- package-ecosystem: pip
# The root directory of the project
directory: "/"
# Check every day for updates at 08:00 UTC
schedule:
interval: "daily"
time: "08:00"
timezone: "UTC"
# Do not alert for indirect dependencies, as there may be too many
allow:
- dependency-type: direct
# Use Gitmoji in the commit message
commit-message:
prefix: "⬆️ "
include: "scope"
# Set the pull request label
labels:
- "dependencies"
# Stay updated to the main branch
rebase-strategy: auto
# Pip does not support any other versioning strategy
versioning-strategy: lockfile-only
# Additionally, keep GitHub Actions updated
- package-ecosystem: github-actions
# The root directory of the actions
directory: "/"
# Check every day for updates at 08:00 UTC
schedule:
interval: "daily"
time: "08:00"
timezone: "UTC"
# GitHub Actions only has direct dependencies
allow:
- dependency-type: direct
# Use Gitmoji in the commit message
commit-message:
prefix: "🔨️ "
include: "scope"
# Set the pull request label
labels:
- "automation"
# Stay updated to the main branch
rebase-strategy: auto
# Finally, keep Git submodules updated
- package-ecosystem: gitsubmodule
# The root directory of the repository
directory: "/"
# Check every day for updates at 08:00 UTC
schedule:
interval: "daily"
time: "08:00"
timezone: "UTC"
# Git submodules only have direct dependencies
allow:
- dependency-type: direct
# Use Gitmoji in the commit message
commit-message:
prefix: "⬆️ "
include: "scope"
# Set the pull request label
labels:
- "dependencies"
# Stay updated to the main branch
rebase-strategy: auto

169
.gitignore vendored Normal file
View file

@ -0,0 +1,169 @@
# Gitignore file
# See https://git-scm.com/docs/gitignore for more details
###################
# Project ignores #
###################
# Add your own ignores here!
##################
# Python ignores #
##################
# From https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

8
.idea/.gitignore vendored Normal file
View file

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

6
.idea/compiler.xml Normal file
View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="TypeScriptCompiler">
<option name="recompileOnChanges" value="true" />
</component>
</project>

7
.idea/discord.xml Normal file
View file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DiscordProjectSettings">
<option name="show" value="ASK" />
<option name="description" value="" />
</component>
</project>

10
.idea/markdown.xml Normal file
View file

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MarkdownSettings">
<enabledExtensions>
<entry key="MermaidLanguageExtension" value="false" />
<entry key="PlantUMLLanguageExtension" value="false" />
</enabledExtensions>
<option name="splitLayout" value="SHOW_EDITOR" />
</component>
</project>

10
.idea/misc.xml Normal file
View file

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DiscordProjectSettings">
<option name="show" value="ASK" />
<option name="description" value="" />
</component>
<component name="ProjectRootManager">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

8
.idea/modules.xml Normal file
View file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/unimore-bda-6.iml" filepath="$PROJECT_DIR$/unimore-bda-6.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

10
.readthedocs.yml Normal file
View file

@ -0,0 +1,10 @@
version: 2
python:
install:
- path: "."
build:
os: "ubuntu-20.04"
tools:
python: "3.10"

5
.vscode/extensions.json vendored Normal file
View file

@ -0,0 +1,5 @@
{
"recommendations": [
"ms-python.python"
]
}

13
.vscode/settings.json vendored Normal file
View file

@ -0,0 +1,13 @@
{
"files.exclude": {
".idea/": true,
"**/*.iml": true,
"**/.pytest_cache/": true,
"**/__pycache__/": true,
".venv/": true,
"dist/": true,
"out/": true,
"poetry.lock": true,
},
"python.analysis.extraPaths": ["./unimore_bda_6"],
}

24
Dockerfile Normal file
View file

@ -0,0 +1,24 @@
# Multi-stage image for the `unimore_bda_6` package.
# Each stage builds on the previous one and adds a single concern.

# Base image and image metadata.
FROM python AS metadata
LABEL maintainer="Stefano Pigozzi <me@steffo.eu>"

# Directory inside the image where the project is placed.
FROM metadata AS workdir
WORKDIR /usr/src/unimore_bda_6

# Poetry is used to install the project and its dependencies.
FROM workdir AS poetry
RUN pip install "poetry"

# Install the non-development dependencies before copying the sources,
# so that this layer depends only on pyproject.toml and poetry.lock.
FROM poetry AS dependencies
COPY pyproject.toml ./pyproject.toml
COPY poetry.lock ./poetry.lock
# `--no-dev` is deprecated in favor of `--without dev`; since Poetry is
# installed unpinned above, use the current spelling.
RUN poetry install --no-root --without dev

# Copy the sources and install the project itself.
FROM dependencies AS package
COPY . .
RUN poetry install

# Disable Python's output buffering so that logs show up immediately.
FROM package AS environment
ENV PYTHONUNBUFFERED=1

# Run the package as a module; extra arguments can be passed through CMD.
FROM environment AS entrypoint
ENTRYPOINT ["poetry", "run", "python", "-m", "unimore_bda_6"]
CMD []

27
README.md Normal file
View file

@ -0,0 +1,27 @@
[ Stefano Pigozzi | Tema Text Analytics | Big Data Analytics | A.A. 2022/2023 | Unimore ]
# WIP
> ### Sentiment analysis su recensioni Amazon
>
> Modificare l’esercizio di sentiment analysis sulle review Amazon svolto in classe e verificare l’efficacia del metodo effettuando queste varianti:
>
> 1. Utilizzare come tokenizer il “sentiment tokenizer” di Christopher Potts (link disponibile nelle slide del corso);
> 2. Modificare il dataset recuperando anche recensioni a 2, 3 e 4 stelle ed effettuare una classificazione a più classi (es. 5 classi di sentiment corrispondenti al numero di stelle delle recensioni).
>
> Effettuare quindi un confronto di efficacia tra queste varianti e la versione originale vista in classe.
>
> Valutare anche l’inclusione di altre feature estratte dai dati, con l’obiettivo di aumentare l’efficacia.
>
> * È necessario effettuare tutti i test su un numero significativo di run (es., almeno 50), scegliendo ogni volta in maniera casuale la composizione di test-set e training-set a partire dall’insieme di post estratti (è possibile utilizzare le feature automatiche di cross validation viste per scikit-learn)
> * È possibile (e gradito) estendere in ampiezza la propria analisi:
> * utilizzare e confrontare una o più delle librerie di ML viste a lezione (NLTK/scikitlearn/XGBoost/Tensorflow) (NOTA: per le tracce 2 e 3 è necessario sperimentare anche almeno una libreria diversa da NLTK)
> * utilizzare e confrontare diversi classificatori tra quelli offerti (es. quelli citati a lezione in scikit-learn) e una o più delle tecniche citate/viste a lezione (es. codifica del testo tramite TF-IDF, word embeddings per tensorflow, hyper-parameter tuning per scikit-learn, tecniche specifiche per sent. analysis, …)
> * utilizzare librerie per l’elaborazione del testo alternative (es. SpaCy https://spacy.io/ ) per estrarre feature aggiuntive, valutandone l’effetto sul modello
> * in generale: studiare, riassumere brevemente e applicare eventuali altre tecniche o strumenti ritenuti utili all’obiettivo (cioè, migliorare l’efficacia del modello proposto).
>
> Consegna: PDF commentato con discussione e codice Python (includere dati e codice anche in un file .txt per facilitarne il testing)
>
> Per quanto riguarda il codice Python, è possibile (e gradito) produrre e consegnare un notebook jupyter .ipynb
> (https://jupyter.org/) invece di codice .py e relativi commenti separati su PDF (per comodità di consultazione,
> consegnare comunque anche una stampa PDF del notebook oltre al notebook stesso).

167
pyproject.toml Normal file
View file

@ -0,0 +1,167 @@
[build-system]
#######################
# Python build system #
#######################
# The build system to use when installing this package.
# Used when installing the package with `pip install .`.
# See also: https://www.python.org/dev/peps/pep-0517/
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.poetry]
###################
# Poetry settings #
###################
# See https://python-poetry.org/docs/pyproject/ for more details!
# The name of your project.
# Ensure that it is available on PyPI: https://pypi.org/
name = "unimore_bda_6"
# The version of the package.
version = "0.1.0"
# A brief, one-sentence description about your project.
description = "Sesto progetto di Big Data Analytics"
# A list of the authors of the project.
authors = [
"Stefano Pigozzi <me@steffo.eu>",
]
# A list of maintainers of the project.
# Often, it is the same as the authors list.
maintainers = [
"Stefano Pigozzi <me@steffo.eu>",
]
# The license of the package.
# Uses SPDX format: https://spdx.org/licenses/
license = ""
# The README file.
readme = "README.md"
# The URL of the project website.
# Not the GitHub repository!
# homepage = "https://example.org/"
# The URL of the project repository.
repository = "https://github.com/Steffo99/unimore-bda-6"
# The URL of the project documentation location.
# documentation = "https://example.org/docs"
# Up to five keywords related to your project.
# See also: https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/#keywords
keywords = [
"unimore",
"big data analytics",
"sentiment analysis",
"amazon reviews",
"university project",
]
# Any number of trove classifiers that apply to your project.
# See the list at: https://pypi.org/classifiers/
classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
]
# ADVANCED: specify the packages exported by your project
# See also: https://python-poetry.org/docs/pyproject/#packages
# packages = [
# # Regular packages
# { include = "OTHER_PACKAGE_NAME" },
# # Namespace packages
# { include = "EXTEND/OTHER/NAMESPACE/**/*.py" }
# ]
# ADVANCED: include additional files in the build
# include = [
# "FILENAME.txt"
# ]
# ADVANCED: exclude files from the build
# exclude = [
# "PACKAGE_NAME/test.py"
# ]
[tool.poetry.urls]
##################
# URLs #
##################
# Additional project URLs in a name → link mapping.
[tool.poetry.scripts]
#####################
# Scripts #
#####################
# Poetry can create "binaries" to add to the PATH when the project is installed.
# They are specified in the form of a mapping with the command name as key and the function to execute as value.
# If you are building a library, comment this.
# If you are building an application, replace PACKAGE-NAME and PACKAGE_NAME with your package name in kebab-case and snake_case respectively.
# PACKAGE-NAME = "PACKAGE_NAME.__main__:main"
[tool.poetry.dependencies]
##########################
# Dependencies #
##########################
# A mapping of dependencies of the project
# It should be maintained by `poetry add` / `poetry remove`, but it currently adds things after all comments...
# You can manually specify allowed version numbers:
# * means "any release"
# * → any
# ^X.X.X means "newer releases with this major version"
# ^3.10.1 → == 3 && >= 3.10.1
# ~X.X.X means "newer releases with this minor version"
# ~3.10.1 → == 3.10 && >= 3.10.1
# nothing means "this specific release"
# 3.10.1 → == 3.10.1
python = "^3.10"
[tool.poetry.dev-dependencies]
##############################
# Development dependencies #
##############################
# Same as above, but these dependencies are installed only if the project is being installed in development mode.
# They are excluded from the final build.
[tool.poetry.extras]
####################
# Package extras #
####################
# ADVANCED: specify optional dependency groups.
# See: https://python-poetry.org/docs/pyproject/#extras
[tool.poetry.plugins]
#####################
# Poetry plugins #
#####################
# ADVANCED: extend Poetry's functionality.
# See: https://python-poetry.org/docs/pyproject/#plugins
[tool.pytest.ini_options]
#########################
# Pytest configuration #
#########################

8
unimore-bda-6.iml Normal file
View file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="GENERAL_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View file

@ -0,0 +1,5 @@
# If you are building a **library**, use this file to export objects!
# `__all__` lists the public names of this module, i.e. the ones exported by
# a wildcard import (`from ... import *`).
# It is currently an empty tuple: nothing is exported until names are added.
__all__ = (
    # "name_to_export",
)

View file

@ -0,0 +1,8 @@
# If you are building an **application**, use this file to run code!
def main():
    """
    Entry point of the application.

    Currently a placeholder: it performs no work and returns ``None``.
    """
    pass


# Run the entry point only when this file is executed as a script or module,
# not when it is imported.
if __name__ == "__main__":
    main()