From 2f7237ebfa05545754eabb5061686659c7f46753 Mon Sep 17 00:00:00 2001 From: Stefano Pigozzi Date: Wed, 1 Feb 2023 17:46:25 +0100 Subject: [PATCH] Make some progress --- poetry.lock | 47 ++++++++++++++++++- pyproject.toml | 1 + unimore_bda_6/__main__.py | 10 +++- unimore_bda_6/analysis.py | 14 ------ unimore_bda_6/analysis/__init__.py | 0 unimore_bda_6/analysis/vanilla.py | 58 ++++++++++++++++++++++++ unimore_bda_6/config.py | 33 +++++++++++++- unimore_bda_6/database.py | 73 ++++++++++++++++++++++++++---- unimore_bda_6/log.py | 35 ++++++++++++++ 9 files changed, 243 insertions(+), 28 deletions(-) delete mode 100644 unimore_bda_6/analysis.py create mode 100644 unimore_bda_6/analysis/__init__.py create mode 100644 unimore_bda_6/analysis/vanilla.py create mode 100644 unimore_bda_6/log.py diff --git a/poetry.lock b/poetry.lock index 6ce832e..6430b86 100644 --- a/poetry.lock +++ b/poetry.lock @@ -47,6 +47,24 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "coloredlogs" +version = "15.0.1" +description = "Colored terminal output for Python's logging module" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"}, + {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"}, +] + +[package.dependencies] +humanfriendly = ">=9.1" + +[package.extras] +cron = ["capturer (>=2.4)"] + [[package]] name = "dnspython" version = "2.3.0" @@ -68,6 +86,21 @@ idna = ["idna (>=2.1,<4.0)"] trio = ["trio (>=0.14,<0.23)"] wmi = ["wmi (>=1.5.1,<2.0.0)"] +[[package]] +name = "humanfriendly" +version = "10.0" +description = "Human friendly output for text interfaces using Python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, + {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, +] + +[package.dependencies] +pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""} + [[package]] name = "joblib" version = "1.2.0" @@ -247,6 +280,18 @@ ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identit snappy = ["python-snappy"] zstd = ["zstandard"] +[[package]] +name = "pyreadline3" +version = "3.4.1" +description = "A python implementation of GNU readline." +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"}, + {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"}, +] + [[package]] name = "regex" version = "2022.10.31" @@ -369,4 +414,4 @@ telegram = ["requests"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "baf82d3820a17be0b6fcd6b31d1627a6ab198c9c094a2ee0fc2b0caca2659a72" +content-hash = "4545b19bfa3d0ad9a489c2bee037d0c317dd34ba5b9745375d3566d5fc68ef55" diff --git a/pyproject.toml b/pyproject.toml index 06b5a44..5480ea6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,6 +135,7 @@ python = "^3.10" pymongo = "^4.3.3" nltk = "^3.8.1" cfig = {extras = ["cli"], version = "^0.3.0"} +coloredlogs = "^15.0.1" diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py index d577504..eb4e616 100644 --- a/unimore_bda_6/__main__.py +++ b/unimore_bda_6/__main__.py @@ -1,9 +1,15 @@ -from .database import create_mongo_client_from_config +from .config import config +from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla +from .log import install_log_handler def main(): - pass + model = create_model_vanilla() + train_model_vanilla(model) + evaluate_model_vanilla(model) if __name__ == "__main__": + install_log_handler() + config.proxies.resolve() main() diff --git a/unimore_bda_6/analysis.py b/unimore_bda_6/analysis.py deleted file mode 100644 index 7a64c78..0000000 --- a/unimore_bda_6/analysis.py +++ /dev/null @@ -1,14 +0,0 @@ -import nltk -import nltk.sentiment - - -def create_sentiment_analysis_model() -> ntlk.sentiment.SentimentAnalyzer: - analyzer = nltk.sentiment.SentimentAnalyzer() - - -def train(): - ... - - -def test(): - ... diff --git a/unimore_bda_6/analysis/__init__.py b/unimore_bda_6/analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/unimore_bda_6/analysis/vanilla.py b/unimore_bda_6/analysis/vanilla.py new file mode 100644 index 0000000..c190b78 --- /dev/null +++ b/unimore_bda_6/analysis/vanilla.py @@ -0,0 +1,58 @@ +import nltk +import nltk.classify +import nltk.sentiment +import nltk.sentiment.util +import logging + +from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set + + +log = logging.getLogger(__name__) + + +def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer: + log.debug("Creating model...") + model = nltk.sentiment.SentimentAnalyzer() + log.debug("Created model %s!", model) + return model + + +def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None: + # TODO: This doesn't work yet + + with mongo_reviews_collection_from_config() as reviews: + training_set = get_reviews_training_set(reviews) + + log.debug("Marking negations...") + training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set)) + + log.debug("Extracting tokens...") + training_tokens = model.all_words(training_negated_set, labeled=False) + + log.debug("Counting unigrams...") + training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4) + + log.debug("Configuring model features...") + model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams) + training_set = model.apply_features(documents=training_set) + + log.info("Training model...") + model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set) + + +def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer): + with mongo_reviews_collection_from_config() as reviews: + test_set = get_reviews_test_set(reviews) + + log.info("Evaluating model...") + model.evaluate(test_set) + + # TODO + breakpoint() + + +__all__ = ( + "create_model_vanilla", + "train_model_vanilla", + "evaluate_model_vanilla", +) diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py index eb93fc0..758b973 100644 --- a/unimore_bda_6/config.py +++ b/unimore_bda_6/config.py @@ -16,7 +16,7 @@ def MONGO_PORT(val: str | None) -> int: """ The port of the MongoDB database to connect to. """ - if not val: + if val is None: return 27017 try: return int(val) @@ -24,12 +24,38 @@ def MONGO_PORT(val: str | None) -> int: raise cfig.InvalidValueError("Not an int.") +@config.optional() +def SAMPLE_MODE(val: str | None) -> str: + """ + Whether `$sample` or `$limit` should be used to aggregate the training and test sets. + `$limit` is much faster, but not truly random, while `$sample` is completely random. + """ + if val is None: + return "$sample" + if val not in ["$sample", "$limit"]: + raise cfig.InvalidValueError("Neither $sample or $limit.") + return val + + @config.optional() def TRAINING_SET_SIZE(val: str | None) -> int: """ The number of reviews from each category to fetch for the training set. """ - if not val: + if val is None: + return 1000 + try: + return int(val) + except ValueError: + raise cfig.InvalidValueError("Not an int.") + + +@config.optional() +def TEST_SET_SIZE(val: str | None) -> int: + """ + The number of reviews to fetch for the test set. + """ + if val is None: return 1000 try: return int(val) @@ -41,4 +67,7 @@ __all__ = ( "config", "MONGO_HOST", "MONGO_PORT", + "SAMPLE_MODE", + "TRAINING_SET_SIZE", + "TEST_SET_SIZE", ) diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py index 593ef71..c152d24 100644 --- a/unimore_bda_6/database.py +++ b/unimore_bda_6/database.py @@ -3,8 +3,12 @@ import pymongo import pymongo.collection import contextlib import bson +import logging +import random -from .config import MONGO_HOST, MONGO_PORT +from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE + +log = logging.getLogger(__name__) class Review(t.TypedDict): @@ -25,12 +29,18 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]: """ Create a new MongoDB client and yield it. """ + log.debug("Opening connection to MongoDB...") client = pymongo.MongoClient( - host=MONGO_HOST.__resolved__, - port=MONGO_PORT.__resolved__, + host=MONGO_HOST.__wrapped__, + port=MONGO_PORT.__wrapped__, ) + log.info("Opened connection to MongoDB: %s", client) + yield client + + log.info("Closing connection to MongoDB: %s", client) client.close() + log.debug("Closed connection to MongoDB!") @contextlib.contextmanager @@ -39,16 +49,39 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it. """ with mongo_client_from_config() as db: - yield db.reviews.reviews + log.debug("Accessing the reviews collection...") + collection = db.reviews.reviews + log.debug("Collection accessed successfully: %s", collection) + yield collection + + +def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list: + """ + Create pipeline stages for sampling random documents, either with true randomness or by skipping a random amount of them. + """ + if SAMPLE_MODE.__wrapped__ == "$sample": + return [ + {"$sample": {"size": amount}}, + ] + elif SAMPLE_MODE.__wrapped__ == "$limit": + log.warning("USE_SAMPLE is disabled, sampling documents using $skip and $limit.") + skip = random.randint(0, collection.estimated_document_count(maxTimeMS=100)) + return [ + {"$skip": skip}, + {"$limit": amount}, + ] + else: + raise ValueError("Unknown sample mode", SAMPLE_MODE) def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]: """ Get ``amount`` random reviews from the ``reviews`` collection. """ + log.debug("Getting a sample of %d reviews...", amount) return reviews.aggregate([ - {"$sample": {"size": amount}} + *pipeline_sample(reviews, amount), ]) @@ -56,10 +89,11 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo """ Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection. """ + log.debug("Getting a sample of %d reviews with %d stars...", amount, rating) return reviews.aggregate([ {"$match": {"overall": rating}}, - {"$sample": {"size": amount}}, + *pipeline_sample(reviews, amount), ]) @@ -67,6 +101,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun """ Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection. """ + log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount) return reviews.aggregate([ {"$match": @@ -77,14 +112,18 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun ] }, }, - {"$sample": {"size": amount}}, + *pipeline_sample(reviews, amount), ]) -def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]: +def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]: """ Get the subset of reviews that should act as training set. """ + log.info("Building training set...") + + # Get the amount from the config + amount: int = TRAINING_SET_SIZE.__wrapped__ # Handle odd numbers positive_amount: int = amount // 2 @@ -100,9 +139,25 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int return both -def get_reviews_test_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]: +def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]: """ Get the subset of reviews that should act as test set. """ + log.info("Building test set...") + + amount: int = TEST_SET_SIZE.__wrapped__ + return sample_reviews_by_rating_polar(reviews, amount) + + +__all__ = ( + "Review", + "mongo_client_from_config", + "mongo_reviews_collection_from_config", + "sample_reviews", + "sample_reviews_by_rating", + "sample_reviews_by_rating_polar", + "get_reviews_training_set", + "get_reviews_test_set", +) diff --git a/unimore_bda_6/log.py b/unimore_bda_6/log.py new file mode 100644 index 0000000..e01899c --- /dev/null +++ b/unimore_bda_6/log.py @@ -0,0 +1,35 @@ +import logging +import coloredlogs + +log = logging.getLogger(__name__) + + +def install_log_handler(logger: logging.Logger = None): + if logger is None: + logger = logging.getLogger("unimore_bda_6") + + coloredlogs.install( + logger=logger, + level="DEBUG", + fmt="{asctime} | {name:<32} | {levelname:>8} | {message}", + style="{", + level_styles=dict( + debug=dict(color="white"), + info=dict(color="cyan"), + warning=dict(color="yellow"), + error=dict(color="red"), + critical=dict(color="red", bold=True), + ), + field_styles=dict( + asctime=dict(color='magenta'), + levelname=dict(color='blue', bold=True), + name=dict(color='blue'), + ), + isatty=True, + ) + log.info("Installed custom log handler!") + + +__all__ = ( + "install_log_handler", +)