From 2f7237ebfa05545754eabb5061686659c7f46753 Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi <me@steffo.eu>
Date: Wed, 1 Feb 2023 17:46:25 +0100
Subject: [PATCH] Make some progress

---
 poetry.lock                        | 47 ++++++++++++++++++-
 pyproject.toml                     |  1 +
 unimore_bda_6/__main__.py          | 10 +++-
 unimore_bda_6/analysis.py          | 14 ------
 unimore_bda_6/analysis/__init__.py |  0
 unimore_bda_6/analysis/vanilla.py  | 58 ++++++++++++++++++++++++
 unimore_bda_6/config.py            | 33 +++++++++++++-
 unimore_bda_6/database.py          | 73 ++++++++++++++++++++++++++----
 unimore_bda_6/log.py               | 35 ++++++++++++++
 9 files changed, 243 insertions(+), 28 deletions(-)
 delete mode 100644 unimore_bda_6/analysis.py
 create mode 100644 unimore_bda_6/analysis/__init__.py
 create mode 100644 unimore_bda_6/analysis/vanilla.py
 create mode 100644 unimore_bda_6/log.py

diff --git a/poetry.lock b/poetry.lock
index 6ce832e..6430b86 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -47,6 +47,24 @@ files = [
     {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]
 
+[[package]]
+name = "coloredlogs"
+version = "15.0.1"
+description = "Colored terminal output for Python's logging module"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+files = [
+    {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"},
+    {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"},
+]
+
+[package.dependencies]
+humanfriendly = ">=9.1"
+
+[package.extras]
+cron = ["capturer (>=2.4)"]
+
 [[package]]
 name = "dnspython"
 version = "2.3.0"
@@ -68,6 +86,21 @@ idna = ["idna (>=2.1,<4.0)"]
 trio = ["trio (>=0.14,<0.23)"]
 wmi = ["wmi (>=1.5.1,<2.0.0)"]
 
+[[package]]
+name = "humanfriendly"
+version = "10.0"
+description = "Human friendly output for text interfaces using Python"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+files = [
+    {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"},
+    {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"},
+]
+
+[package.dependencies]
+pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""}
+
 [[package]]
 name = "joblib"
 version = "1.2.0"
@@ -247,6 +280,18 @@ ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identit
 snappy = ["python-snappy"]
 zstd = ["zstandard"]
 
+[[package]]
+name = "pyreadline3"
+version = "3.4.1"
+description = "A python implementation of GNU readline."
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"},
+    {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"},
+]
+
 [[package]]
 name = "regex"
 version = "2022.10.31"
@@ -369,4 +414,4 @@ telegram = ["requests"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "baf82d3820a17be0b6fcd6b31d1627a6ab198c9c094a2ee0fc2b0caca2659a72"
+content-hash = "4545b19bfa3d0ad9a489c2bee037d0c317dd34ba5b9745375d3566d5fc68ef55"
diff --git a/pyproject.toml b/pyproject.toml
index 06b5a44..5480ea6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -135,6 +135,7 @@ python = "^3.10"
 pymongo = "^4.3.3"
 nltk = "^3.8.1"
 cfig = {extras = ["cli"], version = "^0.3.0"}
+coloredlogs = "^15.0.1"
 
 
 
diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index d577504..eb4e616 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -1,9 +1,15 @@
-from .database import create_mongo_client_from_config
+from .config import config
+from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla
+from .log import install_log_handler
 
 
 def main():
-    pass
+    model = create_model_vanilla()
+    train_model_vanilla(model)
+    evaluate_model_vanilla(model)
 
 
 if __name__ == "__main__":
+    install_log_handler()
+    config.proxies.resolve()
     main()
diff --git a/unimore_bda_6/analysis.py b/unimore_bda_6/analysis.py
deleted file mode 100644
index 7a64c78..0000000
--- a/unimore_bda_6/analysis.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import nltk
-import nltk.sentiment
-
-
-def create_sentiment_analysis_model() -> ntlk.sentiment.SentimentAnalyzer:
-    analyzer = nltk.sentiment.SentimentAnalyzer()
-
-
-def train():
-    ...
-
-
-def test():
-    ...
diff --git a/unimore_bda_6/analysis/__init__.py b/unimore_bda_6/analysis/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/unimore_bda_6/analysis/vanilla.py b/unimore_bda_6/analysis/vanilla.py
new file mode 100644
index 0000000..c190b78
--- /dev/null
+++ b/unimore_bda_6/analysis/vanilla.py
@@ -0,0 +1,58 @@
+import nltk
+import nltk.classify
+import nltk.sentiment
+import nltk.sentiment.util
+import logging
+
+from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set
+
+
+log = logging.getLogger(__name__)
+
+
+def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer:
+    log.debug("Creating model...")
+    model = nltk.sentiment.SentimentAnalyzer()  # built with no arguments; train_model_vanilla() adds features and trains it
+    log.debug("Created model %s!", model)
+    return model
+
+
+def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None:
+    """Fetch the training set, derive unigram features from it and train ``model`` with a Naive Bayes classifier."""
+    # TODO: This doesn't work yet
+    with mongo_reviews_collection_from_config() as reviews:
+        training_set = list(get_reviews_training_set(reviews))  # iterated twice below, after the client has been closed
+
+    log.debug("Marking negations...")
+    training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set))
+
+    log.debug("Extracting tokens...")
+    training_tokens = model.all_words(training_negated_set, labeled=False)
+
+    log.debug("Counting unigrams...")
+    training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4)
+
+    log.debug("Configuring model features...")
+    model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams)
+    training_set = model.apply_features(documents=training_set)
+
+    log.info("Training model...")
+    model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set)
+
+
+def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer):
+    """Fetch the configured test set, run ``model.evaluate`` on it, then log and return whatever it produced."""
+    with mongo_reviews_collection_from_config() as reviews:
+        test_set = list(get_reviews_test_set(reviews))  # drain the aggregate cursor while the client is still open
+
+    log.info("Evaluating model...")
+    results = model.evaluate(test_set)
+    log.info("Evaluation results: %s", results)  # TODO: replaces an interactive breakpoint(); decide how to report results
+    return results
+
+
+__all__ = (
+    "create_model_vanilla",
+    "train_model_vanilla",
+    "evaluate_model_vanilla",
+)
diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py
index eb93fc0..758b973 100644
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@@ -16,7 +16,7 @@ def MONGO_PORT(val: str | None) -> int:
     """
     The port of the MongoDB database to connect to.
     """
-    if not val:
+    if val is None:
         return 27017
     try:
         return int(val)
@@ -24,12 +24,38 @@ def MONGO_PORT(val: str | None) -> int:
         raise cfig.InvalidValueError("Not an int.")
 
 
+@config.optional()
+def SAMPLE_MODE(val: str | None) -> str:
+    """
+    Whether `$sample` or `$limit` should be used to aggregate the training and test sets.
+    `$limit` is much faster, but not truly random, while `$sample` is completely random.
+    """
+    # An unset option falls back to `$sample`.
+    mode = "$sample" if val is None else val
+    if mode in ("$sample", "$limit"):
+        return mode
+    raise cfig.InvalidValueError("Neither $sample or $limit.")
+
+
 @config.optional()
 def TRAINING_SET_SIZE(val: str | None) -> int:
     """
     The number of reviews from each category to fetch for the training set.
     """
-    if not val:
+    if val is None:
+        return 1000
+    try:
+        return int(val)
+    except ValueError:
+        raise cfig.InvalidValueError("Not an int.")
+
+
+@config.optional()
+def TEST_SET_SIZE(val: str | None) -> int:
+    """
+    The number of reviews to fetch for the test set.
+    """
+    if val is None:
         return 1000
     try:
         return int(val)
@@ -41,4 +67,7 @@ __all__ = (
     "config",
     "MONGO_HOST",
     "MONGO_PORT",
+    "SAMPLE_MODE",
+    "TRAINING_SET_SIZE",
+    "TEST_SET_SIZE",
 )
diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py
index 593ef71..c152d24 100644
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@@ -3,8 +3,12 @@ import pymongo
 import pymongo.collection
 import contextlib
 import bson
+import logging
+import random
 
-from .config import MONGO_HOST, MONGO_PORT
+from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE
+
+log = logging.getLogger(__name__)
 
 
 class Review(t.TypedDict):
@@ -25,12 +29,18 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
     """
     Create a new MongoDB client and yield it.
     """
+    log.debug("Opening connection to MongoDB...")
     client = pymongo.MongoClient(
-        host=MONGO_HOST.__resolved__,
-        port=MONGO_PORT.__resolved__,
+        host=MONGO_HOST.__wrapped__,
+        port=MONGO_PORT.__wrapped__,
     )
+    log.info("Opened connection to MongoDB: %s", client)
+
     yield client
+
+    log.info("Closing connection to MongoDB: %s", client)
     client.close()
+    log.debug("Closed connection to MongoDB!")
 
 
 @contextlib.contextmanager
@@ -39,16 +49,39 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
     Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
     """
     with mongo_client_from_config() as db:
-        yield db.reviews.reviews
+        log.debug("Accessing the reviews collection...")
+        collection = db.reviews.reviews
+        log.debug("Collection accessed successfully: %s", collection)
+        yield collection
+
+
+def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list:
+    """
+    Create the pipeline stages that pick ``amount`` documents, according to ``SAMPLE_MODE``:
+    a single ``$sample`` stage, or a ``$skip`` by a random offset followed by a ``$limit``; any other mode raises ValueError.
+    NOTE(review): the offset is drawn from the whole collection's estimated size, so it may still overshoot after a ``$match``.
+    """
+    mode = SAMPLE_MODE.__wrapped__
+    if mode == "$sample":
+        return [{"$sample": {"size": amount}}]
+    elif mode == "$limit":
+        log.warning("SAMPLE_MODE is $limit, sampling documents using $skip and $limit.")
+        # randint() includes its upper bound: leave room for ``amount`` documents after the offset.
+        count = collection.estimated_document_count(maxTimeMS=100)
+        skip = random.randint(0, max(0, count - amount))
+        return [{"$skip": skip}, {"$limit": amount}]
+    else:
+        raise ValueError("Unknown sample mode", mode)
 
 
 def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
     """
     Get ``amount`` random reviews from the ``reviews`` collection.
     """
+    log.debug("Getting a sample of %d reviews...", amount)
 
     return reviews.aggregate([
-        {"$sample": {"size": amount}}
+        *pipeline_sample(reviews, amount),
     ])
 
 
@@ -56,10 +89,11 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
     """
     Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
     """
+    log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
 
     return reviews.aggregate([
         {"$match": {"overall": rating}},
-        {"$sample": {"size": amount}},
+        *pipeline_sample(reviews, amount),
     ])
 
 
@@ -67,6 +101,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
     """
     Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
     """
+    log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
 
     return reviews.aggregate([
         {"$match":
@@ -77,14 +112,18 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
                 ]
             },
         },
-        {"$sample": {"size": amount}},
+        *pipeline_sample(reviews, amount),
     ])
 
 
-def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
+def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
     """
     Get the subset of reviews that should act as training set.
     """
+    log.info("Building training set...")
+
+    # Get the amount from the config
+    amount: int = TRAINING_SET_SIZE.__wrapped__
 
     # Handle odd numbers
     positive_amount: int = amount // 2
@@ -100,9 +139,25 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int
     return both
 
 
-def get_reviews_test_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
+def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
     """
     Get the subset of reviews that should act as test set.
     """
 
+    log.info("Building test set...")
+
+    amount: int = TEST_SET_SIZE.__wrapped__
+
     return sample_reviews_by_rating_polar(reviews, amount)
+
+
+__all__ = (
+    "Review",
+    "mongo_client_from_config",
+    "mongo_reviews_collection_from_config",
+    "sample_reviews",
+    "sample_reviews_by_rating",
+    "sample_reviews_by_rating_polar",
+    "get_reviews_training_set",
+    "get_reviews_test_set",
+)
diff --git a/unimore_bda_6/log.py b/unimore_bda_6/log.py
new file mode 100644
index 0000000..e01899c
--- /dev/null
+++ b/unimore_bda_6/log.py
@@ -0,0 +1,35 @@
+import logging
+import coloredlogs
+
+log = logging.getLogger(__name__)
+
+
+def install_log_handler(logger: logging.Logger | None = None) -> None:
+    """Install a ``coloredlogs`` handler at DEBUG level on ``logger``; when omitted, the ``unimore_bda_6`` logger is used."""
+    if logger is None:
+        logger = logging.getLogger("unimore_bda_6")
+    coloredlogs.install(
+        logger=logger,
+        level="DEBUG",
+        fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
+        style="{",
+        level_styles=dict(
+            debug=dict(color="white"),
+            info=dict(color="cyan"),
+            warning=dict(color="yellow"),
+            error=dict(color="red"),
+            critical=dict(color="red", bold=True),
+        ),
+        field_styles=dict(
+            asctime=dict(color='magenta'),
+            levelname=dict(color='blue', bold=True),
+            name=dict(color='blue'),
+        ),
+        isatty=True,  # NOTE(review): presumably forces coloured output even when no terminal is detected - confirm
+    )
+    log.info("Installed custom log handler!")
+
+
+__all__ = (
+    "install_log_handler",
+)