Make some progress

2024-11-21 23:44:19 +00:00 · 2023-02-01 17:46:25 +01:00 · 2023-02-01 17:46:25 +01:00 · 2f7237ebfa
commit 2f7237ebfa
parent 0f37d206a1
9 changed files with 243 additions and 28 deletions
--- a/poetry.lock
+++ b/poetry.lock
@ -47,6 +47,24 @@ files = [
    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]

+[[package]]
+name = "coloredlogs"
+version = "15.0.1"
+description = "Colored terminal output for Python's logging module"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+files = [
+    {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"},
+    {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"},
+]
+
+[package.dependencies]
+humanfriendly = ">=9.1"
+
+[package.extras]
+cron = ["capturer (>=2.4)"]
+
 [[package]]
 name = "dnspython"
 version = "2.3.0"
@ -68,6 +86,21 @@ idna = ["idna (>=2.1,<4.0)"]
 trio = ["trio (>=0.14,<0.23)"]
 wmi = ["wmi (>=1.5.1,<2.0.0)"]

+[[package]]
+name = "humanfriendly"
+version = "10.0"
+description = "Human friendly output for text interfaces using Python"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+files = [
+    {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"},
+    {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"},
+]
+
+[package.dependencies]
+pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""}
+
 [[package]]
 name = "joblib"
 version = "1.2.0"
@ -247,6 +280,18 @@ ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identit
 snappy = ["python-snappy"]
 zstd = ["zstandard"]

+[[package]]
+name = "pyreadline3"
+version = "3.4.1"
+description = "A python implementation of GNU readline."
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"},
+    {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"},
+]
+
 [[package]]
 name = "regex"
 version = "2022.10.31"
@ -369,4 +414,4 @@ telegram = ["requests"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "baf82d3820a17be0b6fcd6b31d1627a6ab198c9c094a2ee0fc2b0caca2659a72"
+content-hash = "4545b19bfa3d0ad9a489c2bee037d0c317dd34ba5b9745375d3566d5fc68ef55"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -135,6 +135,7 @@ python = "^3.10"
 pymongo = "^4.3.3"
 nltk = "^3.8.1"
 cfig = {extras = ["cli"], version = "^0.3.0"}
+coloredlogs = "^15.0.1"



--- a/unimore_bda_6/main.py
+++ b/unimore_bda_6/main.py
@ -1,9 +1,15 @@
-from .database import create_mongo_client_from_config
+from .config import config
+from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla
+from .log import install_log_handler


 def main():
-    pass
+    model = create_model_vanilla()
+    train_model_vanilla(model)
+    evaluate_model_vanilla(model)


 if __name__ == "__main__":
+    install_log_handler()
+    config.proxies.resolve()
    main()
--- a/unimore_bda_6/analysis.py
+++ b/unimore_bda_6/analysis.py
@ -1,14 +0,0 @@
-import nltk
-import nltk.sentiment
-
-
-def create_sentiment_analysis_model() -> ntlk.sentiment.SentimentAnalyzer:
-    analyzer = nltk.sentiment.SentimentAnalyzer()
-
-
-def train():
-    ...
-
-
-def test():
-    ...
--- a/unimore_bda_6/analysis/__init__.py
+++ b/unimore_bda_6/analysis/__init__.py
--- a/unimore_bda_6/analysis/vanilla.py
+++ b/unimore_bda_6/analysis/vanilla.py
@ -0,0 +1,58 @@
+import nltk
+import nltk.classify
+import nltk.sentiment
+import nltk.sentiment.util
+import logging
+
+from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set
+
+
+log = logging.getLogger(__name__)
+
+
+def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer:
+    """
+    Instantiate a new, untrained :class:`nltk.sentiment.SentimentAnalyzer` and return it.
+    """
+    log.debug("Creating model...")
+    analyzer = nltk.sentiment.SentimentAnalyzer()
+    log.debug("Created model %s!", analyzer)
+    return analyzer
+
+
+def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None:
+    """
+    Train ``model`` with a Naive Bayes classifier on the training set fetched from the reviews collection.
+
+    The training set is read while the database connection is open; negations are then marked, the unigrams
+    occurring at least 4 times are registered as features on ``model``, and the model is trained.
+    """
+    # TODO: This doesn't work yet
+    # NOTE(review): presumably the fetched reviews still have to be converted into the document shape that
+    # the NLTK helpers expect before this can run end to end -- confirm against the NLTK documentation.
+
+    with mongo_reviews_collection_from_config() as reviews:
+        # Materialize the set before the connection is closed: it is iterated twice below, so a lazy
+        # iterable would be exhausted after the first pass and could not be read once the client is gone.
+        training_set = list(get_reviews_training_set(reviews))
+
+    log.debug("Marking negations...")
+    training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set))
+
+    log.debug("Extracting tokens...")
+    training_tokens = model.all_words(training_negated_set, labeled=False)
+
+    log.debug("Counting unigrams...")
+    training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4)
+
+    log.debug("Configuring model features...")
+    model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams)
+    training_features = model.apply_features(documents=training_set)
+
+    log.info("Training model...")
+    model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_features)
+
+
+def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer):
+    """
+    Evaluate ``model`` against the test set fetched from the reviews collection and log the outcome.
+    """
+    with mongo_reviews_collection_from_config() as reviews:
+        # Read the whole test set while the connection is still open, so that evaluation does not depend
+        # on a lazy iterable that outlives the client.
+        test_set = list(get_reviews_test_set(reviews))
+
+    log.info("Evaluating model...")
+    results = model.evaluate(test_set)
+
+    # TODO
+    # Report the outcome through the logger instead of stopping in the debugger, so that
+    # non-interactive runs can complete.
+    log.info("Evaluation results: %s", results)
+
+
+__all__ = (
+    "create_model_vanilla",
+    "train_model_vanilla",
+    "evaluate_model_vanilla",
+)
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@ -16,7 +16,7 @@ def MONGO_PORT(val: str | None) -> int:
    """
    The port of the MongoDB database to connect to.
    """
-    if not val:
+    if val is None:
        return 27017
    try:
        return int(val)
@ -24,12 +24,38 @@ def MONGO_PORT(val: str | None) -> int:
        raise cfig.InvalidValueError("Not an int.")


+@config.optional()
+def SAMPLE_MODE(val: str | None) -> str:
+    """
+    Whether `$sample` or `$limit` should be used to aggregate the training and test sets.
+    `$limit` is much faster, but not truly random, while `$sample` is completely random.
+    """
+    # The first entry doubles as the default used when the option is not set.
+    allowed_modes = ("$sample", "$limit")
+    if val is None:
+        return allowed_modes[0]
+    if val in allowed_modes:
+        return val
+    # Anything else is rejected.
+    raise cfig.InvalidValueError("Neither $sample or $limit.")
+
+
@config.optional()
 def TRAINING_SET_SIZE(val: str | None) -> int:
    """
    The number of reviews from each category to fetch for the training set.
    """
-    if not val:
+    if val is None:
+        return 1000
+    try:
+        return int(val)
+    except ValueError:
+        raise cfig.InvalidValueError("Not an int.")
+
+
+@config.optional()
+def TEST_SET_SIZE(val: str | None) -> int:
+    """
+    The number of reviews to fetch for the test set.
+    """
+    if val is None:
        return 1000
    try:
        return int(val)
@ -41,4 +67,7 @@ __all__ = (
    "config",
    "MONGO_HOST",
    "MONGO_PORT",
+    "SAMPLE_MODE",
+    "TRAINING_SET_SIZE",
+    "TEST_SET_SIZE",
 )
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@ -3,8 +3,12 @@ import pymongo
 import pymongo.collection
 import contextlib
 import bson
+import logging
+import random

-from .config import MONGO_HOST, MONGO_PORT
+from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE
+
+log = logging.getLogger(__name__)


 class Review(t.TypedDict):
@ -25,12 +29,18 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
    """
    Create a new MongoDB client and yield it.
    """
+    log.debug("Opening connection to MongoDB...")
    client = pymongo.MongoClient(
-        host=MONGO_HOST.__resolved__,
-        port=MONGO_PORT.__resolved__,
+        host=MONGO_HOST.__wrapped__,
+        port=MONGO_PORT.__wrapped__,
    )
+    log.info("Opened connection to MongoDB: %s", client)
+
    yield client
+
+    log.info("Closing connection to MongoDB: %s", client)
    client.close()
+    log.debug("Closed connection to MongoDB!")


@contextlib.contextmanager
@ -39,16 +49,39 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
    """
    with mongo_client_from_config() as db:
-        yield db.reviews.reviews
+        log.debug("Accessing the reviews collection...")
+        collection = db.reviews.reviews
+        log.debug("Collection accessed successfully: %s", collection)
+        yield collection
+
+
+def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list:
+    """
+    Create the pipeline stages that select ``amount`` documents, according to the configured ``SAMPLE_MODE``.
+
+    With ``$sample``, a single ``$sample`` stage of size ``amount`` is returned.
+    With ``$limit``, a ``$skip`` stage with a random offset followed by a ``$limit`` stage is returned.
+
+    :param collection: The collection the pipeline will run on, used to estimate how many documents can be skipped.
+    :param amount: The number of documents the stages should select.
+    :return: The list of stages, to be spliced into an aggregation pipeline.
+    :raises ValueError: If the configured sample mode is neither ``$sample`` nor ``$limit``.
+    """
+    mode = SAMPLE_MODE.__wrapped__
+
+    if mode == "$sample":
+        return [
+            {"$sample": {"size": amount}},
+        ]
+
+    if mode == "$limit":
+        log.warning("SAMPLE_MODE is set to $limit, sampling documents using $skip and $limit.")
+        total = collection.estimated_document_count(maxTimeMS=100)
+        # Cap the offset so that ``amount`` documents are still left after skipping; an offset anywhere up to
+        # the full document count could leave fewer documents than requested, or none at all.
+        # NOTE(review): when a ``$match`` stage precedes these stages the matching documents are fewer than
+        # ``total``, so the result may still come up short -- confirm whether that is acceptable.
+        skip = random.randint(0, max(0, total - amount))
+        return [
+            {"$skip": skip},
+            {"$limit": amount},
+        ]
+
+    # Report the resolved value rather than the configuration proxy object.
+    raise ValueError("Unknown sample mode", mode)


 def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
    """
    Get ``amount`` random reviews from the ``reviews`` collection.
    """
+    log.debug("Getting a sample of %d reviews...", amount)

    return reviews.aggregate([
-        {"$sample": {"size": amount}}
+        *pipeline_sample(reviews, amount),
    ])


@ -56,10 +89,11 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
    """
    Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
    """
+    log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)

    return reviews.aggregate([
        {"$match": {"overall": rating}},
-        {"$sample": {"size": amount}},
+        *pipeline_sample(reviews, amount),
    ])


@ -67,6 +101,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
    """
    Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
    """
+    log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)

    return reviews.aggregate([
        {"$match":
@ -77,14 +112,18 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
                ]
            },
        },
-        {"$sample": {"size": amount}},
+        *pipeline_sample(reviews, amount),
    ])


-def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
+def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
    """
    Get the subset of reviews that should act as training set.
    """
+    log.info("Building training set...")
+
+    # Get the amount from the config
+    amount: int = TRAINING_SET_SIZE.__wrapped__

    # Handle odd numbers
    positive_amount: int = amount // 2
@ -100,9 +139,25 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int
    return both


-def get_reviews_test_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
+def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
    """
    Get the subset of reviews that should act as test set.
    """

+    log.info("Building test set...")
+
+    amount: int = TEST_SET_SIZE.__wrapped__
+
    return sample_reviews_by_rating_polar(reviews, amount)
+
+
+__all__ = (
+    "Review",
+    "mongo_client_from_config",
+    "mongo_reviews_collection_from_config",
+    "sample_reviews",
+    "sample_reviews_by_rating",
+    "sample_reviews_by_rating_polar",
+    "get_reviews_training_set",
+    "get_reviews_test_set",
+)
--- a/unimore_bda_6/log.py
+++ b/unimore_bda_6/log.py
@ -0,0 +1,35 @@
+import logging
+import coloredlogs
+
+log = logging.getLogger(__name__)
+
+
+def install_log_handler(logger: logging.Logger | None = None):
+    """
+    Install a colored log handler at ``DEBUG`` level on the given logger.
+
+    :param logger: The logger to install the handler on; when omitted, the ``unimore_bda_6`` logger is used.
+    """
+    if logger is None:
+        logger = logging.getLogger("unimore_bda_6")
+
+    coloredlogs.install(
+        logger=logger,
+        level="DEBUG",
+        fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
+        style="{",
+        level_styles=dict(
+            debug=dict(color="white"),
+            info=dict(color="cyan"),
+            warning=dict(color="yellow"),
+            error=dict(color="red"),
+            critical=dict(color="red", bold=True),
+        ),
+        field_styles=dict(
+            asctime=dict(color='magenta'),
+            levelname=dict(color='blue', bold=True),
+            name=dict(color='blue'),
+        ),
+        isatty=True,
+    )
+    log.info("Installed custom log handler!")
+
+
+__all__ = (
+    "install_log_handler",
+)