Make some progress

2025-03-14 04:33:31 +00:00 · 2023-02-01 17:46:25 +01:00 · 2023-02-01 17:46:25 +01:00 · 2f7237ebfa
commit 2f7237ebfa
parent 0f37d206a1
9 changed files with 243 additions and 28 deletions
--- a/poetry.lock
+++ b/poetry.lock
@ -47,6 +47,24 @@ files = [
    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]
 [[package]]
 name = "coloredlogs"
 version = "15.0.1"
 description = "Colored terminal output for Python's logging module"
 category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 files = [
    {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"},
    {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"},
 ]
 [package.dependencies]
 humanfriendly = ">=9.1"
 [package.extras]
 cron = ["capturer (>=2.4)"]
 [[package]]
 name = "dnspython"
 version = "2.3.0"
@ -68,6 +86,21 @@ idna = ["idna (>=2.1,<4.0)"]
 trio = ["trio (>=0.14,<0.23)"]
 wmi = ["wmi (>=1.5.1,<2.0.0)"]
 [[package]]
 name = "humanfriendly"
 version = "10.0"
 description = "Human friendly output for text interfaces using Python"
 category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 files = [
    {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"},
    {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"},
 ]
 [package.dependencies]
 pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""}
 [[package]]
 name = "joblib"
 version = "1.2.0"
@ -247,6 +280,18 @@ ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identit
 snappy = ["python-snappy"]
 zstd = ["zstandard"]
 [[package]]
 name = "pyreadline3"
 version = "3.4.1"
 description = "A python implementation of GNU readline."
 category = "main"
 optional = false
 python-versions = "*"
 files = [
    {file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"},
    {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"},
 ]
 [[package]]
 name = "regex"
 version = "2022.10.31"
@ -369,4 +414,4 @@ telegram = ["requests"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "baf82d3820a17be0b6fcd6b31d1627a6ab198c9c094a2ee0fc2b0caca2659a72"
+content-hash = "4545b19bfa3d0ad9a489c2bee037d0c317dd34ba5b9745375d3566d5fc68ef55"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -135,6 +135,7 @@ python = "^3.10"
 pymongo = "^4.3.3"
 nltk = "^3.8.1"
 cfig = {extras = ["cli"], version = "^0.3.0"}
 coloredlogs = "^15.0.1"
--- a/unimore_bda_6/main.py
+++ b/unimore_bda_6/main.py
@ -1,9 +1,15 @@
-from .database import create_mongo_client_from_config
+from .config import config
 from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla
 from .log import install_log_handler
 def main():
-    pass
+    model = create_model_vanilla()
    train_model_vanilla(model)
    evaluate_model_vanilla(model)
 # Script entry point: the steps below run only when this module is executed directly.
 if __name__ == "__main__":
    # Install the colored log handler first, so that everything logged afterwards goes through it.
    install_log_handler()
    # Resolve the configuration proxies before the pipeline starts
    # (presumably required before any configuration value is read; confirm against the cfig documentation).
    config.proxies.resolve()
    main()
--- a/unimore_bda_6/analysis.py
+++ b/unimore_bda_6/analysis.py
@ -1,14 +0,0 @@
 import nltk
 import nltk.sentiment
 def create_sentiment_analysis_model() -> ntlk.sentiment.SentimentAnalyzer:
    analyzer = nltk.sentiment.SentimentAnalyzer()
 def train():
    ...
 def test():
    ...
--- a/unimore_bda_6/analysis/init.py
+++ b/unimore_bda_6/analysis/init.py
--- a/unimore_bda_6/analysis/vanilla.py
+++ b/unimore_bda_6/analysis/vanilla.py
@ -0,0 +1,58 @@
 import nltk
 import nltk.classify
 import nltk.sentiment
 import nltk.sentiment.util
 import logging
 from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set
 log = logging.getLogger(__name__)
 def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer:
    """
    Instantiate an empty NLTK sentiment analyzer, logging before and after its creation.

    :return: The newly created :class:`nltk.sentiment.SentimentAnalyzer`.
    """
    log.debug("Creating model...")
    analyzer = nltk.sentiment.SentimentAnalyzer()
    log.debug("Created model %s!", analyzer)
    return analyzer
 def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None:
    """
    Train ``model`` on the training set read from the ``reviews`` collection.

    The steps are: fetch the training set, mark negations, collect all tokens, pick the unigrams to use as features
    (``min_freq=4``), register the unigram feature extractor on the model, and finally train a Naive Bayes classifier.

    :param model: The analyzer to configure and train; feature extractors and the classifier are set on it.
    """
    # TODO: This doesn't work yet
    # NOTE(review): the collection is only open inside this block; if the training set is a lazy cursor it would be
    # read after the connection has been closed — confirm what get_reviews_training_set returns.
    with mongo_reviews_collection_from_config() as reviews:
        training_set = get_reviews_training_set(reviews)
    log.debug("Marking negations...")
    # NOTE(review): mark_negation is applied to each review exactly as returned by the database; confirm the reviews
    # are already in the token-list form that function expects.
    training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set))
    log.debug("Extracting tokens...")
    training_tokens = model.all_words(training_negated_set, labeled=False)
    log.debug("Counting unigrams...")
    training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4)
    log.debug("Configuring model features...")
    model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams)
    # NOTE(review): ``training_set`` has already been iterated once above; if it is a one-shot iterable, no documents
    # are left at this point — confirm.
    training_set = model.apply_features(documents=training_set)
    log.info("Training model...")
    model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set)
 def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer):
    """
    Evaluate ``model`` against the test set read from the ``reviews`` collection.

    :param model: The trained analyzer to evaluate.
    :return: Whatever :meth:`nltk.sentiment.SentimentAnalyzer.evaluate` returns for the test set.
    """
    with mongo_reviews_collection_from_config() as reviews:
        # The test set comes from a lazy aggregation cursor: read all of it while the connection is still open.
        test_set = list(get_reviews_test_set(reviews))
    log.info("Evaluating model...")
    results = model.evaluate(test_set)
    # Report the outcome through the log instead of stopping in the debugger, so unattended runs can complete.
    log.info("Evaluation results: %s", results)
    # TODO: nothing else is done with the results yet.
    return results
 # Names exported by a star-import of this module: the create / train / evaluate steps defined above.
 __all__ = (
    "create_model_vanilla",
    "train_model_vanilla",
    "evaluate_model_vanilla",
 )
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@ -16,7 +16,7 @@ def MONGO_PORT(val: str | None) -> int:
    """
    The port of the MongoDB database to connect to.
    """
-    if not val:
+    if val is None:
        return 27017
    try:
        return int(val)
@ -24,12 +24,38 @@ def MONGO_PORT(val: str | None) -> int:
        raise cfig.InvalidValueError("Not an int.")
@config.optional()
def SAMPLE_MODE(val: str | None) -> str:
    """
    Whether `$sample` or `$limit` should be used to aggregate the training and test sets.
    `$limit` is much faster, but not truly random, while `$sample` is completely random.
    """
    # NOTE: the docstring above is kept verbatim; cfig presumably shows it as the help text of this setting.
    # Variable not set: fall back to true random sampling.
    if val is None:
        return "$sample"
    # Only the two supported aggregation stage names are accepted.
    if val == "$sample" or val == "$limit":
        return val
    raise cfig.InvalidValueError("Neither $sample or $limit.")
@config.optional()
 def TRAINING_SET_SIZE(val: str | None) -> int:
    """
    The number of reviews from each category to fetch for the training set.
    """
-    if not val:
+    if val is None:
        return 1000
    try:
        return int(val)
    except ValueError:
        raise cfig.InvalidValueError("Not an int.")
@config.optional()
 def TEST_SET_SIZE(val: str | None) -> int:
    """
    The number of reviews to fetch for the test set.
    """
    if val is None:
        return 1000
    try:
        return int(val)
@ -41,4 +67,7 @@ __all__ = (
    "config",
    "MONGO_HOST",
    "MONGO_PORT",
    "SAMPLE_MODE",
    "TRAINING_SET_SIZE",
    "TEST_SET_SIZE",
 )
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@ -3,8 +3,12 @@ import pymongo
 import pymongo.collection
 import contextlib
 import bson
 import logging
 import random
-from .config import MONGO_HOST, MONGO_PORT
+from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE
 log = logging.getLogger(__name__)
 class Review(t.TypedDict):
@ -25,12 +29,18 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
    """
    Create a new MongoDB client and yield it.
    """
    log.debug("Opening connection to MongoDB...")
    client = pymongo.MongoClient(
-        host=MONGO_HOST.__resolved__,
+        host=MONGO_HOST.__wrapped__,
-        port=MONGO_PORT.__resolved__,
+        port=MONGO_PORT.__wrapped__,
    )
    log.info("Opened connection to MongoDB: %s", client)
    yield client
    log.info("Closing connection to MongoDB: %s", client)
    client.close()
    log.debug("Closed connection to MongoDB!")
@contextlib.contextmanager
@ -39,16 +49,39 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
    """
    with mongo_client_from_config() as db:
-        yield db.reviews.reviews
+        log.debug("Accessing the reviews collection...")
        collection = db.reviews.reviews
        log.debug("Collection accessed successfully: %s", collection)
        yield collection
 def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list:
    """
    Create the pipeline stages that select ``amount`` documents, according to the configured ``SAMPLE_MODE``.

    With ``$sample``, a single ``$sample`` stage picks the documents randomly.
    With ``$limit``, a random number of documents is skipped and the following ``amount`` documents are taken.

    :param collection: The collection the pipeline will run on; only used to estimate the document count for ``$skip``.
    :param amount: The number of documents the stages should select.
    :return: A list of aggregation stages to append to a pipeline.
    :raises ValueError: If the configured sample mode is neither ``$sample`` nor ``$limit``.
    """
    mode = SAMPLE_MODE.__wrapped__
    if mode == "$sample":
        return [
            {"$sample": {"size": amount}},
        ]
    elif mode == "$limit":
        log.warning("SAMPLE_MODE is $limit, sampling documents using $skip and $limit.")
        total = collection.estimated_document_count(maxTimeMS=100)
        # Never skip so far that fewer than ``amount`` documents are left (randint includes its upper bound).
        # The estimate covers the whole collection: after an earlier $match stage fewer documents may still remain.
        skip = random.randint(0, max(0, total - amount))
        return [
            {"$skip": skip},
            {"$limit": amount},
        ]
    else:
        # Report the resolved value rather than the configuration proxy object.
        raise ValueError("Unknown sample mode", mode)
 def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
    """
    Get ``amount`` random reviews from the ``reviews`` collection.
    """
    log.debug("Getting a sample of %d reviews...", amount)
    return reviews.aggregate([
-        {"$sample": {"size": amount}}
+        *pipeline_sample(reviews, amount),
    ])
@ -56,10 +89,11 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
    """
    Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
    """
    log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
    return reviews.aggregate([
        {"$match": {"overall": rating}},
-        {"$sample": {"size": amount}},
+        *pipeline_sample(reviews, amount),
    ])
@ -67,6 +101,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
    """
    Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
    """
    log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
    return reviews.aggregate([
        {"$match":
@ -77,14 +112,18 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
                ]
            },
        },
-        {"$sample": {"size": amount}},
+        *pipeline_sample(reviews, amount),
    ])
-def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
+def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
    """
    Get the subset of reviews that should act as training set.
    """
    log.info("Building training set...")
    # Get the amount from the config
    amount: int = TRAINING_SET_SIZE.__wrapped__
    # Handle odd numbers
    positive_amount: int = amount // 2
@ -100,9 +139,25 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int
    return both
-def get_reviews_test_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
+def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
    """
    Get the subset of reviews that should act as test set.
    """
    log.info("Building test set...")
    amount: int = TEST_SET_SIZE.__wrapped__
    return sample_reviews_by_rating_polar(reviews, amount)
 # Names exported by a star-import of this module.
 # NOTE(review): ``pipeline_sample`` is not listed here — presumably it is meant as an internal helper; confirm.
 __all__ = (
    "Review",
    "mongo_client_from_config",
    "mongo_reviews_collection_from_config",
    "sample_reviews",
    "sample_reviews_by_rating",
    "sample_reviews_by_rating_polar",
    "get_reviews_training_set",
    "get_reviews_test_set",
 )
--- a/unimore_bda_6/log.py
+++ b/unimore_bda_6/log.py
@ -0,0 +1,35 @@
 import logging
 import coloredlogs
 log = logging.getLogger(__name__)
 def install_log_handler(logger: logging.Logger | None = None) -> None:
    """
    Install a colored log handler on ``logger`` through :mod:`coloredlogs`.

    :param logger: The logger to install the handler on; if :data:`None`, the ``unimore_bda_6`` logger is used.
    """
    if logger is None:
        logger = logging.getLogger("unimore_bda_6")
    coloredlogs.install(
        logger=logger,
        level="DEBUG",
        fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
        style="{",
        level_styles={
            "debug": {"color": "white"},
            "info": {"color": "cyan"},
            "warning": {"color": "yellow"},
            "error": {"color": "red"},
            "critical": {"color": "red", "bold": True},
        },
        field_styles={
            "asctime": {"color": "magenta"},
            "levelname": {"color": "blue", "bold": True},
            "name": {"color": "blue"},
        },
        # Colored output is forced on instead of being auto-detected from the output stream.
        isatty=True,
    )
    log.info("Installed custom log handler!")
 # Names exported by a star-import of this module: only the handler installer is public.
 __all__ = (
    "install_log_handler",
 )