Make some more progress for the night

Many things still do not work properly
2024-11-21 23:44:19 +00:00 · 2023-02-02 05:01:31 +01:00 · 2023-02-02 05:01:31 +01:00 · 4344752cf6
commit 4344752cf6
parent b347031663
6 changed files with 80 additions and 73 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -12,6 +12,7 @@
            "justMyCode": true,
            "env": {
                "NLTK_DATA": "./data/nltk",
+                "DATA_SET_SIZE": "100",
            },
            "cwd": "${workspaceFolder}",
        }
--- a/unimore_bda_6/main.py
+++ b/unimore_bda_6/main.py
@ -1,8 +1,8 @@
 import logging

-from .config import config
-from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
-from .analysis.vanilla import VanillaReviewSA
+from .config import config, DATA_SET_SIZE
+from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
+from .analysis.vanilla import VanillaReviewSA, VanillaUniformReviewSA
 from .analysis.potts import PottsReviewSA
 from .log import install_log_handler

@ -11,16 +11,26 @@ log = logging.getLogger(__name__)

 def main():
    with mongo_reviews_collection_from_config() as reviews:
-        training_reviews = get_training_reviews(collection=reviews)
-        test_reviews = get_test_reviews(collection=reviews)
+        reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)

-    vanilla = VanillaReviewSA()
-    vanilla.train(training_reviews)
-    log.info("Vanilla evaluation results: %s", vanilla.evaluate(test_reviews))
+    vanilla_polar = VanillaReviewSA()
+    vanilla_polar.train(reviews_polar_training)
+    log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
+
+    potts_polar = PottsReviewSA()
+    potts_polar.train(reviews_polar_training)
+    log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))
+
+    vanilla_uniform = VanillaUniformReviewSA()
+    vanilla_uniform.train(reviews_uniform_training)
+    log.info("Vanilla uniform evaluation results: %s", vanilla_uniform.evaluate(reviews_uniform_evaluation))
+
+    while True:  # Interactive prompt: classify typed text with the uniform model; never returns on its own
+        print(vanilla_uniform.use(input("> ")))

-    potts = PottsReviewSA()
-    potts.train(training_reviews)
-    log.info("Potts evaluation results: %s", potts.evaluate(test_reviews))


 if __name__ == "__main__":
--- a/unimore_bda_6/analysis/potts.py
+++ b/unimore_bda_6/analysis/potts.py
@ -1,5 +1,5 @@
 from ..vendor.potts import Tokenizer
-from .vanilla import VanillaSA, VanillaReviewSA
+from .vanilla import VanillaSA, VanillaReviewSA, VanillaUniformReviewSA


 class PottsSA(VanillaSA):
@ -24,6 +24,12 @@ class PottsReviewSA(VanillaReviewSA, PottsSA):
    """


+class PottsUniformReviewSA(VanillaUniformReviewSA, PottsSA):
+    """
+    A `PottsSA` with 5 buckets instead of 2.
+    """
+
+
 __all__ = (
    "PottsSA",
    "PottsReviewSA",
--- a/unimore_bda_6/analysis/vanilla.py
+++ b/unimore_bda_6/analysis/vanilla.py
@ -108,6 +108,26 @@ class VanillaReviewSA(VanillaSA):
        return self._use_with_tokens(self._tokenize_text(text))


+class VanillaUniformReviewSA(VanillaReviewSA):
+    @staticmethod
+    def _rating_to_label(rating: float) -> str:
+        match rating:
+            case 0.0:
+                return "abysmal"
+            case 1.0:
+                return "terrible"
+            case 2.0:
+                return "negative"
+            case 3.0:
+                return "mixed"
+            case 4.0:
+                return "positive"
+            case 5.0:
+                return "great"
+            case _:
+                return "unknown"
+
+
 __all__ = (
    "VanillaSA",
    "VanillaReviewSA",
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@ -45,24 +45,9 @@ def WORKING_SET_SIZE(val: str | None) -> int:


@config.optional()
-def TRAINING_SET_SIZE(val: str | None) -> int:
+def DATA_SET_SIZE(val: str | None) -> int:
    """
-    The number of reviews from each category to fetch for the training set.
-
-    Defaults to `1000`.
-    """
-    if val is None:
-        return 1000
-    try:
-        return int(val)
-    except ValueError:
-        raise cfig.InvalidValueError("Not an int.")
-
-
-@config.optional()
-def TEST_SET_SIZE(val: str | None) -> int:
-    """
-    The number of reviews to fetch for the test set.
+    The number of reviews from each category to fetch for the datasets.

    Defaults to `1000`.
    """
@ -79,9 +64,7 @@ __all__ = (
    "MONGO_HOST",
    "MONGO_PORT",
    "WORKING_SET_SIZE",
-    "TRAINING_SET_SIZE",
-    "TEST_SET_SIZE",
-    "NLTK_DOUBLE_NEG_SWITCH",
+    "DATA_SET_SIZE",
 )


--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@ -4,8 +4,9 @@ import pymongo.collection
 import contextlib
 import bson
 import logging
+import random

-from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE
+from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE

 log = logging.getLogger(__name__)

@ -79,59 +80,47 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
    ])


-def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
+def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
-    Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
+    Get a list of shuffled 1-star and 5-star reviews.
    """
-    log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
-
-    return reviews.aggregate([
-        {"$limit": WORKING_SET_SIZE.__wrapped__},
-        {"$match":
-            {"$or":
-                [
-                    {"overall": 1.0},
-                    {"overall": 5.0},
-                ]
-            },
-        },
-        {"$sample": {"size": amount}},
-    ])
-
-
-def get_training_reviews(collection: pymongo.collection.Collection) -> list[Review]:
-    """
-    Get the subset of reviews that should act as training set.
-    """
-    log.info("Building training set...")
-
-    # Get the amount from the config
-    amount: int = TRAINING_SET_SIZE.__wrapped__
-
-    # Handle odd numbers
-    positive_amount: int = amount // 2
-    negative_amount: int = amount - positive_amount
+    log.info("Building dataset with %d polar reviews...", amount * 2)

    # Sample the required reviews
-    positive = sample_reviews_by_rating(collection, 5.0, positive_amount)
-    negative = sample_reviews_by_rating(collection, 1.0, negative_amount)
+    positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
+    negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)

    # Randomness here does not matter, so just merge the lists
    both = [*positive, *negative]

+    # Shuffle the dataset, just in case it affects the performance
+    # TODO: does it actually?
+    random.shuffle(both)
+
    return both


-def get_test_reviews(collection: pymongo.collection.Collection) -> list[Review]:
+def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
-    Get the subset of reviews that should act as test set.
+    Get a shuffled list of reviews, sampling ``amount`` reviews for each rating from 1 to 5 stars.
    """
+    log.info("Building dataset with %d uniform reviews...", amount * 5)

-    log.info("Building test set...")
+    # Sample the required reviews
+    terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
+    negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount)
+    mixed    = sample_reviews_by_rating(collection, rating=3.0, amount=amount)
+    positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
+    great    = sample_reviews_by_rating(collection, rating=5.0, amount=amount)

-    amount: int = TEST_SET_SIZE.__wrapped__
+    # Merge all five rating samples; the order is randomized by the shuffle below
+    merged = [*terrible, *negative, *mixed, *positive, *great]

-    return list(sample_reviews_by_rating_polar(collection, amount))
+    # Shuffle the dataset, just in case it affects the performance
+    # TODO: does it actually?
+    random.shuffle(merged)
+
+    return merged


 __all__ = (
@ -140,7 +129,5 @@ __all__ = (
    "mongo_reviews_collection_from_config",
    "sample_reviews",
    "sample_reviews_by_rating",
-    "sample_reviews_by_rating_polar",
-    "get_training_reviews",
-    "get_test_reviews",
+    "get_reviews_dataset_polar",
 )