From 4344752cf64f78dfedcd6d39846ce879ddd2a92d Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi <me@steffo.eu>
Date: Thu, 2 Feb 2023 05:01:31 +0100
Subject: [PATCH] Make some more progress for the night

Many things still do not work properly
---
 .vscode/launch.json               |  1 +
 unimore_bda_6/__main__.py         | 32 +++++++++-----
 unimore_bda_6/analysis/potts.py   |  8 +++-
 unimore_bda_6/analysis/vanilla.py | 20 +++++++++
 unimore_bda_6/config.py           | 23 ++---------
 unimore_bda_6/database.py         | 69 +++++++++++++------------------
 6 files changed, 80 insertions(+), 73 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index e492b02..924e945 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -12,6 +12,7 @@
             "justMyCode": true,
             "env": {
                 "NLTK_DATA": "./data/nltk",
+                "DATA_SET_SIZE": "100",
             },
             "cwd": "${workspaceFolder}",
         }
diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index 2ac2ec3..a8be723 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -1,8 +1,8 @@
 import logging
 
-from .config import config
-from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
-from .analysis.vanilla import VanillaReviewSA
+from .config import config, DATA_SET_SIZE
+from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
+from .analysis.vanilla import VanillaReviewSA, VanillaUniformReviewSA
 from .analysis.potts import PottsReviewSA
 from .log import install_log_handler
 
@@ -11,16 +11,26 @@ log = logging.getLogger(__name__)
 
 def main():
     with mongo_reviews_collection_from_config() as reviews:
-        training_reviews = get_training_reviews(collection=reviews)
-        test_reviews = get_test_reviews(collection=reviews)
+        reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
 
-    vanilla = VanillaReviewSA()
-    vanilla.train(training_reviews)
-    log.info("Vanilla evaluation results: %s", vanilla.evaluate(test_reviews))
+    vanilla_polar = VanillaReviewSA()
+    vanilla_polar.train(reviews_polar_training)
+    log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
+
+    potts_polar = PottsReviewSA()
+    potts_polar.train(reviews_polar_training)
+    log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))
+
+    vanilla_uniform = VanillaUniformReviewSA()
+    vanilla_uniform.train(reviews_uniform_training)
+    log.info("Vanilla uniform evaluation results: %s", vanilla_uniform.evaluate(reviews_uniform_evaluation))
+
+    while True:
+        print(vanilla_uniform.use(input("> ")))
 
-    potts = PottsReviewSA()
-    potts.train(training_reviews)
-    log.info("Potts evaluation results: %s", potts.evaluate(test_reviews))
 
 
 if __name__ == "__main__":
diff --git a/unimore_bda_6/analysis/potts.py b/unimore_bda_6/analysis/potts.py
index 611e206..2438838 100644
--- a/unimore_bda_6/analysis/potts.py
+++ b/unimore_bda_6/analysis/potts.py
@@ -1,5 +1,5 @@
 from ..vendor.potts import Tokenizer
-from .vanilla import VanillaSA, VanillaReviewSA
+from .vanilla import VanillaSA, VanillaReviewSA, VanillaUniformReviewSA
 
 
 class PottsSA(VanillaSA):
@@ -24,6 +24,12 @@ class PottsReviewSA(VanillaReviewSA, PottsSA):
     """
 
 
+class PottsUniformReviewSA(VanillaUniformReviewSA, PottsSA):
+    """
+    A `PottsSA` that inherits the rating-to-label mapping of `VanillaUniformReviewSA` (one label per star rating) instead of the one of `VanillaReviewSA`.
+    """
+
+
 __all__ = (
     "PottsSA",
     "PottsReviewSA",
diff --git a/unimore_bda_6/analysis/vanilla.py b/unimore_bda_6/analysis/vanilla.py
index ed91aea..9660215 100644
--- a/unimore_bda_6/analysis/vanilla.py
+++ b/unimore_bda_6/analysis/vanilla.py
@@ -108,6 +108,26 @@ class VanillaReviewSA(VanillaSA):
         return self._use_with_tokens(self._tokenize_text(text))
 
 
+class VanillaUniformReviewSA(VanillaReviewSA):
+    @staticmethod
+    def _rating_to_label(rating: float) -> str:
+        match rating:
+            case 0.0:
+                return "abysmal"
+            case 1.0:
+                return "terrible"
+            case 2.0:
+                return "negative"
+            case 3.0:
+                return "mixed"
+            case 4.0:
+                return "positive"
+            case 5.0:
+                return "great"
+            case _:
+                return "unknown"
+
+
 __all__ = (
     "VanillaSA",
     "VanillaReviewSA",
diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py
index 62ccdf9..d2aca9e 100644
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@@ -45,24 +45,9 @@ def WORKING_SET_SIZE(val: str | None) -> int:
 
 
 @config.optional()
-def TRAINING_SET_SIZE(val: str | None) -> int:
+def DATA_SET_SIZE(val: str | None) -> int:
     """
-    The number of reviews from each category to fetch for the training set.
-
-    Defaults to `1000`.
-    """
-    if val is None:
-        return 1000
-    try:
-        return int(val)
-    except ValueError:
-        raise cfig.InvalidValueError("Not an int.")
-
-
-@config.optional()
-def TEST_SET_SIZE(val: str | None) -> int:
-    """
-    The number of reviews to fetch for the test set.
+    The number of reviews from each category to fetch for the datasets.
 
     Defaults to `1000`.
     """
@@ -79,9 +64,7 @@ __all__ = (
     "MONGO_HOST",
     "MONGO_PORT",
     "WORKING_SET_SIZE",
-    "TRAINING_SET_SIZE",
-    "TEST_SET_SIZE",
-    "NLTK_DOUBLE_NEG_SWITCH",
+    "DATA_SET_SIZE",
 )
 
 
diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py
index 8f1c174..f4c2564 100644
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@@ -4,8 +4,9 @@ import pymongo.collection
 import contextlib
 import bson
 import logging
+import random
 
-from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE
+from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
 
 log = logging.getLogger(__name__)
 
@@ -79,59 +80,47 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
     ])
 
 
-def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
+def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
     """
-    Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
+    Get a list of shuffled 1-star and 5-star reviews.
     """
-    log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
-
-    return reviews.aggregate([
-        {"$limit": WORKING_SET_SIZE.__wrapped__},
-        {"$match":
-            {"$or":
-                [
-                    {"overall": 1.0},
-                    {"overall": 5.0},
-                ]
-            },
-        },
-        {"$sample": {"size": amount}},
-    ])
-
-
-def get_training_reviews(collection: pymongo.collection.Collection) -> list[Review]:
-    """
-    Get the subset of reviews that should act as training set.
-    """
-    log.info("Building training set...")
-
-    # Get the amount from the config
-    amount: int = TRAINING_SET_SIZE.__wrapped__
-
-    # Handle odd numbers
-    positive_amount: int = amount // 2
-    negative_amount: int = amount - positive_amount
+    log.info("Building dataset with %d polar reviews...", amount * 2)
 
     # Sample the required reviews
-    positive = sample_reviews_by_rating(collection, 5.0, positive_amount)
-    negative = sample_reviews_by_rating(collection, 1.0, negative_amount)
+    positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
+    negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
 
     # Randomness here does not matter, so just merge the lists
     both = [*positive, *negative]
 
+    # Shuffle the dataset, just in case it affects the performance
+    # TODO: does it actually?
+    random.shuffle(both)
+
     return both
 
 
-def get_test_reviews(collection: pymongo.collection.Collection) -> list[Review]:
+def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
     """
-    Get the subset of reviews that should act as test set.
+    Get a shuffled list of reviews, sampling ``amount`` of them for each rating from 1.0 to 5.0.
     """
+    log.info("Building dataset with %d uniform reviews...", amount * 5)
 
-    log.info("Building test set...")
+    # Sample the required reviews
+    terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
+    negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount)
+    mixed    = sample_reviews_by_rating(collection, rating=3.0, amount=amount)
+    positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
+    great    = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
 
-    amount: int = TEST_SET_SIZE.__wrapped__
+    # Merge every rating bucket; their order is irrelevant because the list is shuffled below
+    merged = [*terrible, *negative, *mixed, *positive, *great]
 
-    return list(sample_reviews_by_rating_polar(collection, amount))
+    # Shuffle the dataset, just in case it affects the performance
+    # TODO: does it actually?
+    random.shuffle(merged)
+
+    return merged
 
 
 __all__ = (
@@ -140,7 +129,5 @@ __all__ = (
     "mongo_reviews_collection_from_config",
     "sample_reviews",
     "sample_reviews_by_rating",
-    "sample_reviews_by_rating_polar",
-    "get_training_reviews",
-    "get_test_reviews",
+    "get_reviews_dataset_polar",
 )