Make some more progress for the night

Many things still do not work properly
2024-11-21 23:44:19 +00:00 · 2023-02-02 05:01:31 +01:00 · 2023-02-02 05:01:31 +01:00 · 4344752cf6
commit 4344752cf6
parent b347031663
6 changed files with 80 additions and 73 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -12,6 +12,7 @@
            "justMyCode": true,
            "env": {
                "NLTK_DATA": "./data/nltk",
                "DATA_SET_SIZE": "100",
            },
            "cwd": "${workspaceFolder}",
        }
--- a/unimore_bda_6/main.py
+++ b/unimore_bda_6/main.py
@ -1,8 +1,8 @@
 import logging
-from .config import config
+from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
+from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
-from .analysis.vanilla import VanillaReviewSA
+from .analysis.vanilla import VanillaReviewSA, VanillaUniformReviewSA
 from .analysis.potts import PottsReviewSA
 from .log import install_log_handler
@ -11,16 +11,26 @@ log = logging.getLogger(__name__)
 def main():
    """
    Fetch the polar and uniform review datasets, train the vanilla and Potts analyzers on them,
    log their evaluation results, then read text from standard input forever and print the
    uniform analyzer's output for each line.
    """
    with mongo_reviews_collection_from_config() as reviews:
-        training_reviews = get_training_reviews(collection=reviews)
+        reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-        test_reviews = get_test_reviews(collection=reviews)
+        reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
        # NOTE(review): training and evaluation sets come from two independent calls with identical
        # arguments; presumably they can share reviews - confirm whether that overlap is acceptable.
        reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
        reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-    vanilla = VanillaReviewSA()
+    vanilla_polar = VanillaReviewSA()
-    vanilla.train(training_reviews)
+    vanilla_polar.train(reviews_polar_training)
-    log.info("Vanilla evaluation results: %s", vanilla.evaluate(test_reviews))
+    log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
    potts_polar = PottsReviewSA()
    potts_polar.train(reviews_polar_training)
    log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))
    vanilla_uniform = VanillaUniformReviewSA()
    vanilla_uniform.train(reviews_uniform_training)
    # Evaluate the uniform analyzer on the uniform evaluation set (the polar pair was used here by mistake).
    log.info("Vanilla uniform evaluation results: %s", vanilla_uniform.evaluate(reviews_uniform_evaluation))
    # Interactive prompt with no exit condition of its own.
    while True:
        print(vanilla_uniform.use(input("> ")))
    # NOTE(review): the three lines below carry no +/- marker in this view and use names this commit
    # removes (training_reviews, test_reviews); presumably they are the deleted side of the hunk - confirm.
    potts = PottsReviewSA()
    potts.train(training_reviews)
    log.info("Potts evaluation results: %s", potts.evaluate(test_reviews))
 if __name__ == "__main__":
--- a/unimore_bda_6/analysis/potts.py
+++ b/unimore_bda_6/analysis/potts.py
@ -1,5 +1,5 @@
 from ..vendor.potts import Tokenizer
-from .vanilla import VanillaSA, VanillaReviewSA
+from .vanilla import VanillaSA, VanillaReviewSA, VanillaUniformReviewSA
 class PottsSA(VanillaSA):
@ -24,6 +24,12 @@ class PottsReviewSA(VanillaReviewSA, PottsSA):
    """
 class PottsUniformReviewSA(VanillaUniformReviewSA, PottsSA):
    """
    A `PottsSA` with 5 buckets instead of 2.

    Combines `VanillaUniformReviewSA` and `PottsSA` through multiple inheritance and defines nothing of its own.
    """
 __all__ = (
    "PottsSA",
    "PottsReviewSA",
--- a/unimore_bda_6/analysis/vanilla.py
+++ b/unimore_bda_6/analysis/vanilla.py
@ -108,6 +108,26 @@ class VanillaReviewSA(VanillaSA):
        return self._use_with_tokens(self._tokenize_text(text))
 class VanillaUniformReviewSA(VanillaReviewSA):
    """
    A `VanillaReviewSA` whose `_rating_to_label` gives every whole rating from 0.0 to 5.0 its own label.
    """
    @staticmethod
    def _rating_to_label(rating: float) -> str:
        """
        Return the label for ``rating``, or ``"unknown"`` if it equals none of the known ratings.
        """
        # Checked in order with ``==``, so any value equal to one of these floats gets its label.
        known_labels = (
            (0.0, "abysmal"),
            (1.0, "terrible"),
            (2.0, "negative"),
            (3.0, "mixed"),
            (4.0, "positive"),
            (5.0, "great"),
        )
        for stars, label in known_labels:
            if rating == stars:
                return label
        return "unknown"
 __all__ = (
    "VanillaSA",
    "VanillaReviewSA",
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@ -45,24 +45,9 @@ def WORKING_SET_SIZE(val: str | None) -> int:
@config.optional()
-def TRAINING_SET_SIZE(val: str | None) -> int:
+def DATA_SET_SIZE(val: str | None) -> int:
    """
-    The number of reviews from each category to fetch for the training set.
+    The number of reviews from each category to fetch for the datasets.
    Defaults to `1000`.
    Raises `cfig.InvalidValueError` if a value is provided but `int` cannot parse it.
    """
    if val is None:
        # No value provided: use the documented default.
        return 1000
    try:
        return int(val)
    except ValueError:
        # Report the problem as a configuration error rather than a bare ValueError.
        raise cfig.InvalidValueError("Not an int.")
@config.optional()
 def TEST_SET_SIZE(val: str | None) -> int:
    """
    The number of reviews to fetch for the test set.
    Defaults to `1000`.
    """
@ -79,9 +64,7 @@ __all__ = (
    "MONGO_HOST",
    "MONGO_PORT",
    "WORKING_SET_SIZE",
-    "TRAINING_SET_SIZE",
+    "DATA_SET_SIZE",
    "TEST_SET_SIZE",
    "NLTK_DOUBLE_NEG_SWITCH",
 )
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@ -4,8 +4,9 @@ import pymongo.collection
 import contextlib
 import bson
 import logging
 import random
-from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE
+from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
 log = logging.getLogger(__name__)
@ -79,59 +80,47 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
    ])
-def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
+def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
-    Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
+    Get a list of shuffled 1-star and 5-star reviews.
    """
-    log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
+    log.info("Building dataset with %d polar reviews...", amount * 2)
    return reviews.aggregate([
        {"$limit": WORKING_SET_SIZE.__wrapped__},
        {"$match":
            {"$or":
                [
                    {"overall": 1.0},
                    {"overall": 5.0},
                ]
            },
        },
        {"$sample": {"size": amount}},
    ])
 def get_training_reviews(collection: pymongo.collection.Collection) -> list[Review]:
    """
    Get the subset of reviews that should act as training set.
    """
    log.info("Building training set...")
    # Get the amount from the config
    amount: int = TRAINING_SET_SIZE.__wrapped__
    # Handle odd numbers
    positive_amount: int = amount // 2
    negative_amount: int = amount - positive_amount
    # Sample the required reviews
-    positive = sample_reviews_by_rating(collection, 5.0, positive_amount)
+    positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
-    negative = sample_reviews_by_rating(collection, 1.0, negative_amount)
+    negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
    # Randomness here does not matter, so just merge the lists
    both = [*positive, *negative]
    # Shuffle the dataset, just in case it affects the performance
    # TODO: does it actually?
    random.shuffle(both)
    return both
-def get_test_reviews(collection: pymongo.collection.Collection) -> list[Review]:
+def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
-    Get the subset of reviews that should act as test set.
+    Get a list of shuffled reviews of any rating.
    ``amount`` reviews are requested from ``collection`` for each rating from 1.0 to 5.0,
    then the five samples are merged into one list and shuffled in place before being returned.
    """
    log.info("Building dataset with %d uniform reviews...", amount * 5)
-    log.info("Building test set...")
+    # Sample the required reviews
    terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
    negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount)
    mixed    = sample_reviews_by_rating(collection, rating=3.0, amount=amount)
    positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
    great    = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
-    amount: int = TEST_SET_SIZE.__wrapped__
+    # Randomness here does not matter, so just merge the lists
    # All five rating samples must be merged; merging only two would drop 1-, 3- and 5-star reviews.
    dataset = [*terrible, *negative, *mixed, *positive, *great]
-    return list(sample_reviews_by_rating_polar(collection, amount))
+    # Shuffle the dataset, just in case it affects the performance
    # TODO: does it actually?
    random.shuffle(dataset)
    return dataset
 __all__ = (
@ -140,7 +129,5 @@ __all__ = (
    "mongo_reviews_collection_from_config",
    "sample_reviews",
    "sample_reviews_by_rating",
-    "sample_reviews_by_rating_polar",
+    "get_reviews_dataset_polar",
    "get_training_reviews",
    "get_test_reviews",
 )