diff --git a/.vscode/launch.json b/.vscode/launch.json
index e492b02..924e945 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -12,6 +12,7 @@
             "justMyCode": true,
             "env": {
                 "NLTK_DATA": "./data/nltk",
+                "DATA_SET_SIZE": "100",
             },
             "cwd": "${workspaceFolder}",
         }
diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index 2ac2ec3..a8be723 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -1,8 +1,8 @@
 import logging
 
-from .config import config
-from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
-from .analysis.vanilla import VanillaReviewSA
+from .config import config, DATA_SET_SIZE
+from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
+from .analysis.vanilla import VanillaReviewSA, VanillaUniformReviewSA
 from .analysis.potts import PottsReviewSA
 from .log import install_log_handler
 
@@ -11,16 +11,26 @@ log = logging.getLogger(__name__)
 
 def main():
     with mongo_reviews_collection_from_config() as reviews:
-        training_reviews = get_training_reviews(collection=reviews)
-        test_reviews = get_test_reviews(collection=reviews)
+        reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
 
-        vanilla = VanillaReviewSA()
-        vanilla.train(training_reviews)
-        log.info("Vanilla evaluation results: %s", vanilla.evaluate(test_reviews))
+        vanilla_polar = VanillaReviewSA()
+        vanilla_polar.train(reviews_polar_training)
+        log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
+
+        potts_polar = PottsReviewSA()
+        potts_polar.train(reviews_polar_training)
+        log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))
+
+        vanilla_uniform = VanillaUniformReviewSA()
+        vanilla_uniform.train(reviews_uniform_training)
+        log.info("Vanilla uniform evaluation results: %s", vanilla_uniform.evaluate(reviews_uniform_evaluation))
+
+        while True:
+            print(vanilla_uniform.use(input("> ")))
 
-        potts = PottsReviewSA()
-        potts.train(training_reviews)
-        log.info("Potts evaluation results: %s", potts.evaluate(test_reviews))
 
 
 if __name__ == "__main__":
diff --git a/unimore_bda_6/analysis/potts.py b/unimore_bda_6/analysis/potts.py
index 611e206..2438838 100644
--- a/unimore_bda_6/analysis/potts.py
+++ b/unimore_bda_6/analysis/potts.py
@@ -1,5 +1,5 @@
 from ..vendor.potts import Tokenizer
-from .vanilla import VanillaSA, VanillaReviewSA
+from .vanilla import VanillaSA, VanillaReviewSA, VanillaUniformReviewSA
 
 
 class PottsSA(VanillaSA):
@@ -24,6 +24,12 @@ class PottsReviewSA(VanillaReviewSA, PottsSA):
     """
 
 
+class PottsUniformReviewSA(VanillaUniformReviewSA, PottsSA):
+    """
+    A `PottsSA` with 5 buckets instead of 2.
+    """
+
+
 __all__ = (
     "PottsSA",
     "PottsReviewSA",
diff --git a/unimore_bda_6/analysis/vanilla.py b/unimore_bda_6/analysis/vanilla.py
index ed91aea..9660215 100644
--- a/unimore_bda_6/analysis/vanilla.py
+++ b/unimore_bda_6/analysis/vanilla.py
@@ -108,6 +108,30 @@ class VanillaReviewSA(VanillaSA):
         return self._use_with_tokens(self._tokenize_text(text))
 
 
+class VanillaUniformReviewSA(VanillaReviewSA):
+    """
+    A `VanillaReviewSA` that gives every star rating its own label.
+    """
+
+    @staticmethod
+    def _rating_to_label(rating: float) -> str:
+        match rating:
+            case 0.0:
+                return "abysmal"
+            case 1.0:
+                return "terrible"
+            case 2.0:
+                return "negative"
+            case 3.0:
+                return "mixed"
+            case 4.0:
+                return "positive"
+            case 5.0:
+                return "great"
+            case _:
+                return "unknown"
+
+
 __all__ = (
     "VanillaSA",
     "VanillaReviewSA",
diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py
index 62ccdf9..d2aca9e 100644
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@@ -45,24 +45,9 @@ def WORKING_SET_SIZE(val: str | None) -> int:
 
 
 @config.optional()
-def TRAINING_SET_SIZE(val: str | None) -> int:
+def DATA_SET_SIZE(val: str | None) -> int:
     """
-    The number of reviews from each category to fetch for the training set.
-
-    Defaults to `1000`.
-    """
-    if val is None:
-        return 1000
-    try:
-        return int(val)
-    except ValueError:
-        raise cfig.InvalidValueError("Not an int.")
-
-
-@config.optional()
-def TEST_SET_SIZE(val: str | None) -> int:
-    """
-    The number of reviews to fetch for the test set.
+    The number of reviews from each category to fetch for the datasets.
 
     Defaults to `1000`.
     """
@@ -79,9 +64,7 @@ __all__ = (
     "MONGO_HOST",
     "MONGO_PORT",
     "WORKING_SET_SIZE",
-    "TRAINING_SET_SIZE",
-    "TEST_SET_SIZE",
-    "NLTK_DOUBLE_NEG_SWITCH",
+    "DATA_SET_SIZE",
 )
 
 
diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py
index 8f1c174..f4c2564 100644
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@@ -4,8 +4,9 @@ import pymongo.collection
 import contextlib
 import bson
 import logging
+import random
 
-from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE
+from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
 
 log = logging.getLogger(__name__)
 
@@ -79,59 +80,47 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
     ])
 
 
-def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
+def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
     """
-    Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
+    Get a list of shuffled 1-star and 5-star reviews.
     """
-    log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
-
-    return reviews.aggregate([
-        {"$limit": WORKING_SET_SIZE.__wrapped__},
-        {"$match":
-            {"$or":
-                [
-                    {"overall": 1.0},
-                    {"overall": 5.0},
-                ]
-            },
-        },
-        {"$sample": {"size": amount}},
-    ])
-
-
-def get_training_reviews(collection: pymongo.collection.Collection) -> list[Review]:
-    """
-    Get the subset of reviews that should act as training set.
-    """
-    log.info("Building training set...")
-
-    # Get the amount from the config
-    amount: int = TRAINING_SET_SIZE.__wrapped__
-
-    # Handle odd numbers
-    positive_amount: int = amount // 2
-    negative_amount: int = amount - positive_amount
+    log.info("Building dataset with %d polar reviews...", amount * 2)
 
     # Sample the required reviews
-    positive = sample_reviews_by_rating(collection, 5.0, positive_amount)
-    negative = sample_reviews_by_rating(collection, 1.0, negative_amount)
+    positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
+    negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
 
     # Randomness here does not matter, so just merge the lists
     both = [*positive, *negative]
 
+    # Shuffle the dataset, just in case it affects the performance
+    # TODO: does it actually?
+    random.shuffle(both)
+
     return both
 
 
-def get_test_reviews(collection: pymongo.collection.Collection) -> list[Review]:
+def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
     """
-    Get the subset of reviews that should act as test set.
+    Get a list of shuffled reviews of any rating.
     """
+    log.info("Building dataset with %d uniform reviews...", amount * 5)
 
-    log.info("Building test set...")
+    # Sample the required reviews
+    terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
+    negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount)
+    mixed = sample_reviews_by_rating(collection, rating=3.0, amount=amount)
+    positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
+    great = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
 
-    amount: int = TEST_SET_SIZE.__wrapped__
+    # Randomness here does not matter, so just merge the lists
+    merged = [*terrible, *negative, *mixed, *positive, *great]
 
-    return list(sample_reviews_by_rating_polar(collection, amount))
+    # Shuffle the dataset, just in case it affects the performance
+    # TODO: does it actually?
+    random.shuffle(merged)
+
+    return merged
 
 
 __all__ = (
@@ -140,7 +129,6 @@ __all__ = (
     "mongo_reviews_collection_from_config",
     "sample_reviews",
     "sample_reviews_by_rating",
-    "sample_reviews_by_rating_polar",
-    "get_training_reviews",
-    "get_test_reviews",
+    "get_reviews_dataset_polar",
+    "get_reviews_dataset_uniform",
 )