diff --git a/.gitignore b/.gitignore index f11c9ac..8467da3 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ data/raw/ data/db/ +data/nltk/ ################## # Python ignores # diff --git a/.idea/dictionaries/steffo.xml b/.idea/dictionaries/steffo.xml new file mode 100644 index 0000000..56712aa --- /dev/null +++ b/.idea/dictionaries/steffo.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/.idea/runConfigurations/unimore_bda_6.xml b/.idea/runConfigurations/unimore_bda_6.xml new file mode 100644 index 0000000..22a226f --- /dev/null +++ b/.idea/runConfigurations/unimore_bda_6.xml @@ -0,0 +1,26 @@ + + + + + \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..e492b02 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,19 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: unimore_bda_6", + "type": "python", + "request": "launch", + "module": "unimore_bda_6", + "justMyCode": true, + "env": { + "NLTK_DATA": "./data/nltk", + }, + "cwd": "${workspaceFolder}", + } + ] +} \ No newline at end of file diff --git a/data/scripts/download-nltk.sh b/data/scripts/download-nltk.sh new file mode 100755 index 0000000..3ea2b95 --- /dev/null +++ b/data/scripts/download-nltk.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +repo=$(git rev-parse --show-toplevel) +export NLTK_DATA="$repo/data/nltk" +"$repo/.venv/bin/python" -m nltk.downloader popular diff --git a/data/scripts/index-db.mongodb b/data/scripts/index-db.mongodb new file mode 100644 index 0000000..7517ff9 --- /dev/null +++ b/data/scripts/index-db.mongodb @@ -0,0 +1,8 @@ +db.reviews.createIndex( + { + overall: 1, + }, + { + name: "rating_index" + } +) diff --git a/unimore-bda-6.iml b/unimore-bda-6.iml index 7616de1..1312514 100644 --- 
a/unimore-bda-6.iml +++ b/unimore-bda-6.iml @@ -6,6 +6,7 @@ + diff --git a/unimore_bda_6/__init__.py b/unimore_bda_6/__init__.py index 8c571c4..e69de29 100644 --- a/unimore_bda_6/__init__.py +++ b/unimore_bda_6/__init__.py @@ -1,5 +0,0 @@ -# If you are building a **library**, use this file to export objects! - -__all__ = ( - # "", -) diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py index eb4e616..73f03b9 100644 --- a/unimore_bda_6/__main__.py +++ b/unimore_bda_6/__main__.py @@ -1,12 +1,23 @@ from .config import config -from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla +from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews +from .analysis.vanilla import VanillaReviewSA from .log import install_log_handler def main(): - model = create_model_vanilla() - train_model_vanilla(model) - evaluate_model_vanilla(model) + with mongo_reviews_collection_from_config() as reviews: + training_reviews = get_training_reviews(collection=reviews) + test_reviews = get_test_reviews(collection=reviews) + + model = VanillaReviewSA() + model.train(training_reviews) + + evaluation = model.evaluate(test_reviews) + print(evaluation) + + while True: + classification = model.use(input()) + print(classification) if __name__ == "__main__": diff --git a/unimore_bda_6/analysis/base.py b/unimore_bda_6/analysis/base.py new file mode 100644 index 0000000..736cc28 --- /dev/null +++ b/unimore_bda_6/analysis/base.py @@ -0,0 +1,54 @@ +import abc + + +class BaseSA(metaclass=abc.ABCMeta): + """ + Abstract base class for sentiment analyzers implemented in this project. + """ + + def __init__(self) -> None: + """ + Create the empty shell of the sentiment analyzer. + """ + + self.trained = False + "Whether :meth:`train` has been called at least once, i.e. whether the analyzer is ready to be evaluated or used."
+ + @abc.abstractmethod + def train(self, training_set) -> None: + """ + Train the analyzer with the given training set. + """ + raise NotImplementedError() + + @abc.abstractmethod + def evaluate(self, test_set) -> None: + """ + Evaluate the analyzer with the given test set. + """ + raise NotImplementedError() + + @abc.abstractmethod + def use(self, text: str) -> str: + """ + Use the sentiment analyzer. + """ + raise NotImplementedError() + + +class AlreadyTrainedError(Exception): + """ + This model has already been trained and cannot be trained again. + """ + +class NotTrainedError(Exception): + """ + This model has not been trained yet. + """ + + +__all__ = ( + "BaseSA", + "AlreadyTrainedError", + "NotTrainedError", +) diff --git a/unimore_bda_6/analysis/vanilla.py b/unimore_bda_6/analysis/vanilla.py index c190b78..249dedd 100644 --- a/unimore_bda_6/analysis/vanilla.py +++ b/unimore_bda_6/analysis/vanilla.py @@ -1,58 +1,118 @@ +import abc import nltk import nltk.classify import nltk.sentiment import nltk.sentiment.util import logging +import typing as t -from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set +from ..database import Review +from .base import BaseSA, AlreadyTrainedError, NotTrainedError log = logging.getLogger(__name__) -def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer: - log.debug("Creating model...") - model = nltk.sentiment.SentimentAnalyzer() - log.debug("Created model %s!", model) - return model +class VanillaSA(BaseSA, metaclass=abc.ABCMeta): + """ + A sentiment analyzer resembling in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
+ """ + + def __init__(self, language="english") -> None: + super().__init__() + self.language: str = language + self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer() + + def _tokenize_text(self, text: str) -> list[str]: + """ + Convert a text string into a list of tokens, using the language of the model. + """ + tokens = nltk.word_tokenize(text, language=self.language) + nltk.sentiment.util.mark_negation(tokens, shallow=True) + return tokens + + def __add_feature_unigrams(self, training_set: list[tuple[list[str], str]]) -> None: + """ + Add the `nltk.sentiment.util.extract_unigram_feats` feature to the model. + """ + all_words = self.model.all_words(training_set, labeled=True) + unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4) + self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams) + + def _featurize_documents(self, documents: list[tuple[list[str], str]]): + """ + Apply features to a list of documents. + """ + return self.model.apply_features(documents, labeled=True) + + def _train_with_set(self, training_set: list[tuple[list[str], str]]) -> None: + """ + Train the model with the given **pre-classified and pre-tokenized** training set.
+ """ + if self.trained: + raise AlreadyTrainedError() + + self.__add_feature_unigrams(training_set) + training_set_with_features = self._featurize_documents(training_set) + + self.model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set_with_features) + self.trained = True + + def _evaluate_with_set(self, test_set: list[tuple[list[str], str]]) -> dict: + if not self.trained: + raise NotTrainedError() + + test_set_with_features = self._featurize_documents(test_set) + return self.model.evaluate(test_set_with_features) + + def _use_with_tokens(self, tokens: list[str]) -> str: + if not self.trained: + raise NotTrainedError() + + return self.model.classify(instance=tokens) -def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None: - # TODO: This doesn't work yet +class VanillaReviewSA(VanillaSA): + """ + A `VanillaSA` to be used with `Review`s. + """ - with mongo_reviews_collection_from_config() as reviews: - training_set = get_reviews_training_set(reviews) + @staticmethod + def _rating_to_label(rating: float) -> str: + """ + Return the label corresponding to the given rating. - log.debug("Marking negations...") - training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set)) + Possible categories are: + * negative (0.0 <= rating < 2.5) + * mixed (2.5 <= rating <= 3.5) + * positive (3.5 < rating <= 5.0) + """ + if rating < 2.5: + return "negative" + elif rating <= 3.5: + return "mixed" + else: + return "positive" - log.debug("Extracting tokens...") - training_tokens = model.all_words(training_negated_set, labeled=False) + def _review_to_data_set(self, review: Review) -> tuple[list[str], str]: + """ + Convert a review to a NLTK-compatible dataset. 
+ """ + return self._tokenize_text(text=review["reviewText"]), self._rating_to_label(rating=review["overall"]) + + def train(self, reviews: t.Iterable[Review]) -> None: + data_set = list(map(self._review_to_data_set, reviews)) + self._train_with_set(data_set) - log.debug("Counting unigrams...") - training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4) + def evaluate(self, reviews: t.Iterable[Review]): + data_set = list(map(self._review_to_data_set, reviews)) + return self._evaluate_with_set(data_set) - log.debug("Configuring model features...") - model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams) - training_set = model.apply_features(documents=training_set) - - log.info("Training model...") - model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set) - - -def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer): - with mongo_reviews_collection_from_config() as reviews: - test_set = get_reviews_test_set(reviews) - - log.info("Evaluating model...") - model.evaluate(test_set) - - # TODO - breakpoint() + def use(self, text: str) -> str: + return self._use_with_tokens(self._tokenize_text(text)) __all__ = ( - "create_model_vanilla", - "train_model_vanilla", - "evaluate_model_vanilla", + "VanillaSA", + "VanillaReviewSA", ) diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py index 758b973..9173139 100644 --- a/unimore_bda_6/config.py +++ b/unimore_bda_6/config.py @@ -7,6 +7,8 @@ config = cfig.Configuration() def MONGO_HOST(val: str | None) -> str: """ The hostname of the MongoDB database to connect to. + + Defaults to `"127.0.0.1"`. """ return val or "127.0.0.1" @@ -15,6 +17,8 @@ def MONGO_HOST(val: str | None) -> str: def MONGO_PORT(val: str | None) -> int: """ The port of the MongoDB database to connect to. + + Defaults to `27017`. 
""" if val is None: return 27017 @@ -24,23 +28,12 @@ def MONGO_PORT(val: str | None) -> int: raise cfig.InvalidValueError("Not an int.") -@config.optional() -def SAMPLE_MODE(val: str | None) -> str: - """ - Whether `$sample` or `$limit` should be used to aggregate the training and test sets. - `$limit` is much faster, but not truly random, while `$sample` is completely random. - """ - if val is None: - return "$sample" - if val not in ["$sample", "$limit"]: - raise cfig.InvalidValueError("Neither $sample or $limit.") - return val - - @config.optional() def TRAINING_SET_SIZE(val: str | None) -> int: """ The number of reviews from each category to fetch for the training set. + + Defaults to `1000`. """ if val is None: return 1000 @@ -54,6 +47,8 @@ def TRAINING_SET_SIZE(val: str | None) -> int: def TEST_SET_SIZE(val: str | None) -> int: """ The number of reviews to fetch for the test set. + + Defaults to `1000`. """ if val is None: return 1000 @@ -67,7 +62,11 @@ __all__ = ( "config", "MONGO_HOST", "MONGO_PORT", - "SAMPLE_MODE", "TRAINING_SET_SIZE", "TEST_SET_SIZE", + "NLTK_DOUBLE_NEG_SWITCH", ) + + +if __name__ == "__main__": + config.cli() diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py index c152d24..29090da 100644 --- a/unimore_bda_6/database.py +++ b/unimore_bda_6/database.py @@ -4,9 +4,8 @@ import pymongo.collection import contextlib import bson import logging -import random -from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE +from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE log = logging.getLogger(__name__) @@ -55,25 +54,6 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi yield collection -def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list: - """ - Create pipeline stages for sampling random documents, either with true randomness or by skipping a random amount of them. 
- """ - if SAMPLE_MODE.__wrapped__ == "$sample": - return [ - {"$sample": {"size": amount}}, - ] - elif SAMPLE_MODE.__wrapped__ == "$limit": - log.warning("USE_SAMPLE is disabled, sampling documents using $skip and $limit.") - skip = random.randint(0, collection.estimated_document_count(maxTimeMS=100)) - return [ - {"$skip": skip}, - {"$limit": amount}, - ] - else: - raise ValueError("Unknown sample mode", SAMPLE_MODE) - - def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]: """ Get ``amount`` random reviews from the ``reviews`` collection. @@ -81,7 +61,8 @@ def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Ite log.debug("Getting a sample of %d reviews...", amount) return reviews.aggregate([ - *pipeline_sample(reviews, amount), + {"$limit": 10000}, # TODO + {"$sample": {"size": amount}}, ]) @@ -92,8 +73,9 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo log.debug("Getting a sample of %d reviews with %d stars...", amount, rating) return reviews.aggregate([ + {"$limit": 10000}, # TODO {"$match": {"overall": rating}}, - *pipeline_sample(reviews, amount), + {"$sample": {"size": amount}}, ]) @@ -104,6 +86,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount) return reviews.aggregate([ + {"$limit": 10000}, # TODO {"$match": {"$or": [ @@ -112,11 +95,11 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun ] }, }, - *pipeline_sample(reviews, amount), + {"$sample": {"size": amount}}, ]) -def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]: +def get_training_reviews(collection: pymongo.collection.Collection) -> list[Review]: """ Get the subset of reviews that should act as training set. 
""" @@ -130,8 +113,8 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterab negative_amount: int = amount - positive_amount # Sample the required reviews - positive = sample_reviews_by_rating(reviews, 5.0, positive_amount) - negative = sample_reviews_by_rating(reviews, 1.0, negative_amount) + positive = sample_reviews_by_rating(collection, 5.0, positive_amount) + negative = sample_reviews_by_rating(collection, 1.0, negative_amount) # Randomness here does not matter, so just merge the lists both = [*positive, *negative] @@ -139,7 +122,7 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterab return both -def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]: +def get_test_reviews(collection: pymongo.collection.Collection) -> list[Review]: """ Get the subset of reviews that should act as test set. """ @@ -148,7 +131,7 @@ def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[R amount: int = TEST_SET_SIZE.__wrapped__ - return sample_reviews_by_rating_polar(reviews, amount) + return list(sample_reviews_by_rating_polar(collection, amount)) __all__ = ( @@ -158,6 +141,6 @@ __all__ = ( "sample_reviews", "sample_reviews_by_rating", "sample_reviews_by_rating_polar", - "get_reviews_training_set", - "get_reviews_test_set", + "get_training_reviews", + "get_test_reviews", )