Working prototype

2024-11-25 01:04:19 +00:00 · 2023-02-02 02:56:37 +01:00 · 2023-02-02 02:56:37 +01:00 · 14d1e1a22f
commit 14d1e1a22f
parent 2f7237ebfa
13 changed files with 254 additions and 90 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,6 +9,7 @@

 data/raw/
 data/db/
+data/nltk/

 ##################
 # Python ignores #
--- a/.idea/dictionaries/steffo.xml
+++ b/.idea/dictionaries/steffo.xml
@ -0,0 +1,3 @@
+<component name="ProjectDictionaryState">
+  <dictionary name="steffo" />
+</component>
--- a/.idea/runConfigurations/unimore_bda_6.xml
+++ b/.idea/runConfigurations/unimore_bda_6.xml
@ -0,0 +1,26 @@
+<component name="ProjectRunConfigurationManager">
+  <configuration default="false" name="unimore_bda_6" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
+    <module name="unimore-bda-6" />
+    <option name="INTERPRETER_OPTIONS" value="" />
+    <option name="PARENT_ENVS" value="true" />
+    <envs>
+      <env name="NLTK_DATA" value="./data/nltk" />
+      <env name="PYTHONUNBUFFERED" value="1" />
+    </envs>
+    <option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
+    <option name="SDK_NAME" value="Poetry (unimore-bda-6)" />
+    <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+    <option name="IS_MODULE_SDK" value="false" />
+    <option name="ADD_CONTENT_ROOTS" value="true" />
+    <option name="ADD_SOURCE_ROOTS" value="true" />
+    <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+    <option name="SCRIPT_NAME" value="unimore_bda_6" />
+    <option name="PARAMETERS" value="" />
+    <option name="SHOW_COMMAND_LINE" value="false" />
+    <option name="EMULATE_TERMINAL" value="false" />
+    <option name="MODULE_MODE" value="true" />
+    <option name="REDIRECT_INPUT" value="false" />
+    <option name="INPUT_FILE" value="" />
+    <method v="2" />
+  </configuration>
+</component>
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -0,0 +1,19 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: unimore_bda_6",
+            "type": "python",
+            "request": "launch",
+            "module": "unimore_bda_6",
+            "justMyCode": true,
+            "env": {
+                "NLTK_DATA": "./data/nltk",
+            },
+            "cwd": "${workspaceFolder}",
+        }
+    ]
+}
--- a/data/scripts/download-nltk.sh
+++ b/data/scripts/download-nltk.sh
@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+repo=$(git rev-parse --show-toplevel)
+export NLTK_DATA="$repo/data/nltk"
+"$repo/.venv/bin/python" -m nltk.downloader popular
--- a/data/scripts/index-db.mongodb
+++ b/data/scripts/index-db.mongodb
@ -0,0 +1,8 @@
+db.reviews.createIndex(
+    {
+        overall: 1,
+    },
+    {
+        name: "rating_index"
+    }
+)
--- a/unimore-bda-6.iml
+++ b/unimore-bda-6.iml
@ -6,6 +6,7 @@
      <sourceFolder url="file://$MODULE_DIR$/unimore_bda_6" isTestSource="false" />
      <excludeFolder url="file://$MODULE_DIR$/data/db" />
      <excludeFolder url="file://$MODULE_DIR$/data/raw" />
+      <excludeFolder url="file://$MODULE_DIR$/data/nltk" />
    </content>
    <orderEntry type="jdk" jdkName="Poetry (unimore-bda-6)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
--- a/unimore_bda_6/init.py
+++ b/unimore_bda_6/init.py
@ -1,5 +0,0 @@
-# If you are building a **library**, use this file to export objects!
-
-__all__ = (
-    # "",
-)
--- a/unimore_bda_6/main.py
+++ b/unimore_bda_6/main.py
@ -1,12 +1,23 @@
 from .config import config
-from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla
+from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
+from .analysis.vanilla import VanillaReviewSA
 from .log import install_log_handler


 def main():
-    model = create_model_vanilla()
-    train_model_vanilla(model)
-    evaluate_model_vanilla(model)
+    with mongo_reviews_collection_from_config() as reviews:
+        training_reviews = get_training_reviews(collection=reviews)
+        test_reviews = get_test_reviews(collection=reviews)
+
+    model = VanillaReviewSA()
+    model.train(training_reviews)
+    
+    evaluation = model.evaluate(test_reviews)
+    print(evaluation)
+    
+    while True:
+        classification = model.use(input())
+        print(classification)


 if __name__ == "__main__":
--- a/unimore_bda_6/analysis/base.py
+++ b/unimore_bda_6/analysis/base.py
@ -0,0 +1,54 @@
+import abc
+
+
+class BaseSA(metaclass=abc.ABCMeta):
+    """
+    Abstract base class for sentiment analyzers implemented in this project.
+    """
+
+    def __init__(self) -> None:
+        """
+        Create the empty shell of the sentiment analyzer.
+        """
+
+        self.trained = False
+        "Whether :meth:`train` has been called at least once, i.e. whether the analyzer is ready to be evaluated or used."
+
+    @abc.abstractmethod
+    def train(self, training_set) -> None:
+        """
+        Train the analyzer with the given training set.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def evaluate(self, test_set) -> None:
+        """
+        Evaluate the analyzer with the given test set.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def use(self, text: str) -> str:
+        """
+        Use the sentiment analyzer to classify the given text, returning the resulting label.
+        """
+        raise NotImplementedError()
+
+
+class AlreadyTrainedError(Exception):
+    """
+    This model has already been trained and cannot be trained again.
+    """
+
+class NotTrainedError(Exception):
+    """
+    This model has not been trained yet.
+    """
+
+
+__all__ = (
+    "BaseSA",
+    "AlreadyTrainedError",
+    "NotTrainedError",
+)
--- a/unimore_bda_6/analysis/vanilla.py
+++ b/unimore_bda_6/analysis/vanilla.py
@ -1,58 +1,118 @@
+import abc
 import nltk
 import nltk.classify
 import nltk.sentiment
 import nltk.sentiment.util
 import logging
+import typing as t

-from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set
+from ..database import Review
+from .base import BaseSA, AlreadyTrainedError, NotTrainedError


 log = logging.getLogger(__name__)


-def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer:
-    log.debug("Creating model...")
-    model = nltk.sentiment.SentimentAnalyzer()
-    log.debug("Created model %s!", model)
-    return model
+class VanillaSA(BaseSA, metaclass=abc.ABCMeta):
+    """
+    A sentiment analyzer resembling in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
+    """
+
+    def __init__(self, language="english") -> None:
+        super().__init__()
+        self.language: str = language
+        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
+
+    def _tokenize_text(self, text: str) -> list[str]:
+        """
+        Convert a text string into a list of tokens, using the language of the model.
+        """
+        tokens = nltk.word_tokenize(text, language=self.language)
+        nltk.sentiment.util.mark_negation(tokens, shallow=True)
+        return tokens
+
+    def __add_feature_unigrams(self, training_set: list[tuple[list[str], str]]) -> None:
+        """
+        Add the `nltk.sentiment.util.extract_unigram_feats` feature to the model.
+        """
+        all_words = self.model.all_words(training_set, labeled=True)
+        unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
+        self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
+
+    def _featurize_documents(self, documents: list[tuple[list[str], str]]):
+        """
+        Apply the feature extractors of the model to the given labeled documents.
+        """
+        return self.model.apply_features(documents, labeled=True)
+
+    def _train_with_set(self, training_set: list[tuple[list[str], str]]) -> None:
+        """
+        Train the model with the given **pre-classified and pre-tokenized** training set.
+        """
+        if self.trained:
+            raise AlreadyTrainedError()
+
+        self.__add_feature_unigrams(training_set)
+        training_set_with_features = self._featurize_documents(training_set)
+
+        self.model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set_with_features)
+        self.trained = True
+
+    def _evaluate_with_set(self, test_set: list[tuple[list[str], str]]) -> dict:
+        if not self.trained:
+            raise NotTrainedError()
+        
+        test_set_with_features = self._featurize_documents(test_set)
+        return self.model.evaluate(test_set_with_features)
+
+    def _use_with_tokens(self, tokens: list[str]) -> str:
+        if not self.trained:
+            raise NotTrainedError()
+        
+        return self.model.classify(instance=tokens)


-def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None:
-    # TODO: This doesn't work yet
+class VanillaReviewSA(VanillaSA):
+    """
+    A `VanillaSA` to be used with `Review`s.
+    """

-    with mongo_reviews_collection_from_config() as reviews:
-        training_set = get_reviews_training_set(reviews)
+    @staticmethod
+    def _rating_to_label(rating: float) -> str:
+        """
+        Return the label corresponding to the given rating.

-    log.debug("Marking negations...")
-    training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set))
+        Possible categories are:
+        * negative (0.0 <= rating < 2.5)
+        * mixed (2.5 <= rating <= 3.5)
+        * positive (3.5 < rating <= 5.0)
+        """
+        if rating < 2.5:
+            return "negative"
+        elif rating <= 3.5:
+            return "mixed"
+        else:
+            return "positive"

-    log.debug("Extracting tokens...")
-    training_tokens = model.all_words(training_negated_set, labeled=False)
+    def _review_to_data_set(self, review: Review) -> tuple[list[str], str]:
+        """
+        Convert a review to an NLTK-compatible labeled document, i.e. a tuple of its tokens and its label.
+        """
+        return self._tokenize_text(text=review["reviewText"]), self._rating_to_label(rating=review["overall"])
+        
+    def train(self, reviews: t.Iterable[Review]) -> None:
+        data_set = list(map(self._review_to_data_set, reviews))
+        self._train_with_set(data_set)

-    log.debug("Counting unigrams...")
-    training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4)
+    def evaluate(self, reviews: t.Iterable[Review]):
+        data_set = list(map(self._review_to_data_set, reviews))
+        return self._evaluate_with_set(data_set)

-    log.debug("Configuring model features...")
-    model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams)
-    training_set = model.apply_features(documents=training_set)
-
-    log.info("Training model...")
-    model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set)
-
-
-def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer):
-    with mongo_reviews_collection_from_config() as reviews:
-        test_set = get_reviews_test_set(reviews)
-
-    log.info("Evaluating model...")
-    model.evaluate(test_set)
-
-    # TODO
-    breakpoint()
+    def use(self, text: str) -> str:
+        return self._use_with_tokens(self._tokenize_text(text))


 __all__ = (
-    "create_model_vanilla",
-    "train_model_vanilla",
-    "evaluate_model_vanilla",
+    "VanillaSA",
+    "VanillaReviewSA",
 )
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@ -7,6 +7,8 @@ config = cfig.Configuration()
 def MONGO_HOST(val: str | None) -> str:
    """
    The hostname of the MongoDB database to connect to.
+
+    Defaults to `"127.0.0.1"`.
    """
    return val or "127.0.0.1"

@ -15,6 +17,8 @@ def MONGO_HOST(val: str | None) -> str:
 def MONGO_PORT(val: str | None) -> int:
    """
    The port of the MongoDB database to connect to.
+
+    Defaults to `27017`.
    """
    if val is None:
        return 27017
@ -24,23 +28,12 @@ def MONGO_PORT(val: str | None) -> int:
        raise cfig.InvalidValueError("Not an int.")


-@config.optional()
-def SAMPLE_MODE(val: str | None) -> str:
-    """
-    Whether `$sample` or `$limit` should be used to aggregate the training and test sets.
-    `$limit` is much faster, but not truly random, while `$sample` is completely random.
-    """
-    if val is None:
-        return "$sample"
-    if val not in ["$sample", "$limit"]:
-        raise cfig.InvalidValueError("Neither $sample or $limit.")
-    return val
-
-
@config.optional()
 def TRAINING_SET_SIZE(val: str | None) -> int:
    """
    The number of reviews from each category to fetch for the training set.
+
+    Defaults to `1000`.
    """
    if val is None:
        return 1000
@ -54,6 +47,8 @@ def TRAINING_SET_SIZE(val: str | None) -> int:
 def TEST_SET_SIZE(val: str | None) -> int:
    """
    The number of reviews to fetch for the test set.
+
+    Defaults to `1000`.
    """
    if val is None:
        return 1000
@ -67,7 +62,11 @@ __all__ = (
    "config",
    "MONGO_HOST",
    "MONGO_PORT",
-    "SAMPLE_MODE",
    "TRAINING_SET_SIZE",
    "TEST_SET_SIZE",
+    "NLTK_DOUBLE_NEG_SWITCH",
 )
+
+
+if __name__ == "__main__":
+    config.cli()
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@ -4,9 +4,8 @@ import pymongo.collection
 import contextlib
 import bson
 import logging
-import random

-from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE
+from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE

 log = logging.getLogger(__name__)

@ -55,25 +54,6 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
        yield collection


-def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list:
-    """
-    Create pipeline stages for sampling random documents, either with true randomness or by skipping a random amount of them.
-    """
-    if SAMPLE_MODE.__wrapped__ == "$sample":
-        return [
-            {"$sample": {"size": amount}},
-        ]
-    elif SAMPLE_MODE.__wrapped__ == "$limit":
-        log.warning("USE_SAMPLE is disabled, sampling documents using $skip and $limit.")
-        skip = random.randint(0, collection.estimated_document_count(maxTimeMS=100))
-        return [
-            {"$skip": skip},
-            {"$limit": amount},
-        ]
-    else:
-        raise ValueError("Unknown sample mode", SAMPLE_MODE)
-
-
 def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
    """
    Get ``amount`` random reviews from the ``reviews`` collection.
@ -81,7 +61,8 @@ def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Ite
    log.debug("Getting a sample of %d reviews...", amount)

    return reviews.aggregate([
-        *pipeline_sample(reviews, amount),
+        {"$limit": 10000},  # TODO
+        {"$sample": {"size": amount}},
    ])


@ -92,8 +73,9 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
    log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)

    return reviews.aggregate([
+        {"$limit": 10000},  # TODO
        {"$match": {"overall": rating}},
-        *pipeline_sample(reviews, amount),
+        {"$sample": {"size": amount}},
    ])


@ -104,6 +86,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
    log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)

    return reviews.aggregate([
+        {"$limit": 10000},  # TODO
        {"$match":
            {"$or":
                [
@ -112,11 +95,11 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
                ]
            },
        },
-        *pipeline_sample(reviews, amount),
+        {"$sample": {"size": amount}},
    ])


-def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
+def get_training_reviews(collection: pymongo.collection.Collection) -> list[Review]:
    """
    Get the subset of reviews that should act as training set.
    """
@ -130,8 +113,8 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterab
    negative_amount: int = amount - positive_amount

    # Sample the required reviews
-    positive = sample_reviews_by_rating(reviews, 5.0, positive_amount)
-    negative = sample_reviews_by_rating(reviews, 1.0, negative_amount)
+    positive = sample_reviews_by_rating(collection, 5.0, positive_amount)
+    negative = sample_reviews_by_rating(collection, 1.0, negative_amount)

    # Randomness here does not matter, so just merge the lists
    both = [*positive, *negative]
@ -139,7 +122,7 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterab
    return both


-def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
+def get_test_reviews(collection: pymongo.collection.Collection) -> list[Review]:
    """
    Get the subset of reviews that should act as test set.
    """
@ -148,7 +131,7 @@ def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[R

    amount: int = TEST_SET_SIZE.__wrapped__

-    return sample_reviews_by_rating_polar(reviews, amount)
+    return list(sample_reviews_by_rating_polar(collection, amount))


 __all__ = (
@ -158,6 +141,6 @@ __all__ = (
    "sample_reviews",
    "sample_reviews_by_rating",
    "sample_reviews_by_rating_polar",
-    "get_reviews_training_set",
-    "get_reviews_test_set",
+    "get_training_reviews",
+    "get_test_reviews",
 )