From dcfc4fbc3b7be413dfe75f84f47ff6cfccad44f9 Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi
Date: Sat, 4 Feb 2023 06:14:24 +0100
Subject: [PATCH] Getting closer...

---
 unimore-bda-6.iml                        |   1 +
 unimore_bda_6/__main__.py                |  63 +++++----
 unimore_bda_6/analysis/base.py           |  40 +++---
 unimore_bda_6/analysis/nltk_sentiment.py |  14 ++-
 unimore_bda_6/analysis/tf_text.py        |  29 +----
 unimore_bda_6/database.py                | 149 +++++++++++++--------
 6 files changed, 170 insertions(+), 126 deletions(-)

diff --git a/unimore-bda-6.iml b/unimore-bda-6.iml
index 1312514..80da260 100644
--- a/unimore-bda-6.iml
+++ b/unimore-bda-6.iml
@@ -7,6 +7,7 @@
+
diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index f5c00f4..9bbd0ba 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -2,7 +2,7 @@ import logging
 import tensorflow
 
 from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
+from .database import mongo_reviews_collection_from_config, sample_reviews_polar, sample_reviews_varied
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
 from .analysis.tf_text import TensorflowSentimentAnalyzer
 from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
@@ -17,37 +17,44 @@ def main():
     else:
         log.debug("Tensorflow successfully found GPU acceleration!")
 
-    for dataset_func in [polar_dataset, varied_dataset]:
-        for SentimentAnalyzer in [
-            NLTKSentimentAnalyzer,
-            # TensorflowSentimentAnalyzer,
+    for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
+        # Tensorflow-based
+        for Tokenizer in [
+            LowercaseTokenizer
         ]:
-            for Tokenizer in [
-                NLTKWordTokenizer,
-                PottsTokenizer,
-                PottsTokenizerWithNegation,
-                LowercaseTokenizer,
-            ]:
-                tokenizer = Tokenizer()
-                model = SentimentAnalyzer(tokenizer=tokenizer)
+            tokenizer = Tokenizer()
+            model = TensorflowSentimentAnalyzer()
 
-                with mongo_reviews_collection_from_config() as reviews:
-                    reviews_training = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-                    reviews_evaluation = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+            with mongo_reviews_collection_from_config() as collection:
+                ...
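+                # TODO: train and evaluate the Tensorflow model here, like in the NLTK-based loop below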
- log.info("Training model %s", model) - model.train(reviews_training) - log.info("Evaluating model %s", model) - correct, evaluated = model.evaluate(reviews_evaluation) - log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100) + # NLTK-based + for Tokenizer in [ + NLTKWordTokenizer, + PottsTokenizer, + PottsTokenizerWithNegation, + LowercaseTokenizer, + ]: + tokenizer = Tokenizer() + model = NLTKSentimentAnalyzer(tokenizer=tokenizer) - # try: - # print("Manual testing for %s" % model) - # print("Input an empty string to continue to the next model.") - # while inp := input(): - # print(model.use(inp)) - # except KeyboardInterrupt: - # pass + with mongo_reviews_collection_from_config() as collection: + reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__) + reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__) + + log.info("Training model %s", model) + model.train(reviews_training) + log.info("Evaluating model %s", model) + correct, evaluated = model.evaluate(reviews_evaluation) + log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100) + + # try: + # print("Manual testing for %s" % model) + # print("Input an empty string to continue to the next model.") + # while inp := input(): + # print(model.use(inp)) + # except KeyboardInterrupt: + # pass if __name__ == "__main__": diff --git a/unimore_bda_6/analysis/base.py b/unimore_bda_6/analysis/base.py index 57f94ec..b9c3900 100644 --- a/unimore_bda_6/analysis/base.py +++ b/unimore_bda_6/analysis/base.py @@ -1,47 +1,55 @@ import abc import logging +import typing as t +import dataclasses -from ..database import DataSet, Text, Category -from ..tokenizer import BaseTokenizer +from ..database import Text, Category, Review, DatasetFunc log = logging.getLogger(__name__) +@dataclasses.dataclass +class EvaluationResults: + correct: int + evaluated: int + + def __repr__(self): + return f"" + + def __str__(self): + return f"{self.correct} / {self.evaluated} - {self.correct / self.evaluated * 100:.2f} %" + + class BaseSentimentAnalyzer(metaclass=abc.ABCMeta): """ Abstract base class for sentiment analyzers implemented in this project. """ - def __init__(self, *, tokenizer: BaseTokenizer): - self.tokenizer: BaseTokenizer = tokenizer - - def __repr__(self): - return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>" - @abc.abstractmethod - def train(self, training_set: DataSet) -> None: + def train(self, dataset_func: DatasetFunc) -> None: """ Train the analyzer with the given training dataset. """ raise NotImplementedError() - def evaluate(self, test_set: DataSet) -> tuple[int, int]: + def evaluate(self, dataset_func: DatasetFunc) -> EvaluationResults: """ Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category. Returns a tuple with the number of correct results and the number of evaluated results. 
""" - evaluated: int = 0 - correct: int = 0 - for text, expected_category in test_set: - resulting_category = self.use(text) + evaluated: int = 0 + correct: int = 0 + + for review in dataset_func(): + resulting_category = self.use(review.text) evaluated += 1 - correct += 1 if resulting_category == expected_category else 0 + correct += 1 if resulting_category == review.category else 0 if not evaluated % 100: log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100) - return correct, evaluated + return EvaluationResults(correct=correct, evaluated=evaluated) @abc.abstractmethod def use(self, text: Text) -> Category: diff --git a/unimore_bda_6/analysis/nltk_sentiment.py b/unimore_bda_6/analysis/nltk_sentiment.py index 6f7d03e..a1fd2ea 100644 --- a/unimore_bda_6/analysis/nltk_sentiment.py +++ b/unimore_bda_6/analysis/nltk_sentiment.py @@ -6,7 +6,7 @@ import logging import typing as t import itertools -from ..database import Text, Category, DataTuple, DataSet +from ..database import Text, Category, Review from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError from ..log import count_passage from ..tokenizer import BaseTokenizer @@ -23,16 +23,20 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer): """ def __init__(self, *, tokenizer: BaseTokenizer) -> None: - super().__init__(tokenizer=tokenizer) + super().__init__() self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer() self.trained: bool = False + self.tokenizer: BaseTokenizer = tokenizer - def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]: + def __repr__(self): + return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>" + + def __tokenize_datatuple(self, datatuple: Review) -> tuple[TokenBag, Category]: """ Convert the `Text` of a `DataTuple` to a `TokenBag`. """ count_passage(log, "tokenize_datatuple", 100) - return self.tokenizer.tokenize_builtins(datatuple[0]), datatuple[1] + return self.tokenizer.tokenize_builtins(datatuple.text), datatuple.category def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None: """ @@ -63,7 +67,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer): count_passage(log, "extract_features", 100) return self.model.extract_features(data[0]), data[1] - def train(self, dataset: DataSet) -> None: + def train(self, dataset: t.Iterator[Review]) -> None: # Forbid retraining the model if self.trained: raise AlreadyTrainedError() diff --git a/unimore_bda_6/analysis/tf_text.py b/unimore_bda_6/analysis/tf_text.py index df65f1d..fc4780a 100644 --- a/unimore_bda_6/analysis/tf_text.py +++ b/unimore_bda_6/analysis/tf_text.py @@ -2,42 +2,25 @@ import tensorflow import itertools import typing as t -from ..database import DataSet, Text, Category +from ..database import Text, Category, Review from ..tokenizer import BaseTokenizer from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer): - def __init__(self, *, tokenizer: BaseTokenizer): - super().__init__(tokenizer=tokenizer) + def __init__(self): + super().__init__() self.trained = False self.text_vectorization_layer = None self.neural_network: tensorflow.keras.Sequential | None = None - @staticmethod - def __infinite_dataset_generator_factory(dataset: DataSet): - """ - A generator of infinite copies of dataset. - - .. todo:: Loads the whole dataset in memory. What a waste! Can we perform multiple MongoDB queries instead? 
- """ - dataset = map(lambda text, category: (tensorflow.convert_to_tensor(text, dtype=tensorflow.string), tensorflow.convert_to_tensor(category, dtype=tensorflow.string)), dataset) - - def generator(): - while True: - nonlocal dataset - dataset, result = itertools.tee(dataset, 2) - yield result - - return generator - @classmethod - def __bda_dataset_to_tf_dataset(cls, dataset: DataSet) -> tensorflow.data.Dataset: + def __bda_dataset_to_tf_dataset(cls, dataset_func: t.Callable[[], t.Iterator[Review]]) -> tensorflow.data.Dataset: """ Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`. """ return tensorflow.data.Dataset.from_generator( - cls.__infinite_dataset_generator_factory(dataset), + dataset_func, output_signature=( tensorflow.TensorSpec(shape=(), dtype=tensorflow.string), tensorflow.TensorSpec(shape=(), dtype=tensorflow.string), @@ -48,7 +31,7 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer): EMBEDDING_DIM = 16 EPOCHS = 10 - def train(self, training_set: DataSet) -> None: + def train(self, training_set: t.Iterator[Review]) -> None: if self.trained: raise AlreadyTrainedError() diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py index 7536670..9828fd1 100644 --- a/unimore_bda_6/database.py +++ b/unimore_bda_6/database.py @@ -4,14 +4,19 @@ import pymongo.collection import contextlib import bson import logging -import itertools +import tensorflow from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE log = logging.getLogger(__name__) -class Review(t.TypedDict): +class MongoReview(t.TypedDict): + """ + A review as it is stored on MongoDB. + + .. warning:: Do not instantiate: this is only for type hints! + """ _id: bson.ObjectId reviewerID: str asin: str @@ -28,13 +33,13 @@ Text = str Category = float -class DataTuple: - def __init__(self, text, category): +class Review: + def __init__(self, text: Text, category: Category): self.text: Text = text self.category: Category = category @classmethod - def from_review(cls, review): + def from_mongoreview(cls, review: MongoReview): return cls( text=review["reviewText"], category=review["overall"], @@ -44,15 +49,15 @@ class DataTuple: return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>" def __getitem__(self, item): - if item == 0: + if item == 0 or item == "text": return self.text - elif item == 1: + elif item == 1 or item == "category": return self.category else: raise KeyError(item) - -DataSet = t.Iterable[DataTuple] + def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]: + return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string), tensorflow.convert_to_tensor(self.category, dtype=tensorflow.string) @contextlib.contextmanager @@ -65,7 +70,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]: host=MONGO_HOST.__wrapped__, port=MONGO_PORT.__wrapped__, ) - log.info("Opened connection to MongoDB at %s!", client.address) + log.info("Opened connection to MongoDB!") yield client @@ -75,7 +80,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]: @contextlib.contextmanager -def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Review]: +def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[MongoReview]: """ Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it. 
""" @@ -86,82 +91,118 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi yield collection -def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: +class DatasetFunc(t.Protocol): + def __call__(self) -> t.Iterator[Review]: + pass + + +def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: """ Get ``amount`` random reviews from the ``reviews`` collection. """ log.debug("Getting a sample of %d reviews...", amount) - return reviews.aggregate([ + cursor = collection.aggregate([ {"$limit": WORKING_SET_SIZE.__wrapped__}, {"$sample": {"size": amount}}, ]) + cursor = map(Review.from_mongoreview, cursor) + return cursor -def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]: + +def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]: """ Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection. """ log.debug("Getting a sample of %d reviews with %d stars...", amount, rating) - return reviews.aggregate([ + cursor = collection.aggregate([ {"$limit": WORKING_SET_SIZE.__wrapped__}, {"$match": {"overall": rating}}, {"$sample": {"size": amount}}, ]) - -def polar_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]: - """ - Get a list of the same amount of 1-star and 5-star reviews. - """ - log.info("Building polar dataset with %d reviews...", amount * 2) - - # Sample the required reviews - positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount) - negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount) - - # Chain the iterators - full = itertools.chain(positive, negative) - - # Convert reviews to datatuples - full = map(DataTuple.from_review, full) - - return full + cursor = map(Review.from_mongoreview, cursor) + return cursor -def varied_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]: - """ - Get a list of the same amount of reviews for each rating. 
- """ - log.info("Building varied dataset with %d reviews...", amount * 5) +def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: + log.debug("Getting a sample of %d polar reviews...", amount * 2) - # Sample the required reviews - terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount) - negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount) - mixed = sample_reviews_by_rating(collection, rating=3.0, amount=amount) - positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount) - great = sample_reviews_by_rating(collection, rating=5.0, amount=amount) + cursor = collection.aggregate([ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 1.0}}, + {"$sample": {"size": amount}}, + {"$unionWith": { + "coll": collection.name, + "pipeline": [ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 5.0}}, + {"$sample": {"size": amount}}, + ], + }} + ]) - # Chain the iterators - full = itertools.chain(terrible, negative, mixed, positive, great) + cursor = map(Review.from_mongoreview, cursor) + return cursor - # Convert reviews to datatuples - full = map(DataTuple.from_review, full) - return full +def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: + log.debug("Getting a sample of %d varied reviews...", amount * 5) + + # Wow, this is ugly. + cursor = collection.aggregate([ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 1.0}}, + {"$sample": {"size": amount}}, + {"$unionWith": { + "coll": collection.name, + "pipeline": [ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 2.0}}, + {"$sample": {"size": amount}}, + {"$unionWith": { + "coll": collection.name, + "pipeline": [ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 3.0}}, + {"$sample": {"size": amount}}, + {"$unionWith": { + "coll": collection.name, + "pipeline": [ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 4.0}}, + {"$sample": {"size": amount}}, + {"$unionWith": { + "coll": collection.name, + "pipeline": [ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 5.0}}, + {"$sample": {"size": amount}}, + ], + }} + ], + }} + ], + }} + ], + }} + ]) + + cursor = map(Review.from_mongoreview, cursor) + return cursor __all__ = ( - "Review", "Text", "Category", - "DataTuple", - "DataSet", + "Review", + "DatasetFunc", "mongo_client_from_config", "mongo_reviews_collection_from_config", "sample_reviews", "sample_reviews_by_rating", - "polar_dataset", - "varied_dataset", + "sample_reviews_polar", + "sample_reviews_varied", )