Mirror of https://github.com/Steffo99/unimore-bda-6.git, synced 2024-11-21 15:34:18 +00:00

CODE IS DONE

This commit is contained in:
parent ae2cf563e6
commit 4e8aa68db3

17 changed files with 331 additions and 304 deletions
@@ -5,14 +5,14 @@
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="CONFIRM_OVERWRITE" value="False" />
<env name="EVALUATION_SET_SIZE" value="100" />
<env name="EVALUATION_SET_SIZE" value="4000" />
<env name="NLTK_DATA" value="./data/nltk" />
<env name="PYTHONUNBUFFERED" value="1" />
<env name="TENSORFLOW_EMBEDDING_SIZE" value="64" />
<env name="TENSORFLOW_MAX_FEATURES" value="1000000" />
<env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
<env name="TRAINING_SET_SIZE" value="2000" />
<env name="VALIDATION_SET_SIZE" value="25" />
<env name="TRAINING_SET_SIZE" value="4000" />
<env name="VALIDATION_SET_SIZE" value="100" />
<env name="WORKING_SET_SIZE" value="100000" />
<env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
</envs>
@@ -6,7 +6,7 @@ install_general_log_handlers()

from .config import config
from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer, ThreeCheat
from .analysis.base import TrainingFailedError
from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
from .gathering import Caches
@@ -32,25 +32,29 @@ def main():

reviews = reviews_collection(db)

for sample_func in [sample_reviews_varied, sample_reviews_polar]:
for sample_func in [
sample_reviews_polar,
sample_reviews_varied,
]:

slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
slog.debug("Selected sample_func: %s", sample_func.__name__)

for SentimentAnalyzer in [
# ThreeCheat,
TensorflowPolarSentimentAnalyzer,
TensorflowCategorySentimentAnalyzer,
# NLTKSentimentAnalyzer,
NLTKSentimentAnalyzer,
]:

slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)

for Tokenizer in [
PottsTokenizerWithNegation,
PottsTokenizer,
HuggingBertTokenizer,
PlainTokenizer,
HuggingBertTokenizer,
PottsTokenizerWithNegation,
LowercaseTokenizer,
NLTKWordTokenizer,
]:
@@ -5,3 +5,4 @@ This module contains all implemented types of sentiment analyzers.
from .base import *
from .nltk_sentiment import *
from .tf_text import *
from .cheating import *
@@ -4,7 +4,7 @@ import abc
import logging
import dataclasses

from ..database import Text, Category, CachedDatasetFunc
from ..database import CachedDatasetFunc, TextReview, TokenizedReview
from ..tokenizer import BaseTokenizer

log = logging.getLogger(__name__)
@@ -15,12 +15,11 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
Abstract base class for sentiment analyzers implemented in this project.
"""

# noinspection PyUnusedLocal
def __init__(self, *, tokenizer: BaseTokenizer):
pass
self.tokenizer: BaseTokenizer = tokenizer

def __repr__(self):
return f"<{self.__class__.__qualname__}>"
return f"<{self.__class__.__qualname__} with {self.tokenizer} tokenizer>"

@abc.abstractmethod
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
@@ -30,34 +29,34 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
raise NotImplementedError()

@abc.abstractmethod
def use(self, text: Text) -> Category:
def use(self, text: str) -> float:
"""
Run the model on the given input.
Run the model on the given input, and return the predicted rating.
"""
raise NotImplementedError()

def evaluate(self, evaluation_dataset_func: CachedDatasetFunc) -> EvaluationResults:
"""
Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.

Returns a tuple with the number of correct results and the number of evaluated results.
"""

evaluated: int = 0
correct: int = 0
score: float = 0.0

perfect: int = 0

squared_error: float = 0.0

for review in evaluation_dataset_func():
resulting_category = self.use(review.text)
log.debug("Evaluation step: expected %d, received %d, review was %s", review.category, resulting_category, review.text[:80])
log.debug("Evaluation step: %d for %s", resulting_category, review)
evaluated += 1
try:
correct += 1 if resulting_category == review.category else 0
score += 1 - (abs(resulting_category - review.category) / 4)
perfect += 1 if resulting_category == review.rating else 0
squared_error += (resulting_category - review.rating) ** 2
except ValueError:
log.warning("Model execution on %s resulted in a NaN value: %s", review, resulting_category)

return EvaluationResults(correct=correct, evaluated=evaluated, score=score)
return EvaluationResults(perfect=perfect, evaluated=evaluated, mse=squared_error / evaluated)


@dataclasses.dataclass
@@ -66,15 +65,26 @@ class EvaluationResults:
Container for the results of a dataset evaluation.
"""

correct: int
evaluated: int
score: float
"""
The number of reviews that were evaluated.
"""

perfect: int
"""
The number of reviews for which the model returned the correct rating.
"""

mse: float
"""
Mean squared error
"""

def __repr__(self):
return f"<EvaluationResults: {self!s}>"

def __str__(self):
return f"{self.evaluated} evaluated, {self.correct} correct, {self.correct / self.evaluated:.2%} accuracy, {self.score:.2f} score, {self.score / self.evaluated:.2%} scoreaccuracy"
return f"Evaluation results:\t{self.evaluated}\tevaluated\t{self.perfect}\tperfect\t{self.perfect / self.evaluated:.2%}\taccuracy\t{self.mse / self.evaluated:.2%}\tmean squared error"
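A quick worked illustration of the new metrics (not part of the commit): with predictions [5.0, 4.0, 1.0] against true ratings [5.0, 5.0, 2.0], one prediction is exact and the squared errors are 0, 1 and 1, so the resulting container would be:

results = EvaluationResults(perfect=1, evaluated=3, mse=2 / 3)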

class AlreadyTrainedError(Exception):
unimore_bda_6/analysis/cheating.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from .base import BaseSentimentAnalyzer
from ..database.cache import CachedDatasetFunc


class ThreeCheat(BaseSentimentAnalyzer):
"""
A sentiment analyzer that always predicts a 3.0* rating.

Why? To test the scoring!
"""

def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
pass

def use(self, text: str) -> float:
return 3.0


__all__ = (
"ThreeCheat",
)
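As an illustrative aside (not from the diff), the baseline can be exercised like any other analyzer; the import paths follow the package layout visible elsewhere in this commit:

from unimore_bda_6.analysis.cheating import ThreeCheat
from unimore_bda_6.tokenizer import PlainTokenizer

analyzer = ThreeCheat(tokenizer=PlainTokenizer())
print(analyzer.use("Great product, would buy again!"))  # always 3.0

On a purely polar (1*/5*) sample this yields zero perfect hits and a mean squared error of 4.0, which is exactly the kind of sanity check on the scoring that the class exists for.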
@@ -6,7 +6,7 @@ import logging
import typing as t
import itertools

from ..database import Text, Category, Review, CachedDatasetFunc
from ..database import TextReview, CachedDatasetFunc, TokenizedReview
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
from ..log import count_passage
from ..tokenizer import BaseTokenizer
@@ -23,31 +23,17 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
"""

def __init__(self, *, tokenizer: BaseTokenizer) -> None:
if not tokenizer.supports_plain():
raise TypeError("Tokenizer does not support NLTK")

super().__init__(tokenizer=tokenizer)

self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
self.trained: bool = False
self.tokenizer: BaseTokenizer = tokenizer

def __repr__(self):
return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"

def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
"""
Convert the `Text` of a `DataTuple` to a `TokenBag`.
"""
count_passage(log, "tokenize_datatuple", 100)
return self.tokenizer.tokenize_and_split_plain(datatuple.text), datatuple.category

def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
def _add_feature_unigrams(self, dataset: t.Iterator[TokenizedReview]) -> None:
"""
Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model.
"""
# Ignore the category and only access the tokens
tokenbags = map(lambda d: d[0], dataset)
tokenbags = map(lambda r: r.tokens, dataset)
# Get all words in the documents
all_words = self.model.all_words(tokenbags, labeled=False)
# Create unigram `contains(*)` features from the previously gathered words
@@ -55,59 +41,48 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
# Add the feature extractor to the model
self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)

def _add_feature_extractors(self, dataset: t.Iterator[tuple[TokenBag, Category]]):
def _add_feature_extractors(self, dataset: t.Iterator[TextReview]):
"""
Register new feature extractors on the `.model`.
"""
# Tokenize the reviews
dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
# Add the unigrams feature
self._add_feature_unigrams(dataset)

def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
def __extract_features(self, review: TextReview) -> tuple[Features, float]:
"""
Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.

Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
"""
count_passage(log, "extract_features", 100)
return self.model.extract_features(data[0]), data[1]
review: TokenizedReview = self.tokenizer.tokenize_review(review)
return self.model.extract_features(review.tokens), review.rating

def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
# Forbid retraining the model
if self.trained:
raise AlreadyTrainedError()

# Get a generator
dataset: t.Generator[Review] = training_dataset_func()

# Tokenize the dataset
dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, dataset)

# Cleanly duplicate the dataset iterator
# Reduce average memory footprint, but not maximum
dataset_1, dataset_2 = itertools.tee(dataset, 2)
dataset_1: t.Iterator[tuple[TokenBag, Category]]
dataset_2: t.Iterator[tuple[TokenBag, Category]]

# Add the feature extractors to the model
self._add_feature_extractors(dataset_1)
del dataset_1  # Delete exhausted iterator
self._add_feature_extractors(training_dataset_func())

# Extract features from the dataset
dataset_2: t.Iterator[tuple[Features, Category]] = map(self.__extract_features, dataset_2)
featureset: t.Iterator[tuple[Features, float]] = map(self.__extract_features, training_dataset_func())

# Train the classifier with the extracted features and category
self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)

# Toggle the trained flag
self.trained = True

def use(self, text: Text) -> Category:
def use(self, text: str) -> float:
# Require the model to be trained
if not self.trained:
raise NotTrainedError()

# Tokenize the input
tokens = self.tokenizer.tokenize_and_split_plain(text)
tokens = self.tokenizer.tokenize(text)

# Run the classification method
return self.model.classify(instance=tokens)
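A rough usage sketch of the reworked analyzer (not part of the diff; the two dataset functions are assumed to come from the caching layer shown later in this commit, and their names here are hypothetical):

analyzer = NLTKSentimentAnalyzer(tokenizer=NLTKWordTokenizer())
analyzer.train(training_dataset_func=training_cache, validation_dataset_func=validation_cache)
print(analyzer.use("I love this product"))  # one of the trained star ratings, e.g. 5.0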
@@ -5,7 +5,7 @@ import numpy
import tensorflow
import logging

from ..database import Text, Category, CachedDatasetFunc, Review
from ..database import CachedDatasetFunc, TextReview, TokenizedReview
from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS
from ..tokenizer import BaseTokenizer
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError
@@ -19,31 +19,7 @@ else:
log.debug("Tensorflow successfully found GPU acceleration!")


ConversionFunc = t.Callable[[Review], tensorflow.Tensor | tuple]


def build_dataset(dataset_func: CachedDatasetFunc, conversion_func: ConversionFunc, output_signature: tensorflow.TensorSpec | tuple) -> tensorflow.data.Dataset:
"""
Convert a `CachedDatasetFunc` to a `tensorflow.data.Dataset`.
"""

def dataset_generator():
for review in dataset_func():
yield conversion_func(review)

log.debug("Creating dataset...")
dataset = tensorflow.data.Dataset.from_generator(
dataset_generator,
output_signature=output_signature,
)

log.debug("Caching dataset...")
dataset = dataset.cache()

log.debug("Configuring dataset prefetch...")
dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)

return dataset
ConversionFunc = t.Callable[[TextReview], tensorflow.Tensor | tuple]


class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
@@ -52,31 +28,15 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
"""

def __init__(self, *, tokenizer: BaseTokenizer):
if not tokenizer.supports_tensorflow():
raise TypeError("Tokenizer does not support Tensorflow")

super().__init__(tokenizer=tokenizer)

self.trained: bool = False
self.failed: bool = False

self.tokenizer: BaseTokenizer = tokenizer
self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_text_vectorization_layer()
self.string_lookup_layer: tensorflow.keras.layers.StringLookup = tensorflow.keras.layers.StringLookup(max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__)
self.model: tensorflow.keras.Sequential = self._build_model()
self.history: tensorflow.keras.callbacks.History | None = None

def _build_text_vectorization_layer(self) -> tensorflow.keras.layers.TextVectorization:
"""
Create a `tensorflow`-compatible `TextVectorization` layer.
"""
log.debug("Creating TextVectorization layer...")
layer = tensorflow.keras.layers.TextVectorization(
standardize=self.tokenizer.tokenize_tensorflow_and_expand_dims,
max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__
)
log.debug("Created TextVectorization layer: %s", layer)
return layer

@abc.abstractmethod
def _build_model(self) -> tensorflow.keras.Sequential:
"""
@@ -84,33 +44,44 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
"""
raise NotImplementedError()

@abc.abstractmethod
def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
"""
Create a `tensorflow.data.Dataset` from the given `CachedDatasetFunc`.
"""
raise NotImplementedError()

def dataset_generator():
for review in dataset_func():
review: TextReview
review: TokenizedReview = self.tokenizer.tokenize_review(review)
tokens: tensorflow.Tensor = self._tokens_to_tensor(review.tokens)
rating: tensorflow.Tensor = self._rating_to_input(review.rating)
yield tokens, rating

log.debug("Creating dataset...")
dataset = tensorflow.data.Dataset.from_generator(
dataset_generator,
output_signature=(
tensorflow.TensorSpec(shape=(1, None,), dtype=tensorflow.string, name="tokens"),
self._ratingtensor_shape(),
),
)

log.debug("Caching dataset...")
dataset = dataset.cache()

log.debug("Configuring dataset prefetch...")
dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)

return dataset

def _adapt_textvectorization(self, dataset: tensorflow.data.Dataset) -> None:
"""
Adapt the `.text_vectorization_layer` to the given dataset.
"""
log.debug("Preparing dataset to adapt %s...", self.text_vectorization_layer)
log.debug("Preparing dataset to adapt %s...", self.string_lookup_layer)
dataset = dataset.map(lambda text, category: text)
log.debug("Adapting %s...", self.text_vectorization_layer)
self.text_vectorization_layer.adapt(dataset)

def _vectorize_dataset(self, dataset: tensorflow.data.Dataset) -> tensorflow.data.Dataset:
"""
Apply the `.text_vectorization_layer` to the text in the dataset.
"""
def vectorize_entry(text, category):
return self.text_vectorization_layer(text), category

log.debug("Vectorizing dataset: %s", dataset)
dataset = dataset.map(vectorize_entry)
log.debug("Vectorized dataset: %s", dataset)
return dataset
log.debug("Adapting %s...", self.string_lookup_layer)
self.string_lookup_layer.adapt(dataset)

def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
if self.failed:
@@ -120,13 +91,17 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
log.error("Tried to train an already trained model.")
raise AlreadyTrainedError("Cannot re-train an already trained model.")

log.debug("Building training dataset...")
training_set = self._build_dataset(training_dataset_func)

log.debug("Building validation dataset...")
validation_set = self._build_dataset(validation_dataset_func)

self._adapt_textvectorization(training_set)
log.debug("Building vocabulary...")
vocabulary = training_set.map(lambda tokens, rating: tokens)

training_set = self._vectorize_dataset(training_set)
validation_set = self._vectorize_dataset(validation_set)
log.debug("Adapting lookup layer to the vocabulary...")
self.string_lookup_layer.adapt(vocabulary)

log.info("Training: %s", self.model)
self.history: tensorflow.keras.callbacks.History | None = self.model.fit(
@@ -146,25 +121,50 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
log.info("Model %s training succeeded!", self.model)
self.trained = True

@abc.abstractmethod
def _translate_prediction(self, a: numpy.array) -> Category:
@staticmethod
def _tokens_to_tensor(tokens: t.Iterator[str]) -> tensorflow.Tensor:
"""
Convert the results of `tensorflow.keras.Sequential.predict` into a `.Category`.
Convert an iterator of tokens to a `tensorflow.Tensor`.
"""
tensor = tensorflow.convert_to_tensor(
[list(tokens)],
dtype=tensorflow.string,
name="tokens"
)
return tensor

def use(self, text: str) -> float:
if self.failed:
raise NotTrainedError("Cannot use a failed model.")
if not self.trained:
raise NotTrainedError("Cannot use a non-trained model.")

tokens = self.tokenizer.tokenize(text)
tokens = self._tokens_to_tensor(tokens)
prediction = self.model.predict(tokens, verbose=False)
prediction = self._prediction_to_rating(prediction)
return prediction

@abc.abstractmethod
def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
"""
Convert a review rating to a `tensorflow.Tensor`.
"""
raise NotImplementedError()

def use(self, text: Text) -> Category:
if self.failed:
log.error("Tried to use a failed model.")
raise NotTrainedError("Cannot use a failed model.")
if not self.trained:
log.error("Tried to use a non-trained model.")
raise NotTrainedError("Cannot use a non-trained model.")
@abc.abstractmethod
def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
"""
Returns the shape of the tensor output by `._rating_to_tensor` and accepted as input by `._tensor_to_rating`.
"""
raise NotImplementedError()

vector = self.text_vectorization_layer(text)
prediction = self.model.predict(vector, verbose=False)

return self._translate_prediction(prediction)
@abc.abstractmethod
def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
"""
Convert the results of `tensorflow.keras.Sequential.predict` into a review rating.
"""
raise NotImplementedError()


class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
@@ -172,19 +172,10 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
A `tensorflow`-based sentiment analyzer that considers each star rating as a separate category.
"""

def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
return build_dataset(
dataset_func=dataset_func,
conversion_func=Review.to_tensor_tuple_category,
output_signature=(
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="category_one_hot"),
),
)

def _build_model(self) -> tensorflow.keras.Sequential:
log.debug("Creating sequential categorizer model...")
model = tensorflow.keras.Sequential([
self.string_lookup_layer,
tensorflow.keras.layers.Embedding(
input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
@@ -209,15 +200,35 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
log.debug("Compiled model: %s", model)
return model

def _translate_prediction(self, a: numpy.array) -> Category:
max_i = None
max_p = None
for i, p in enumerate(iter(a[0])):
if max_p is None or p > max_p:
max_i = i
max_p = p
result = float(max_i) + 1.0
return float(round(result))
def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
tensor = tensorflow.convert_to_tensor(
[[
1.0 if rating == 1.0 else 0.0,
1.0 if rating == 2.0 else 0.0,
1.0 if rating == 3.0 else 0.0,
1.0 if rating == 4.0 else 0.0,
1.0 if rating == 5.0 else 0.0,
]],
dtype=tensorflow.float32,
name="rating_one_hot"
)
return tensor

def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
spec = tensorflow.TensorSpec(shape=(1, 5), dtype=tensorflow.float32, name="rating_one_hot")
return spec

def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
best_prediction = None
best_prediction_index = None

for index, prediction in enumerate(iter(prediction[0])):
if best_prediction is None or prediction > best_prediction:
best_prediction = prediction
best_prediction_index = index

result = float(best_prediction_index) + 1.0
return result
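The hand-rolled one-hot encoding and maximum search above could presumably also be written with library helpers; a sketch of the equivalence (not what the commit does):

one_hot = tensorflow.one_hot(indices=[int(rating) - 1], depth=5, dtype=tensorflow.float32)  # same (1, 5) rating_one_hot tensor
rating = float(numpy.argmax(prediction[0])) + 1.0  # same argmax-plus-one conversion as _prediction_to_rating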


class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
@@ -225,19 +236,10 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
A `tensorflow`-based sentiment analyzer that uses the floating point value rating to get as close as possible to the correct category.
"""

def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
return build_dataset(
dataset_func=dataset_func,
conversion_func=Review.to_tensor_tuple_normvalue,
output_signature=(
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="category"),
),
)

def _build_model(self) -> tensorflow.keras.Sequential:
log.debug("Creating sequential categorizer model...")
model = tensorflow.keras.Sequential([
self.string_lookup_layer,
tensorflow.keras.layers.Embedding(
input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
@@ -245,7 +247,9 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
tensorflow.keras.layers.Dropout(0.25),
tensorflow.keras.layers.GlobalAveragePooling1D(),
tensorflow.keras.layers.Dropout(0.25),
tensorflow.keras.layers.Dense(1, activation="sigmoid"),
tensorflow.keras.layers.Dense(8),
tensorflow.keras.layers.Dropout(0.25),
tensorflow.keras.layers.Dense(1, activation=tensorflow.keras.activations.sigmoid),
])

log.debug("Compiling model: %s", model)
@@ -257,11 +261,23 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
log.debug("Compiled model: %s", model)
return model

def _translate_prediction(self, a: numpy.array) -> Category:
a: float = a[0, 0]
a = a * 2 + 1
a = float(round(a))
return a
def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
normalized_rating = (rating - 1) / 4
tensor = tensorflow.convert_to_tensor(
[normalized_rating],
dtype=tensorflow.float32,
name="rating_value"
)
return tensor

def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
spec = tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="rating_value")
return spec

def _prediction_to_rating(self, prediction: numpy.array) -> float:
rating: float = prediction[0, 0]
rating = 1.0 if rating < 0.5 else 5.0
return rating
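A worked check of the polar mapping above (illustrative only): ratings are squeezed into [0, 1] on the way in and snapped back to the poles on the way out.

assert (5.0 - 1) / 4 == 1.0 and (1.0 - 1) / 4 == 0.0  # the _rating_to_input normalisation
assert (1.0 if 0.73 < 0.5 else 5.0) == 5.0            # the _prediction_to_rating threshold at 0.5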

__all__ = (
@@ -4,15 +4,15 @@ import shutil
import pathlib
import pickle

from .datatypes import Review
from .datatypes import TextReview

log = logging.getLogger(__name__)


CachedDatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]]
CachedDatasetFunc = t.Callable[[], t.Generator[TextReview, t.Any, None]]


def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
def store_cache(reviews: t.Iterator[TextReview], path: str | pathlib.Path) -> None:
"""
Store the contents of the given `Review` iterator to different files in a directory at the given path.
"""
@@ -54,7 +54,7 @@ def load_cache(path: str | pathlib.Path) -> CachedDatasetFunc:

log.debug("Loading pickle file: %s", document_path)
with open(document_path, "rb") as file:
result: Review = pickle.load(file)
result: TextReview = pickle.load(file)
yield result

return data_cache_loader
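An illustrative round-trip through the cache helpers (the path is hypothetical, and the behaviour when the directory already exists is not shown in this hunk):

store_cache(iter([TextReview(rating=5.0, text="Excellent")]), "./data/example_cache")
cached_func = load_cache("./data/example_cache")
for review in cached_func():
    print(review)  # <TextReview: (5.0*) Excellent>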
@@ -1,75 +1,80 @@
import tensorflow
import abc
import typing as t

from .collections import MongoReview
import logging

log = logging.getLogger(__name__)


Text = str
Category = float
class Review(metaclass=abc.ABCMeta):
"""
Base class for methods common to both review types.
"""

def __init__(self, *, rating: float):
self.rating: float = rating
"""
The star rating of the review, from ``1.0`` to ``5.0``.
"""


class Review:
class TextReview(Review):
"""
Optimized container for a review with the text still intact.

Uses `__slots__` for better performance.
"""

__slots__ = (
"text",
"category",
"rating",
)

def __init__(self, text: Text, category: Category):
def __init__(self, *, rating: float, text: str):
super().__init__(rating=rating)

self.text: str = text
self.category: float = category
"""
The contents of the review.
"""

@classmethod
def from_mongoreview(cls, review: MongoReview):
def from_mongoreview(cls, review: MongoReview) -> "TextReview":
"""
Create a new `.Review` object from a `MongoReview` `dict`.
"""
return cls(
text=review["reviewText"],
category=review["overall"],
rating=review["overall"],
)

def __repr__(self):
return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
return f"<{self.__class__.__qualname__}: ({self.rating}*) {self.text[:80]}>"

def __getitem__(self, item):
if item == 0 or item == "text":
return self.text
elif item == 1 or item == "category":
return self.category
else:
raise KeyError(item)

def normvalue(self) -> float:
return (self.category - 1) / 2
class TokenizedReview(Review):
"""
Optimized container for a review with a tokenized text.

def to_tensor_text(self) -> tensorflow.Tensor:
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
Uses `__slots__` for better performance.
"""

def to_tensor_normvalue(self) -> tensorflow.Tensor:
return tensorflow.convert_to_tensor([self.normvalue()], dtype=tensorflow.float32)
__slots__ = (
"tokens",
"rating",
)

def to_tensor_tuple_normvalue(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
return (
self.to_tensor_text(),
self.to_tensor_normvalue(),
)
def __init__(self, *, rating: float, tokens: t.Iterator[str]):
super().__init__(rating=rating)

def to_tensor_category(self) -> tensorflow.Tensor:
return tensorflow.convert_to_tensor([[
1.0 if self.category == 1.0 else 0.0,
1.0 if self.category == 2.0 else 0.0,
1.0 if self.category == 3.0 else 0.0,
1.0 if self.category == 4.0 else 0.0,
1.0 if self.category == 5.0 else 0.0,
]], dtype=tensorflow.float32)
self.tokens: list[str] = list(tokens)
"""
List of all tokens in the review text.
"""

def to_tensor_tuple_category(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
return (
self.to_tensor_text(),
self.to_tensor_category(),
)
def __repr__(self):
return f"<{self.__class__.__qualname__}: ({self.rating}*) [{len(self.tokens)} tokens]>"


__all__ = (
"Text",
"Category",
"Review",
"TextReview",
"TokenizedReview",
)
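A small sketch of the two new containers in use (illustrative; PlainTokenizer is assumed from the tokenizer package and is not part of this file):

review = TextReview(rating=4.0, text="Good value for the price")
tokenized = PlainTokenizer().tokenize_review(review)
print(tokenized)  # <TokenizedReview: (4.0*) [5 tokens]>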
@@ -4,15 +4,15 @@ import typing as t

from ..config import WORKING_SET_SIZE
from .collections import MongoReview
from .datatypes import Review
from .datatypes import TextReview

log = logging.getLogger(__name__)


SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[Review]]
SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[TextReview]]


def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
"""
Get ``amount`` random reviews from the ``reviews`` collection.
"""
@@ -23,12 +23,12 @@ def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.
{"$sample": {"size": amount}},
])

cursor = map(Review.from_mongoreview, cursor)
cursor = map(TextReview.from_mongoreview, cursor)

return cursor


def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[TextReview]:
"""
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
"""
@@ -43,7 +43,7 @@ def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating:
return cursor


def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
category_amount = amount // 2

log.debug("Getting a sample of %d polar reviews...", category_amount * 2)
@@ -68,12 +68,12 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
}}
])

cursor = map(Review.from_mongoreview, cursor)
cursor = map(TextReview.from_mongoreview, cursor)

return cursor


def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
category_amount = amount // 5

log.debug("Getting a sample of %d varied reviews...", category_amount * 5)
@@ -123,7 +123,7 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
}}
])

cursor = map(Review.from_mongoreview, cursor)
cursor = map(TextReview.from_mongoreview, cursor)

return cursor
@@ -5,7 +5,7 @@ import logging
import pymongo

from .config import TRAINING_SET_SIZE, VALIDATION_SET_SIZE, EVALUATION_SET_SIZE
from .database import SampleFunc, CachedDatasetFunc, mongo_client_from_config, reviews_collection, store_cache, load_cache, delete_cache
from .database import SampleFunc, CachedDatasetFunc, store_cache, load_cache, delete_cache

log = logging.getLogger(__name__)
@@ -1,51 +1,26 @@
import tensorflow
import typing as t
import abc
from ..database.datatypes import TextReview, TokenizedReview


class BaseTokenizer:
class BaseTokenizer(metaclass=abc.ABCMeta):
"""
The base for all tokenizers in this project.
"""

def __repr__(self):
return f"{self.__class__.__qualname__}()"
return f"<{self.__class__.__qualname__}>"

@staticmethod
def __not_implemented(f):
f.__notimplemented__ = True
return f

def supports_plain(self) -> bool:
return not getattr(self.tokenize_plain, "__notimplemented__", False)

def supports_tensorflow(self) -> bool:
return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)

@__not_implemented
def tokenize_plain(self, text: str) -> str:
@abc.abstractmethod
def tokenize(self, text: str) -> t.Iterator[str]:
"""
Convert a text `str` into another `str` containing a series of whitespace-separated tokens.
"""
raise NotImplementedError()

def tokenize_and_split_plain(self, text: str) -> list[str]:
def tokenize_review(self, review: TextReview) -> TokenizedReview:
"""
Run `.tokenize_plain`, then split the result using `str.split`.
Apply `.tokenize` to the text of a `TextReview`, converting it into a `TokenizedReview`.
"""
text = self.tokenize_plain(text)
text = text.split()
return text

@__not_implemented
def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
"""
Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
"""
raise NotImplementedError()

def tokenize_tensorflow_and_expand_dims(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
"""
Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
"""
text = self.tokenize_tensorflow(text)
text = tensorflow.expand_dims(text, -1, name="tokens")
return text
tokens = self.tokenize(review.text)
return TokenizedReview(rating=review.rating, tokens=tokens)
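Under the new abstract interface a concrete tokenizer only has to provide `tokenize`; a minimal illustrative subclass (essentially what the LowercaseTokenizer further down already does):

class WhitespaceLowerTokenizer(BaseTokenizer):
    def tokenize(self, text: str) -> t.Iterator[str]:
        return iter(text.lower().split())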
@@ -1,10 +1,15 @@
import abc
import tokenizers
import typing as t

from .base import BaseTokenizer


class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
"""
Abstract tokenizer to implement any tokenizer based on HuggingFace `tokenizers.Tokenizer`.
"""

def __init__(self):
super().__init__()
self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()
@@ -12,11 +17,15 @@ class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
raise NotImplementedError()

def tokenize_plain(self, text: str) -> str:
return " ".join(self.hug.encode(text).tokens)
def tokenize(self, text: str) -> t.Iterator[str]:
return self.hug.encode(text).tokens


class HuggingBertTokenizer(HuggingTokenizer):
"""
Tokenizer based on the `bert-base-cased <https://huggingface.co/bert-base-cased>`_ tokenizer.
"""

def _build_hugging_tokenizer(self):
return tokenizers.Tokenizer.from_pretrained("bert-base-cased")
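For reference, a sketch of what the wrapped HuggingFace tokenizer yields (requires the tokenizers package and fetches the bert-base-cased vocabulary on first use; the output shown is indicative):

hug = tokenizers.Tokenizer.from_pretrained("bert-base-cased")
print(hug.encode("Great phone!").tokens)  # e.g. ['Great', 'phone', '!']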
@@ -1,17 +1,14 @@
import tensorflow
import typing as t

from .base import BaseTokenizer


class LowercaseTokenizer(BaseTokenizer):
"""
Tokenizer which converts the words to lowercase before splitting them via spaces.
Tokenizer which converts the words to lowercase before splitting them with `str.split`.
"""

def tokenize_plain(self, text: str) -> str:
def tokenize(self, text: str) -> t.Iterator[str]:
text = text.lower()
return text

def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
text = tensorflow.strings.lower(text)
return text
tokens = text.split()
return tokens
@@ -9,7 +9,7 @@ class NLTKWordTokenizer(BaseTokenizer):
Tokenizer based on `nltk.word_tokenize`.
"""

def tokenize_plain(self, text: str) -> str:
def tokenize(self, text: str) -> str:
tokens = nltk.word_tokenize(text)
nltk.sentiment.util.mark_negation(tokens, shallow=True)
return " ".join(tokens)
@@ -1,15 +1,13 @@
import tensorflow
import typing as t

from .base import BaseTokenizer


class PlainTokenizer(BaseTokenizer):
"""
Tokenizer which just splits the text into tokens by separating them at whitespaces.
Tokenizer which just splits the text into tokens by separating them at whitespaces with `str.split`.
"""

def tokenize_plain(self, text: str) -> str:
return text

def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
return text
def tokenize(self, text: str) -> t.Iterator[str]:
tokens = text.split()
return tokens
@@ -1,4 +1,3 @@
import tensorflow
import re
import html.entities
import typing as t
@@ -11,7 +10,7 @@ class PottsTokenizer(BaseTokenizer):
"""
Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.

This module is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
This class is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
"""

# noinspection RegExpRepeatedSpace
@@ -76,7 +75,7 @@ class PottsTokenizer(BaseTokenizer):
amp = "&"

@classmethod
def __html2string(cls, s: str) -> str:
def html_entities_to_chr(cls, s: str) -> str:
"""
Internal method that seeks to replace all the HTML entities in s with their corresponding characters.
"""
@@ -102,24 +101,41 @@ class PottsTokenizer(BaseTokenizer):
s = s.replace(cls.amp, " and ")
return s

def tokenize_plain(self, text: str) -> str:
@classmethod
def lower_but_preserve_emoticons(cls, word):
"""
Internal method which lowercases the word if it does not match `.emoticon_re`.
"""
if cls.emoticon_re.search(word):
return word
else:
return word.lower()

def tokenize(self, text: str) -> t.Iterator[str]:
# Fix HTML character entities
s = self.__html2string(text)
text = self.html_entities_to_chr(text)
# Tokenize
words = self.words_re.findall(s)
tokens = self.words_re.findall(text)
# Possibly alter the case, but avoid changing emoticons like :D into :d:
words = list(map(lambda x: x if self.emoticon_re.search(x) else x.lower(), words))
# Re-join words
result = " ".join(words)
tokens = map(self.lower_but_preserve_emoticons, tokens)
# Return the result
return result
return tokens


class PottsTokenizerWithNegation(PottsTokenizer):
def tokenize_plain(self, text: str) -> str:
words = super().tokenize_plain(text).split()
"""
Version of `.PottsTokenizer` which after tokenizing applies `nltk.sentiment.util.mark_negation`.
"""

def tokenize(self, text: str) -> str:
# Apply the base tokenization
words = super().tokenize(text)
# Convert the iterator to a list (sigh)
words = list(words)
# Use nltk to mark negation
nltk.sentiment.util.mark_negation(words, shallow=True)
return " ".join(words)
# Return the result
return words
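A small illustration of the emoticon-preserving lowercasing defined above (expected output is indicative):

tokens = list(PottsTokenizer().tokenize("GREAT service :D"))
# likely ['great', 'service', ':D']: words are lowercased, while the emoticon keeps
# its case thanks to lower_but_preserve_emoticons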


__all__ = (