From b77723673547cc324421aaba291b9e2f01bb5a02 Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi
Date: Tue, 14 Feb 2023 02:25:38 +0100
Subject: [PATCH] Various upgrades

---
 unimore_bda_6/__main__.py      |  33 ++++---
 unimore_bda_6/analysis/base.py | 168 +++++++++++++++++++++++++--------
 unimore_bda_6/config.py        |  29 ++++++
 unimore_bda_6/log.py           |   9 +-
 4 files changed, 189 insertions(+), 50 deletions(-)

diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index 849a245..c42952c 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -5,10 +5,10 @@ from .log import install_general_log_handlers
 
 install_general_log_handlers()
 
-from .config import config
+from .config import config, TARGET_RUNS, MAXIMUM_RUNS
 from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
 from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer, ThreeCheat
-from .analysis.base import TrainingFailedError
+from .analysis.base import TrainingFailedError, EvaluationResults
 from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
 from .gathering import Caches
 
@@ -42,7 +42,7 @@ def main():
         slog.debug("Selected sample_func: %s", sample_func.__name__)
 
         for SentimentAnalyzer in [
-            ThreeCheat,
+            # ThreeCheat,
            NLTKSentimentAnalyzer,
            TensorflowPolarSentimentAnalyzer,
            TensorflowCategorySentimentAnalyzer,
@@ -67,17 +67,25 @@
                 slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
                 slog.debug("Selected Tokenizer: %s", Tokenizer.__name__)
 
-                run_counter = 0
+                runs = 0
+                successful_runs = 0
+                cumulative_evaluation_results = EvaluationResults()
 
                 while True:
-                    slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}.{run_counter}")
-                    run_counter += 1
-                    slog.debug("Run #%d", run_counter)
+                    slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
 
-                    if run_counter >= 100:
-                        slog.fatal("Exceeded 100 runs, giving up and exiting...")
-                        exit(2)
+                    if successful_runs >= TARGET_RUNS.__wrapped__:
+                        slog.info("Reached target of %d runs, moving on...", TARGET_RUNS.__wrapped__)
+                        break
+
+                    if runs >= MAXIMUM_RUNS.__wrapped__:
+                        slog.fatal("Exceeded %d runs, giving up and exiting...", MAXIMUM_RUNS.__wrapped__)
+                        break
+
+                    runs += 1
+                    slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}.{runs}")
+                    slog.debug("Run #%d", runs)
 
                     try:
                         slog.debug("Instantiating %s with %s...", SentimentAnalyzer.__name__, Tokenizer.__name__)
@@ -97,12 +105,15 @@ def main():
                     else:
                         slog.info("Training succeeded!")
 
-                        slog.info("Evaluating sentiment analyzer: %s", sa)
                         evaluation_results = sa.evaluate(evaluation_dataset_func=datasets.evaluation)
                         slog.info("Evaluation results: %s", evaluation_results)
 
+                        successful_runs += 1
+                        cumulative_evaluation_results += evaluation_results
                         break
 
+                slog.info("Cumulative evaluation results: %s", cumulative_evaluation_results)
+
 
 if __name__ == "__main__":
     main()
diff --git a/unimore_bda_6/analysis/base.py b/unimore_bda_6/analysis/base.py
index 93d7c2e..62428b1 100644
--- a/unimore_bda_6/analysis/base.py
+++ b/unimore_bda_6/analysis/base.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import abc
 import logging
-import dataclasses
+import collections
 
 from ..database import CachedDatasetFunc
 from ..tokenizer import BaseTokenizer
@@ -39,54 +39,148 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
         """
         Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
         """
-
-        # TODO: Add precision and recall measures
-
-        evaluated: int = 0
-
-        perfect: int = 0
-
-        squared_error: float = 0.0
-
+        er = EvaluationResults()
         for review in evaluation_dataset_func():
-            resulting_category = self.use(review.text)
-            log.debug("Evaluation step: %.1d* for %s", resulting_category, review)
-            evaluated += 1
-            try:
-                perfect += 1 if resulting_category == review.rating else 0
-                squared_error += (resulting_category - review.rating) ** 2
-            except ValueError:
-                log.warning("Model execution on %s resulted in a NaN value: %s", review, resulting_category)
-
-        return EvaluationResults(perfect=perfect, evaluated=evaluated, mse=squared_error / evaluated)
+            er.add(expected=review.rating, predicted=self.use(review.text))
+        return er
 
 
-@dataclasses.dataclass
 class EvaluationResults:
     """
     Container for the results of a dataset evaluation.
     """
 
-    evaluated: int
-    """
-    The number of reviews that were evaluated.
-    """
+    def __init__(self):
+        self.confusion_matrix: dict[float, dict[float, int]] = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
+        """
+        Confusion matrix of the evaluation.
 
-    perfect: int
-    """
-    The number of reviews for which the model returned the correct rating.
-    """
+        First key is the expected rating, second key is the output label.
+        """
 
-    mse: float
-    """
-    Mean squared error
-    """
+        self.absolute_error_total: float = 0.0
+        """
+        Sum of the absolute errors committed in the evaluation.
+        """
 
-    def __repr__(self):
-        return f""
+        self.squared_error_total: float = 0.0
+        """
+        Sum of the squared errors committed in the evaluation.
+        """
 
-    def __str__(self):
-        return f"Evaluation results:\t{self.evaluated}\tevaluated\t{self.perfect}\tperfect\t{self.perfect / self.evaluated:.2%}\taccuracy\t{self.mse / self.evaluated:.2}\tmean squared error"
+    def __repr__(self) -> str:
+        return f""
+
+    def __str__(self) -> str:
+        text = [f"Evaluation results: {self.evaluated_count()} evaluated, {self.mean_absolute_error()} mean absolute error, {self.mean_squared_error()} mean squared error, "]
+        for key in self.keys():
+            text.append(f"{self.recall(key)} recall of {key}, ")
+            text.append(f"{self.precision(key)} precision of {key}, ")
+        text.append(f"{self.perfect_count()} perfect matches.")
+        return "".join(text)
+
+    def __add__(self, other: EvaluationResults) -> EvaluationResults:
+        new = self.__class__()
+        for expected, value in self.confusion_matrix.items():
+            for predicted, amount in value.items():
+                new.confusion_matrix[expected][predicted] += amount
+        for expected, value in other.confusion_matrix.items():
+            for predicted, amount in value.items():
+                new.confusion_matrix[expected][predicted] += amount
+        return new
+
+    def keys(self) -> set[float]:
+        """
+        Return all processed categories.
+        """
+        keys: set[float] = set()
+
+        for expected, value in self.confusion_matrix.items():
+            keys.add(expected)
+            for predicted, _ in value.items():
+                keys.add(predicted)
+
+        return keys
+
+    def evaluated_count(self) -> int:
+        """
+        Return the total number of evaluated reviews.
+        """
+        total: int = 0
+        for row in self.confusion_matrix.values():
+            for el in row.values():
+                total += el
+        return total
+
+    def perfect_count(self) -> int:
+        """
+        Return the total number of perfect reviews.
+        """
+        total: int = 0
+        for key in self.keys():
+            total += self.confusion_matrix[key][key]
+        return total
+
+    def recall_count(self, rating: float) -> int:
+        """
+        Return the number of reviews processed with the given rating.
+        """
+        total: int = 0
+        for el in self.confusion_matrix[rating].values():
+            total += el
+        return total
+
+    def precision_count(self, rating: float) -> int:
+        """
+        Return the number of reviews for which the model returned the given rating.
+        """
+        total: int = 0
+        for col in self.confusion_matrix.values():
+            total += col[rating]
+        return total
+
+    def recall(self, rating: float) -> float:
+        """
+        Return the recall for a given rating.
+        """
+        try:
+            return self.confusion_matrix[rating][rating] / self.recall_count(rating)
+        except ZeroDivisionError:
+            return float("inf")
+
+    def precision(self, rating: float) -> float:
+        """
+        Return the precision for a given rating.
+        """
+        try:
+            return self.confusion_matrix[rating][rating] / self.precision_count(rating)
+        except ZeroDivisionError:
+            return float("inf")
+
+    def mean_absolute_error(self) -> float:
+        """
+        Return the mean absolute error.
+        """
+        return self.absolute_error_total / self.evaluated_count()
+
+    def mean_squared_error(self) -> float:
+        """
+        Return the mean squared error.
+        """
+        return self.squared_error_total / self.evaluated_count()
+
+    def add(self, expected: float, predicted: float) -> None:
+        """
+        Count a new prediction.
+        """
+        if expected == predicted:
+            log.log(11, "Expected %.1d*, predicted %.1d*", expected, predicted)  # Success
+        else:
+            log.log(12, "Expected %.1d*, predicted %.1d*", expected, predicted)  # Failure
+
+        self.confusion_matrix[expected][predicted] += 1
+        self.absolute_error_total += abs(expected - predicted)
+        self.squared_error_total += (expected - predicted) ** 2
 
 
 class AlreadyTrainedError(Exception):
diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py
index c7c0266..d283f8c 100644
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@@ -125,6 +125,35 @@ def TENSORFLOW_EPOCHS(val: str | None) -> int:
         raise cfig.InvalidValueError("Not an int.")
 
 
+@config.optional()
+def TARGET_RUNS(val: str | None) -> int:
+    """
+    The amount of successful runs to perform on a sample-model-tokenizer combination.
+    Defaults to `1`.
+    """
+    if val is None:
+        return 1
+    try:
+        return int(val)
+    except ValueError:
+        raise cfig.InvalidValueError("Not an int.")
+
+
+@config.optional()
+def MAXIMUM_RUNS(val: str | None) -> int:
+    """
+    The maximum amount of runs to perform on a sample-model-tokenizer combination before skipping it.
+    Defaults to `25`.
+    """
+    if val is None:
+        return 25
+    try:
+        return int(val)
+    except ValueError:
+        raise cfig.InvalidValueError("Not an int.")
+
+
+
 __all__ = (
     "config",
     "MONGO_HOST",
diff --git a/unimore_bda_6/log.py b/unimore_bda_6/log.py
index 4c6c391..aec6785 100644
--- a/unimore_bda_6/log.py
+++ b/unimore_bda_6/log.py
@@ -3,6 +3,9 @@ import logging
 import coloredlogs
 import pathlib
 
+logging.addLevelName(11, "SUCCESS")
+logging.addLevelName(12, "FAILURE")
+
 this_log = logging.getLogger(__name__)
 
 
@@ -34,9 +37,11 @@ def install_general_log_handlers():
         level_styles=dict(
             debug=dict(color="white"),
             info=dict(color="cyan"),
-            warning=dict(color="yellow"),
-            error=dict(color="red"),
+            warning=dict(color="yellow", bold=True),
+            error=dict(color="red", bold=True),
             critical=dict(color="black", background="red", bold=True),
+            success=dict(color="green"),
+            failure=dict(color="yellow"),
         ),
         field_styles=dict(
             asctime=dict(color='magenta'),
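
A usage sketch, not part of the patch above: it shows how the EvaluationResults class added in unimore_bda_6/analysis/base.py can be exercised on its own, assuming the patched package and its dependencies are importable. The (expected, predicted) rating pairs are invented for illustration.

    from unimore_bda_6.analysis.base import EvaluationResults

    # Collect two separate evaluations; add() updates the confusion matrix
    # and the absolute/squared error totals for each prediction.
    first = EvaluationResults()
    for expected, predicted in [(5.0, 5.0), (4.0, 5.0), (1.0, 2.0)]:
        first.add(expected=expected, predicted=predicted)

    second = EvaluationResults()
    for expected, predicted in [(5.0, 4.0), (3.0, 3.0)]:
        second.add(expected=expected, predicted=predicted)

    # __add__ returns a new object whose confusion matrix is the sum of the two,
    # mirroring how __main__ accumulates cumulative_evaluation_results across runs.
    cumulative = first + second
    print(cumulative.evaluated_count())  # 5
    print(cumulative.perfect_count())    # 2: the 5.0 -> 5.0 and 3.0 -> 3.0 predictions
    print(cumulative.recall(5.0))        # 0.5: one of two reviews rated 5.0 was predicted as 5.0
    print(cumulative.precision(5.0))     # 0.5: one of two 5.0 predictions was actually rated 5.0

Since __add__ copies only the confusion matrices, mean_absolute_error() and mean_squared_error() are most meaningful on the individual results rather than on the summed object.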