bda-6-steffo/unimore_bda_6/analysis/base.py

from __future__ import annotations

import abc
import logging
import collections

from ..database import CachedDatasetFunc
from ..tokenizer import BaseTokenizer

log = logging.getLogger(__name__)


class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
    """
    Abstract base class for sentiment analyzers implemented in this project.
    """

    def __init__(self, *, tokenizer: BaseTokenizer):
        self.tokenizer: BaseTokenizer = tokenizer

    def __repr__(self):
        return f"<{self.__class__.__qualname__} with {self.tokenizer} tokenizer>"

    @abc.abstractmethod
    def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
        """
        Train the analyzer with the given training and validation datasets.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def use(self, text: str) -> float:
        """
        Run the model on the given input, and return the predicted rating.
        """
        raise NotImplementedError()

    def evaluate(self, evaluation_dataset_func: CachedDatasetFunc) -> EvaluationResults:
        """
        Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
        """
        er = EvaluationResults()
        for review in evaluation_dataset_func():
            er.add(expected=review.rating, predicted=self.use(review.text))
        return er


class EvaluationResults:
    """
    Container for the results of a dataset evaluation.
    """

    def __init__(self):
        self.confusion_matrix: dict[float, dict[float, int]] = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
        """
        Confusion matrix of the evaluation.

        First key is the expected rating, second key is the output label.
        """

        self.absolute_error_total: float = 0.0
        """
        Sum of the absolute errors committed in the evaluation.
        """

        self.squared_error_total: float = 0.0
        """
        Sum of the squared errors committed in the evaluation.
        """

    def __repr__(self) -> str:
        return f"<EvaluationResults with {self.evaluated_count()} evaluated and {len(self.keys())} categories>"

    def __str__(self) -> str:
        text = [f"Evaluation results: {self.evaluated_count()} evaluated, {self.mean_absolute_error()} mean absolute error, {self.mean_squared_error()} mean squared error, "]
        for key in self.keys():
            text.append(f"{self.recall(key)} recall of {key}, ")
            text.append(f"{self.precision(key)} precision of {key}, ")
        text.append(f"{self.perfect_count()} perfect matches.")
        return "".join(text)

    def __add__(self, other: EvaluationResults) -> EvaluationResults:
        new = self.__class__()
        new.absolute_error_total = self.absolute_error_total + other.absolute_error_total
        new.squared_error_total = self.squared_error_total + other.squared_error_total
        for expected, value in self.confusion_matrix.items():
            for predicted, amount in value.items():
                new.confusion_matrix[expected][predicted] += amount
        for expected, value in other.confusion_matrix.items():
            for predicted, amount in value.items():
                new.confusion_matrix[expected][predicted] += amount
        return new

    def keys(self) -> set[float]:
        """
        Return all processed categories.
        """
        keys: set[float] = set()

        for expected, value in self.confusion_matrix.items():
            keys.add(expected)
            for predicted, _ in value.items():
                keys.add(predicted)

        return keys

    def evaluated_count(self) -> int:
        """
        Return the total number of evaluated reviews.
        """
        total: int = 0
        for row in self.confusion_matrix.values():
            for el in row.values():
                total += el
        return total

    def perfect_count(self) -> int:
        """
        Return the total number of perfect reviews.
        """
        total: int = 0
        for key in self.keys():
            total += self.confusion_matrix[key][key]
        return total

    def recall_count(self, rating: float) -> int:
        """
        Return the number of reviews processed with the given rating.
        """
        total: int = 0
        for el in self.confusion_matrix[rating].values():
            total += el
        return total

    def precision_count(self, rating: float) -> int:
        """
        Return the number of reviews for which the model returned the given rating.
        """
        total: int = 0
        for col in self.confusion_matrix.values():
            total += col[rating]
        return total

    def recall(self, rating: float) -> float:
        """
        Return the recall for a given rating.
        """
        try:
            return self.confusion_matrix[rating][rating] / self.recall_count(rating)
        except KeyError:
            return float("NaN")
        except ZeroDivisionError:
            return float("inf")

    def precision(self, rating: float) -> float:
        """
        Return the precision for a given rating.
        """
        try:
            return self.confusion_matrix[rating][rating] / self.precision_count(rating)
        except KeyError:
            return float("NaN")
        except ZeroDivisionError:
            return float("inf")

    def mean_absolute_error(self) -> float:
        """
        Return the mean absolute error.
        """
        return self.absolute_error_total / self.evaluated_count()

    def mean_squared_error(self) -> float:
        """
        Return the mean squared error.
        """
        return self.squared_error_total / self.evaluated_count()

    def add(self, expected: float, predicted: float) -> None:
        """
        Count a new prediction.
        """
        if expected == predicted:
            log.log(11, "Expected %.1d*, predicted %.1d*", expected, predicted)  # Success
        else:
            log.log(12, "Expected %.1d*, predicted %.1d*", expected, predicted)  # Failure

        self.confusion_matrix[expected][predicted] += 1
        self.absolute_error_total += abs(expected - predicted)
        self.squared_error_total += (expected - predicted) ** 2


class AlreadyTrainedError(Exception):
    """
    This model has already been trained and cannot be trained again.
    """


class NotTrainedError(Exception):
    """
    This model has not been trained yet.
    """


class TrainingFailedError(Exception):
    """
    The model wasn't able to complete the training and should not be used anymore.
    """


__all__ = (
    "BaseSentimentAnalyzer",
    "AlreadyTrainedError",
    "NotTrainedError",
    "TrainingFailedError",
)
enough 2023-02-08 18:46:05 +00:00			`from __future__ import annotations`

Working prototype 2023-02-02 01:56:37 +00:00			`import abc`
New version working nicely 2023-02-03 22:27:44 +00:00			`import logging`
Various upgrades 2023-02-14 01:25:38 +00:00			`import collections`
Refactor things to work better 2023-02-02 16:24:11 +00:00
Remove unused imports 2023-02-13 14:42:45 +00:00			`from ..database import CachedDatasetFunc`
enough 2023-02-08 18:46:05 +00:00			`from ..tokenizer import BaseTokenizer`
Refactor things to work better 2023-02-02 16:24:11 +00:00
New version working nicely 2023-02-03 22:27:44 +00:00			`log = logging.getLogger(__name__)`
Working prototype 2023-02-02 01:56:37 +00:00

New version working nicely 2023-02-03 22:27:44 +00:00			`class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
			`Abstract base class for sentiment analyzers implemented in this project.`
			`"""`

enough 2023-02-08 18:46:05 +00:00			`def __init__(self, *, tokenizer: BaseTokenizer):`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`self.tokenizer: BaseTokenizer = tokenizer`
enough 2023-02-08 18:46:05 +00:00
			`def __repr__(self):`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`return f"<{self.__class__.__qualname__} with {self.tokenizer} tokenizer>"`
enough 2023-02-08 18:46:05 +00:00
Working prototype 2023-02-02 01:56:37 +00:00			`@abc.abstractmethod`
enough 2023-02-08 18:46:05 +00:00			`def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
enough 2023-02-08 18:46:05 +00:00			`Train the analyzer with the given training and validation datasets.`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
			`raise NotImplementedError()`

enough 2023-02-08 18:46:05 +00:00			`@abc.abstractmethod`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`def use(self, text: str) -> float:`
enough 2023-02-08 18:46:05 +00:00			`"""`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`Run the model on the given input, and return the predicted rating.`
enough 2023-02-08 18:46:05 +00:00			`"""`
			`raise NotImplementedError()`

			`def evaluate(self, evaluation_dataset_func: CachedDatasetFunc) -> EvaluationResults:`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
New version working nicely 2023-02-03 22:27:44 +00:00			Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
			`"""`
Various upgrades 2023-02-14 01:25:38 +00:00			`er = EvaluationResults()`
			`for review in evaluation_dataset_func():`
			`er.add(expected=review.rating, predicted=self.use(review.text))`
			`return er`
Getting closer... 2023-02-04 05:14:24 +00:00
Add reminder to myself 2023-02-13 17:47:29 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`class EvaluationResults:`
			`"""`
			`Container for the results of a dataset evaluation.`
			`"""`
CODE IS DONE 2023-02-12 04:11:58 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`def __init__(self):`
			`self.confusion_matrix: dict[float, dict[float, int]] = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))`
			`"""`
			`Confusion matrix of the evaluation.`
CODE IS DONE 2023-02-12 04:11:58 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`First key is the expected rating, second key is the output label.`
			`"""`
Working prototype 2023-02-02 01:56:37 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`self.absolute_error_total: float = 0.0`
			`"""`
			`Sum of the absolute errors committed in the evaluation.`
			`"""`
Working prototype 2023-02-02 01:56:37 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`self.squared_error_total: float = 0.0`
			`"""`
			`Sum of the squared errors committed in the evaluation.`
			`"""`
PEP8 2023-02-03 16:50:40 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`def __repr__(self) -> str:`
			`return f"<EvaluationResults with {self.evaluated_count()} evaluated and {len(self.keys())} categories>"`

			`def __str__(self) -> str:`
			`text = [f"Evaluation results: {self.evaluated_count()} evaluated, {self.mean_absolute_error()} mean absolute error, {self.mean_squared_error()} mean squared error, "]`
			`for key in self.keys():`
			`text.append(f"{self.recall(key)} recall of {key}, ")`
			`text.append(f"{self.precision(key)} precision of {key}, ")`
			`text.append(f"{self.perfect_count()} perfect matches.")`
			`return "".join(text)`

			`def __add__(self, other: EvaluationResults) -> EvaluationResults:`
			`new = self.__class__()`
Properly accumulate errors in `EvaluationResults` 2023-04-13 23:47:59 +00:00			`new.absolute_error_total = self.absolute_error_total + other.absolute_error_total`
			`new.squared_error_total = self.squared_error_total + other.squared_error_total`
Various upgrades 2023-02-14 01:25:38 +00:00			`for expected, value in self.confusion_matrix.items():`
			`for predicted, amount in value.items():`
			`new.confusion_matrix[expected][predicted] += amount`
			`for expected, value in other.confusion_matrix.items():`
			`for predicted, amount in value.items():`
			`new.confusion_matrix[expected][predicted] += amount`
			`return new`

			`def keys(self) -> set[float]:`
			`"""`
			`Return all processed categories.`
			`"""`
			`keys: set[float] = set()`
enough 2023-02-08 18:46:05 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`for expected, value in self.confusion_matrix.items():`
			`keys.add(expected)`
			`for predicted, _ in value.items():`
			`keys.add(predicted)`
enough 2023-02-08 18:46:05 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`return keys`
CODE IS DONE 2023-02-12 04:11:58 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`def evaluated_count(self) -> int:`
			`"""`
			`Return the total number of evaluated reviews.`
			`"""`
			`total: int = 0`
			`for row in self.confusion_matrix.values():`
			`for el in row.values():`
			`total += el`
			`return total`
CODE IS DONE 2023-02-12 04:11:58 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`def perfect_count(self) -> int:`
			`"""`
			`Return the total number of perfect reviews.`
			`"""`
			`total: int = 0`
			`for key in self.keys():`
			`total += self.confusion_matrix[key][key]`
			`return total`
enough 2023-02-08 18:46:05 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`def recall_count(self, rating: float) -> int:`
			`"""`
			`Return the number of reviews processed with the given rating.`
			`"""`
			`total: int = 0`
			`for el in self.confusion_matrix[rating].values():`
			`total += el`
			`return total`

			`def precision_count(self, rating: float) -> int:`
			`"""`
			`Return the number of reviews for which the model returned the given rating.`
			`"""`
			`total: int = 0`
			`for col in self.confusion_matrix.values():`
			`total += col[rating]`
			`return total`

			`def recall(self, rating: float) -> float:`
			`"""`
			`Return the recall for a given rating.`
			`"""`
			`try:`
			`return self.confusion_matrix[rating][rating] / self.recall_count(rating)`
Write results to a `./data/logs/results.tsv` file as well 2023-05-08 02:16:32 +00:00			`except KeyError:`
			`return float("NaN")`
Various upgrades 2023-02-14 01:25:38 +00:00			`except ZeroDivisionError:`
			`return float("inf")`

			`def precision(self, rating: float) -> float:`
			`"""`
			`Return the precision for a given rating.`
			`"""`
			`try:`
			`return self.confusion_matrix[rating][rating] / self.precision_count(rating)`
Write results to a `./data/logs/results.tsv` file as well 2023-05-08 02:16:32 +00:00			`except KeyError:`
			`return float("NaN")`
Various upgrades 2023-02-14 01:25:38 +00:00			`except ZeroDivisionError:`
			`return float("inf")`

			`def mean_absolute_error(self) -> float:`
			`"""`
			`Return the mean absolute error.`
			`"""`
			`return self.absolute_error_total / self.evaluated_count()`

			`def mean_squared_error(self) -> float:`
			`"""`
			`Return the mean squared error.`
			`"""`
			`return self.squared_error_total / self.evaluated_count()`
enough 2023-02-08 18:46:05 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`def add(self, expected: float, predicted: float) -> None:`
			`"""`
			`Count a new prediction.`
			`"""`
			`if expected == predicted:`
			`log.log(11, "Expected %.1d, predicted %.1d", expected, predicted) # Success`
			`else:`
			`log.log(12, "Expected %.1d, predicted %.1d", expected, predicted) # Failure`

			`self.confusion_matrix[expected][predicted] += 1`
			`self.absolute_error_total += abs(expected - predicted)`
			`self.squared_error_total += (expected - predicted) ** 2`
Working prototype 2023-02-02 01:56:37 +00:00

stop here for now 2023-02-04 00:36:42 +00:00			`class AlreadyTrainedError(Exception):`
			`"""`
			`This model has already been trained and cannot be trained again.`
			`"""`


			`class NotTrainedError(Exception):`
			`"""`
			`This model has not been trained yet.`
			`"""`


stuff's working 2023-02-08 09:54:14 +00:00			`class TrainingFailedError(Exception):`
			`"""`
			`The model wasn't able to complete the training and should not be used anymore.`
			`"""`


Working prototype 2023-02-02 01:56:37 +00:00			`__all__ = (`
New version working nicely 2023-02-03 22:27:44 +00:00			`"BaseSentimentAnalyzer",`
stop here for now 2023-02-04 00:36:42 +00:00			`"AlreadyTrainedError",`
			`"NotTrainedError",`
stuff's working 2023-02-08 09:54:14 +00:00			`"TrainingFailedError",`
Working prototype 2023-02-02 01:56:37 +00:00			`)`