diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index 849a245..c42952c 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -5,10 +5,10 @@ from .log import install_general_log_handlers
 
 install_general_log_handlers()
 
-from .config import config
+from .config import config, TARGET_RUNS, MAXIMUM_RUNS
 from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
 from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer, ThreeCheat
-from .analysis.base import TrainingFailedError
+from .analysis.base import TrainingFailedError, EvaluationResults
 from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
 from .gathering import Caches
 
@@ -42,7 +42,7 @@ def main():
         slog.debug("Selected sample_func: %s", sample_func.__name__)
 
         for SentimentAnalyzer in [
-            ThreeCheat,
+            # ThreeCheat,
            NLTKSentimentAnalyzer,
            TensorflowPolarSentimentAnalyzer,
            TensorflowCategorySentimentAnalyzer,
@@ -67,17 +67,25 @@ def main():
                 slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
                 slog.debug("Selected Tokenizer: %s", Tokenizer.__name__)
 
-                run_counter = 0
+                runs = 0
+                successful_runs = 0
+                cumulative_evaluation_results = EvaluationResults()
 
                 while True:
-                    slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}.{run_counter}")
-                    run_counter += 1
-                    slog.debug("Run #%d", run_counter)
+                    slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
 
-                    if run_counter >= 100:
-                        slog.fatal("Exceeded 100 runs, giving up and exiting...")
-                        exit(2)
+                    if successful_runs >= TARGET_RUNS.__wrapped__:
+                        slog.info("Reached target of %d runs, moving on...", TARGET_RUNS.__wrapped__)
+                        break
+
+                    if runs >= MAXIMUM_RUNS.__wrapped__:
+                        slog.fatal("Exceeded %d runs, giving up and moving on...", MAXIMUM_RUNS.__wrapped__)
+                        break
+
+                    runs += 1
+                    slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}.{runs}")
+                    slog.debug("Run #%d", runs)
 
                     try:
                         slog.debug("Instantiating %s with %s...", SentimentAnalyzer.__name__, Tokenizer.__name__)
@@ -97,12 +105,15 @@ def main():
                         else:
                             slog.info("Training succeeded!")
 
-                            slog.info("Evaluating sentiment analyzer: %s", sa)
                             evaluation_results = sa.evaluate(evaluation_dataset_func=datasets.evaluation)
                             slog.info("Evaluation results: %s", evaluation_results)
+                            successful_runs += 1
+                            cumulative_evaluation_results += evaluation_results
                             break
 
+                slog.info("Cumulative evaluation results: %s", cumulative_evaluation_results)
+
 
 
 if __name__ == "__main__":
     main()
diff --git a/unimore_bda_6/analysis/base.py b/unimore_bda_6/analysis/base.py
index 93d7c2e..62428b1 100644
--- a/unimore_bda_6/analysis/base.py
+++ b/unimore_bda_6/analysis/base.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import abc
 import logging
-import dataclasses
+import collections
 
 from ..database import CachedDatasetFunc
 from ..tokenizer import BaseTokenizer
@@ -39,54 +39,150 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
         """
         Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
         """
-
-        # TODO: Add precision and recall measures
-
-        evaluated: int = 0
-
-        perfect: int = 0
-
-        squared_error: float = 0.0
-
+        er = EvaluationResults()
         for review in evaluation_dataset_func():
-            resulting_category = self.use(review.text)
-            log.debug("Evaluation step: %.1d* for %s", resulting_category, review)
-            evaluated += 1
-            try:
-                perfect += 1 if resulting_category == review.rating else 0
-                squared_error += (resulting_category - review.rating) ** 2
-            except ValueError:
-                log.warning("Model execution on %s resulted in a NaN value: %s", review, resulting_category)
-
-        return EvaluationResults(perfect=perfect, evaluated=evaluated, mse=squared_error / evaluated)
+            er.add(expected=review.rating, predicted=self.use(review.text))
+        return er
 
 
-@dataclasses.dataclass
 class EvaluationResults:
     """
     Container for the results of a dataset evaluation.
     """
 
-    evaluated: int
-    """
-    The number of reviews that were evaluated.
-    """
+    def __init__(self):
+        self.confusion_matrix: dict[float, dict[float, int]] = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
+        """
+        Confusion matrix of the evaluation.
 
-    perfect: int
-    """
-    The number of reviews for which the model returned the correct rating.
-    """
+        First key is the expected rating, second key is the output label.
+        """
 
-    mse: float
-    """
-    Mean squared error
-    """
+        self.absolute_error_total: float = 0.0
+        """
+        Sum of the absolute errors committed in the evaluation.
+        """
 
-    def __repr__(self):
-        return f""
+        self.squared_error_total: float = 0.0
+        """
+        Sum of the squared errors committed in the evaluation.
+        """
 
-    def __str__(self):
-        return f"Evaluation results:\t{self.evaluated}\tevaluated\t{self.perfect}\tperfect\t{self.perfect / self.evaluated:.2%}\taccuracy\t{self.mse / self.evaluated:.2}\tmean squared error"
+    def __repr__(self) -> str:
+        return f""
+
+    def __str__(self) -> str:
+        text = [f"Evaluation results: {self.evaluated_count()} evaluated, {self.mean_absolute_error()} mean absolute error, {self.mean_squared_error()} mean squared error, "]
+        for key in self.keys():
+            text.append(f"{self.recall(key)} recall of {key}, ")
+            text.append(f"{self.precision(key)} precision of {key}, ")
+        text.append(f"{self.perfect_count()} perfect matches.")
+        return "".join(text)
+
+    def __add__(self, other: EvaluationResults) -> EvaluationResults:
+        new = self.__class__()
+        for expected, value in self.confusion_matrix.items():
+            for predicted, amount in value.items():
+                new.confusion_matrix[expected][predicted] += amount
+        for expected, value in other.confusion_matrix.items():
+            for predicted, amount in value.items():
+                new.confusion_matrix[expected][predicted] += amount
+        new.absolute_error_total = self.absolute_error_total + other.absolute_error_total
+        new.squared_error_total = self.squared_error_total + other.squared_error_total
+        return new
+
+    def keys(self) -> set[float]:
+        """
+        Return all processed categories.
+        """
+        keys: set[float] = set()
+
+        for expected, value in self.confusion_matrix.items():
+            keys.add(expected)
+            for predicted, _ in value.items():
+                keys.add(predicted)
+
+        return keys
+
+    def evaluated_count(self) -> int:
+        """
+        Return the total number of evaluated reviews.
+        """
+        total: int = 0
+        for row in self.confusion_matrix.values():
+            for el in row.values():
+                total += el
+        return total
+
+    def perfect_count(self) -> int:
+        """
+        Return the total number of reviews for which the predicted rating matched the expected one.
+        """
+        total: int = 0
+        for key in self.keys():
+            total += self.confusion_matrix[key][key]
+        return total
+
+    def recall_count(self, rating: float) -> int:
+        """
+        Return the number of evaluated reviews whose expected rating was the given one.
+        """
+        total: int = 0
+        for el in self.confusion_matrix[rating].values():
+            total += el
+        return total
+
+    def precision_count(self, rating: float) -> int:
+        """
+        Return the number of reviews for which the model returned the given rating.
+        """
+        total: int = 0
+        for col in self.confusion_matrix.values():
+            total += col[rating]
+        return total
+
+    def recall(self, rating: float) -> float:
+        """
+        Return the recall for a given rating.
+        """
+        try:
+            return self.confusion_matrix[rating][rating] / self.recall_count(rating)
+        except ZeroDivisionError:
+            return float("inf")
+
+    def precision(self, rating: float) -> float:
+        """
+        Return the precision for a given rating.
+        """
+        try:
+            return self.confusion_matrix[rating][rating] / self.precision_count(rating)
+        except ZeroDivisionError:
+            return float("inf")
+
+    def mean_absolute_error(self) -> float:
+        """
+        Return the mean absolute error.
+        """
+        return self.absolute_error_total / self.evaluated_count()
+
+    def mean_squared_error(self) -> float:
+        """
+        Return the mean squared error.
+        """
+        return self.squared_error_total / self.evaluated_count()
+
+    def add(self, expected: float, predicted: float) -> None:
+        """
+        Count a new prediction.
+        """
+        if expected == predicted:
+            log.log(11, "Expected %.1d*, predicted %.1d*", expected, predicted)  # Success
+        else:
+            log.log(12, "Expected %.1d*, predicted %.1d*", expected, predicted)  # Failure
+
+        self.confusion_matrix[expected][predicted] += 1
+        self.absolute_error_total += abs(expected - predicted)
+        self.squared_error_total += (expected - predicted) ** 2
 
 
 class AlreadyTrainedError(Exception):
diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py
index c7c0266..d283f8c 100644
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@@ -125,6 +125,35 @@ def TENSORFLOW_EPOCHS(val: str | None) -> int:
         raise cfig.InvalidValueError("Not an int.")
 
 
+@config.optional()
+def TARGET_RUNS(val: str | None) -> int:
+    """
+    The number of successful runs to perform on a sample-model-tokenizer combination.
+    Defaults to `1`.
+    """
+    if val is None:
+        return 1
+    try:
+        return int(val)
+    except ValueError:
+        raise cfig.InvalidValueError("Not an int.")
+
+
+@config.optional()
+def MAXIMUM_RUNS(val: str | None) -> int:
+    """
+    The maximum number of runs to perform on a sample-model-tokenizer combination before skipping it.
+    Defaults to `25`.
+    """
+    if val is None:
+        return 25
+    try:
+        return int(val)
+    except ValueError:
+        raise cfig.InvalidValueError("Not an int.")
+
+
+
 __all__ = (
     "config",
     "MONGO_HOST",
diff --git a/unimore_bda_6/log.py b/unimore_bda_6/log.py
index 4c6c391..aec6785 100644
--- a/unimore_bda_6/log.py
+++ b/unimore_bda_6/log.py
@@ -3,6 +3,9 @@ import logging
 import coloredlogs
 import pathlib
 
+logging.addLevelName(11, "SUCCESS")
+logging.addLevelName(12, "FAILURE")
+
 this_log = logging.getLogger(__name__)
 
 
@@ -34,9 +37,11 @@ def install_general_log_handlers():
         level_styles=dict(
             debug=dict(color="white"),
             info=dict(color="cyan"),
-            warning=dict(color="yellow"),
-            error=dict(color="red"),
+            warning=dict(color="yellow", bold=True),
+            error=dict(color="red", bold=True),
             critical=dict(color="black", background="red", bold=True),
+            success=dict(color="green"),
+            failure=dict(color="yellow"),
         ),
         field_styles=dict(
             asctime=dict(color='magenta'),