bda-6-steffo/unimore_bda_6/analysis/base.py

from __future__ import annotations

import abc
import logging
import dataclasses

from ..database import CachedDatasetFunc
from ..tokenizer import BaseTokenizer

# Module-level logger, named after this module so its records can be filtered per module.
log = logging.getLogger(__name__)


class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
    """
    Abstract base class for sentiment analyzers implemented in this project.

    Subclasses must implement `.train` and `.use`; `.evaluate` is provided here and is built on top of `.use`.
    """

    def __init__(self, *, tokenizer: BaseTokenizer):
        """
        Store the tokenizer associated with this analyzer.

        :param tokenizer: The tokenizer instance, kept as `self.tokenizer`.
        """
        self.tokenizer: BaseTokenizer = tokenizer

    def __repr__(self):
        return f"<{self.__class__.__qualname__} with {self.tokenizer} tokenizer>"

    @abc.abstractmethod
    def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
        """
        Train the analyzer with the given training and validation datasets.

        :param training_dataset_func: Dataset function providing the training data.
        :param validation_dataset_func: Dataset function providing the validation data.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def use(self, text: str) -> float:
        """
        Run the model on the given input, and return the predicted rating.

        :param text: The text to run the model on.
        :return: The predicted rating.
        """
        raise NotImplementedError()

    def evaluate(self, evaluation_dataset_func: CachedDatasetFunc) -> EvaluationResults:
        """
        Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.

        Predictions that are NaN are reported with a warning; they are still counted as evaluated, but they can never be "perfect" and they are left out of the squared error sum, so that a single NaN does not turn the whole mean squared error into NaN.

        :param evaluation_dataset_func: Callable taking no arguments and returning an iterable of reviews; the `.text` and `.rating` attributes of each review are read.
        :return: The number of evaluated reviews, the number of exact matches, and the mean squared error. The mean squared error is `0.0` if the dataset yielded no reviews.
        """

        # Imported here so that the method does not depend on a new module-level import.
        import math

        # TODO: Add precision and recall measures

        evaluated: int = 0

        perfect: int = 0

        squared_error: float = 0.0

        for review in evaluation_dataset_func():
            resulting_category = self.use(review.text)
            evaluated += 1

            # Comparisons and arithmetic on NaN never raise, they just propagate NaN: it has to be detected explicitly.
            if math.isnan(resulting_category):
                log.warning("Model execution on %s resulted in a NaN value: %s", review, resulting_category)
                continue

            # Logged only after the NaN check, since `%d` cannot format a NaN value.
            log.debug("Evaluation step: %.1d* for %s", resulting_category, review)

            if resulting_category == review.rating:
                perfect += 1
            squared_error += (resulting_category - review.rating) ** 2

        # An empty dataset would otherwise cause a division by zero.
        mse = squared_error / evaluated if evaluated else 0.0

        return EvaluationResults(perfect=perfect, evaluated=evaluated, mse=mse)


@dataclasses.dataclass
class EvaluationResults:
    """
    Container for the results of a dataset evaluation.

    The string form is a single tab-separated line with the counters, the accuracy (`perfect / evaluated`) and the mean squared error.
    """

    evaluated: int
    """
    The number of reviews that were evaluated.
    """

    perfect: int
    """
    The number of reviews for which the model returned the correct rating.
    """

    mse: float
    """
    Mean squared error; this is already a mean, and is displayed as-is.
    """

    def __repr__(self):
        return f"<EvaluationResults: {self!s}>"

    def __str__(self):
        # With no evaluated reviews the accuracy is undefined; display 0 instead of raising ZeroDivisionError.
        accuracy = self.perfect / self.evaluated if self.evaluated else 0.0
        # `mse` is already averaged, so it must not be divided by `evaluated` again.
        # The float conversion keeps the `.2` format valid even if an int was stored in the field.
        mse = float(self.mse)
        return f"Evaluation results:\t{self.evaluated}\tevaluated\t{self.perfect}\tperfect\t{accuracy:.2%}\taccuracy\t{mse:.2}\tmean squared error"


class AlreadyTrainedError(Exception):
    """
    Error indicating that the model was already trained, so it cannot be trained a second time.
    """


class NotTrainedError(Exception):
    """
    Error indicating that the model has not gone through training yet.
    """


class TrainingFailedError(Exception):
    """
    Error indicating that the training of the model could not be completed; the model should not be used anymore.
    """


# Names exported by `from ... import *`.
# `EvaluationResults` is included because it is the return type of `BaseSentimentAnalyzer.evaluate`.
__all__ = (
    "BaseSentimentAnalyzer",
    "EvaluationResults",
    "AlreadyTrainedError",
    "NotTrainedError",
    "TrainingFailedError",
)
enough 2023-02-08 18:46:05 +00:00			`from __future__ import annotations`

Working prototype 2023-02-02 01:56:37 +00:00			`import abc`
New version working nicely 2023-02-03 22:27:44 +00:00			`import logging`
Getting closer... 2023-02-04 05:14:24 +00:00			`import dataclasses`
Refactor things to work better 2023-02-02 16:24:11 +00:00
Remove unused imports 2023-02-13 14:42:45 +00:00			`from ..database import CachedDatasetFunc`
enough 2023-02-08 18:46:05 +00:00			`from ..tokenizer import BaseTokenizer`
Refactor things to work better 2023-02-02 16:24:11 +00:00
New version working nicely 2023-02-03 22:27:44 +00:00			`log = logging.getLogger(__name__)`
Working prototype 2023-02-02 01:56:37 +00:00

New version working nicely 2023-02-03 22:27:44 +00:00			`class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
			`Abstract base class for sentiment analyzers implemented in this project.`
			`"""`

enough 2023-02-08 18:46:05 +00:00			`def __init__(self, *, tokenizer: BaseTokenizer):`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`self.tokenizer: BaseTokenizer = tokenizer`
enough 2023-02-08 18:46:05 +00:00
			`def __repr__(self):`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`return f"<{self.__class__.__qualname__} with {self.tokenizer} tokenizer>"`
enough 2023-02-08 18:46:05 +00:00
Working prototype 2023-02-02 01:56:37 +00:00			`@abc.abstractmethod`
enough 2023-02-08 18:46:05 +00:00			`def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
enough 2023-02-08 18:46:05 +00:00			`Train the analyzer with the given training and validation datasets.`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
			`raise NotImplementedError()`

enough 2023-02-08 18:46:05 +00:00			`@abc.abstractmethod`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`def use(self, text: str) -> float:`
enough 2023-02-08 18:46:05 +00:00			`"""`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`Run the model on the given input, and return the predicted rating.`
enough 2023-02-08 18:46:05 +00:00			`"""`
			`raise NotImplementedError()`

			`def evaluate(self, evaluation_dataset_func: CachedDatasetFunc) -> EvaluationResults:`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
New version working nicely 2023-02-03 22:27:44 +00:00			Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
			`"""`
Getting closer... 2023-02-04 05:14:24 +00:00
Add reminder to myself 2023-02-13 17:47:29 +00:00			`# TODO: Add precision and recall measures`

New version working nicely 2023-02-03 22:27:44 +00:00			`evaluated: int = 0`
CODE IS DONE 2023-02-12 04:11:58 +00:00
			`perfect: int = 0`

			`squared_error: float = 0.0`
Working prototype 2023-02-02 01:56:37 +00:00
enough 2023-02-08 18:46:05 +00:00			`for review in evaluation_dataset_func():`
Getting closer... 2023-02-04 05:14:24 +00:00			`resulting_category = self.use(review.text)`
Improve rendering of evaluation steps 2023-02-13 16:14:56 +00:00			`log.debug("Evaluation step: %.1d* for %s", resulting_category, review)`
New version working nicely 2023-02-03 22:27:44 +00:00			`evaluated += 1`
idk something more & more 2023-02-10 05:21:50 +00:00			`try:`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`perfect += 1 if resulting_category == review.rating else 0`
			`squared_error += (resulting_category - review.rating) ** 2`
idk something more & more 2023-02-10 05:21:50 +00:00			`except ValueError:`
			`log.warning("Model execution on %s resulted in a NaN value: %s", review, resulting_category)`
Working prototype 2023-02-02 01:56:37 +00:00
CODE IS DONE 2023-02-12 04:11:58 +00:00			`return EvaluationResults(perfect=perfect, evaluated=evaluated, mse=squared_error / evaluated)`
PEP8 2023-02-03 16:50:40 +00:00
enough 2023-02-08 18:46:05 +00:00
			`@dataclasses.dataclass`
			`class EvaluationResults:`
			`"""`
			`Container for the results of a dataset evaluation.`
			`"""`

			`evaluated: int`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`"""`
			`The number of reviews that were evaluated.`
			`"""`

			`perfect: int`
			`"""`
			`The number of reviews for which the model returned the correct rating.`
			`"""`

			`mse: float`
			`"""`
			`Mean squared error`
			`"""`
enough 2023-02-08 18:46:05 +00:00
			`def __repr__(self):`
			`return f"<EvaluationResults: {self!s}>"`

			`def __str__(self):`
Fix mse rendering 2023-02-13 14:42:35 +00:00			`return f"Evaluation results:\t{self.evaluated}\tevaluated\t{self.perfect}\tperfect\t{self.perfect / self.evaluated:.2%}\taccuracy\t{self.mse / self.evaluated:.2}\tmean squared error"`
Working prototype 2023-02-02 01:56:37 +00:00

stop here for now 2023-02-04 00:36:42 +00:00			`class AlreadyTrainedError(Exception):`
			`"""`
			`This model has already been trained and cannot be trained again.`
			`"""`


			`class NotTrainedError(Exception):`
			`"""`
			`This model has not been trained yet.`
			`"""`


stuff's working 2023-02-08 09:54:14 +00:00			`class TrainingFailedError(Exception):`
			`"""`
			`The model wasn't able to complete the training and should not be used anymore.`
			`"""`


Working prototype 2023-02-02 01:56:37 +00:00			`__all__ = (`
New version working nicely 2023-02-03 22:27:44 +00:00			`"BaseSentimentAnalyzer",`
stop here for now 2023-02-04 00:36:42 +00:00			`"AlreadyTrainedError",`
			`"NotTrainedError",`
stuff's working 2023-02-08 09:54:14 +00:00			`"TrainingFailedError",`
Working prototype 2023-02-02 01:56:37 +00:00			`)`