bda-6-steffo/unimore_bda_6/analysis/vanilla.py

import nltk
import nltk.classify
import nltk.sentiment
import nltk.sentiment.util
import logging
import typing as t
import itertools

from .base import Input, Category, BaseSA, AlreadyTrainedError, NotTrainedError
from ..log import count_passage

# A tokenized text: the list of string tokens returned by a tokenizer.
TokenBag = list[str]
# Type of the value an extractor yields next to the text; it is later fed to the categorizer.
IntermediateValue = t.TypeVar("IntermediateValue")
# Feature mapping obtained from the model's feature extraction for a single token bag.
Features = dict[str, int]


# Module-level logger named after this module.
log = logging.getLogger(__name__)


class VanillaSA(BaseSA):
    """
    A sentiment analyzer resembling in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.

    The analyzer is assembled from three callables:

    - ``extractor`` turns an input into a ``(text, intermediate value)`` tuple;
    - ``tokenizer`` turns the text into a token bag;
    - ``categorizer`` turns the intermediate value into a category.
    """

    def __init__(self, *, extractor: t.Callable[[Input], tuple[str, IntermediateValue]], tokenizer: t.Callable[[str], TokenBag], categorizer: t.Callable[[IntermediateValue], Category]) -> None:
        """
        Create a new, untrained analyzer using the given callables.
        """
        super().__init__()
        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
        # Flipped to True once `_train_from_dataset` has completed successfully.
        self.trained: bool = False
        self.extractor: t.Callable[[Input], tuple[str, IntermediateValue]] = extractor
        self.tokenizer: t.Callable[[str], TokenBag] = tokenizer
        self.categorizer: t.Callable[[IntermediateValue], Category] = categorizer

    def __repr__(self) -> str:
        # Each callable is reported under its own name; previously the extractor was shown
        # with the "tokenizer" label and the actual tokenizer was missing.
        state = "trained" if self.trained else "untrained"
        return (
            f"<{self.__class__.__qualname__} {state}"
            f" extractor={self.extractor!r}"
            f" tokenizer={self.tokenizer!r}"
            f" categorizer={self.categorizer!r}>"
        )

    @staticmethod
    def __data_to_tokenbag(data: tuple[TokenBag, Category]) -> TokenBag:
        """
        Access the tokenbag of a data tuple.
        """
        return data[0]

    def __add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
        """
        Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model.

        The given dataset is fully consumed to collect the words from which the unigrams are selected.
        """
        tokenbags = map(self.__data_to_tokenbag, dataset)
        # The labels have already been stripped by the map above, hence `labeled=False`.
        all_words = self.model.all_words(tokenbags, labeled=False)
        unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
        self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)

    def _add_features(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
        """
        Register new feature extractors on the `.model`.
        """
        self.__add_feature_unigrams(dataset)

    def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
        """
        Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.

        Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
        """
        count_passage("processed_features", 100)
        return self.model.extract_features(data[0]), data[1]

    def _train_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
        """
        Train the model with the given training set.

        :raises AlreadyTrainedError: If the model has already been trained.
        """
        if self.trained:
            raise AlreadyTrainedError()

        # The dataset has to be walked twice: once to pick the features, once to train.
        # `itertools.tee` buffers the items read by the first branch until the second one
        # reads them, so the whole dataset is held in memory in the meantime.
        dataset_1, dataset_2 = itertools.tee(dataset, 2)

        self._add_features(dataset_1)
        del dataset_1

        dataset_2 = map(self.__extract_features, dataset_2)
        self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
        self.trained = True

    def _evaluate_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> dict:
        """
        Perform a model evaluation with the given test set.

        :raises NotTrainedError: If the model has not been trained yet.
        """
        if not self.trained:
            raise NotTrainedError()

        dataset_1 = map(self.__extract_features, dataset)
        # FIXME: This won't work with streams :(
        # The features are materialized into a list before being handed to the model.
        return self.model.evaluate(list(dataset_1))

    def _use_from_tokenbag(self, tokens: TokenBag) -> Category:
        """
        Categorize the given token bag.

        :raises NotTrainedError: If the model has not been trained yet.
        """
        if not self.trained:
            raise NotTrainedError()

        return self.model.classify(instance=tokens)

    def _extract_data(self, inp: Input) -> tuple[TokenBag, Category]:
        """
        Convert a single input into a (TokenBag, Category) tuple using the extractor, the tokenizer and the categorizer.
        """
        count_passage("processed_data", 100)
        text, value = self.extractor(inp)
        return self.tokenizer(text), self.categorizer(value)

    def _extract_dataset(self, inp: t.Iterator[Input]) -> t.Iterator[tuple[TokenBag, Category]]:
        """
        Lazily convert a stream of inputs into a stream of (TokenBag, Category) tuples.
        """
        return map(self._extract_data, inp)

    def train(self, training_set: t.Iterator[Input]) -> None:
        """
        Train the analyzer with the given inputs.

        :raises AlreadyTrainedError: If the model has already been trained.
        """
        dataset = self._extract_dataset(training_set)
        self._train_from_dataset(dataset)

    def evaluate(self, test_set: t.Iterator[Input]) -> dict:
        """
        Evaluate the analyzer against the given inputs, returning the result of the model evaluation.

        :raises NotTrainedError: If the model has not been trained yet.
        """
        dataset = self._extract_dataset(test_set)
        return self._evaluate_from_dataset(dataset)

    def use(self, text: Input) -> Category:
        """
        Tokenize the given text and categorize it.

        :raises NotTrainedError: If the model has not been trained yet.
        """
        # NOTE(review): the argument goes straight to the tokenizer, skipping the extractor,
        # although it is annotated as `Input` - confirm what callers actually pass here.
        tokens = self.tokenizer(text)
        return self._use_from_tokenbag(tokens)

# Names exported by a star-import of this module.
__all__ = (
    "VanillaSA",
)
Make some progress 2023-02-01 16:46:25 +00:00			`import nltk`
			`import nltk.classify`
			`import nltk.sentiment`
			`import nltk.sentiment.util`
			`import logging`
Working prototype 2023-02-02 01:56:37 +00:00			`import typing as t`
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`import itertools`
Make some progress 2023-02-01 16:46:25 +00:00
Refactor things to work better 2023-02-02 16:24:11 +00:00			`from .base import Input, Category, BaseSA, AlreadyTrainedError, NotTrainedError`
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`from ..log import count_passage`
Refactor things to work better 2023-02-02 16:24:11 +00:00
			`TokenBag = list[str]`
			`IntermediateValue = t.TypeVar("IntermediateValue")`
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`Features = dict[str, int]`
Make some progress 2023-02-01 16:46:25 +00:00

			`log = logging.getLogger(__name__)`


Refactor things to work better 2023-02-02 16:24:11 +00:00			`class VanillaSA(BaseSA):`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
			`A sentiment analyzer resembling the one implemented in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.`
			`"""`

Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`def __init__(self, *, extractor: t.Callable[[Input], tuple[str, IntermediateValue]], tokenizer: t.Callable[[str], TokenBag], categorizer: t.Callable[[IntermediateValue], Category]) -> None:`
Working prototype 2023-02-02 01:56:37 +00:00			`super().__init__()`
			`self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`self.trained: bool = False`
			`self.extractor: t.Callable[[Input], tuple[str, IntermediateValue]] = extractor`
			`self.tokenizer: t.Callable[[str], TokenBag] = tokenizer`
			`self.categorizer: t.Callable[[IntermediateValue], Category] = categorizer`
Working prototype 2023-02-02 01:56:37 +00:00
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`def __repr__(self):`
			`return f"<{self.__class__.__qualname__} {'trained' if self.trained else 'untrained'} tokenizer={self.extractor!r} categorizer={self.categorizer!r}>"`

			`@staticmethod`
			`def __data_to_tokenbag(data: tuple[TokenBag, Category]) -> TokenBag:`
			`"""`
			`Access the tokenbag of a data tuple.`
			`"""`
			`return data[0]`

			`def __add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			Register the `nltk.sentiment.util.extract_unigram_feats` feature extrator on the model.
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`tokenbags = map(self.__data_to_tokenbag, dataset)`
			`all_words = self.model.all_words(tokenbags, labeled=False)`
Working prototype 2023-02-02 01:56:37 +00:00			`unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)`
			`self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)`

Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`def _add_features(self, dataset: t.Iterator[tuple[TokenBag, Category]]):`
			`"""`
			Register new feature extractors on the `.model`.
			`"""`
			`self.__add_feature_unigrams(dataset)`

			`def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.`

			Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
it works, but at what cost 2023-02-03 01:49:14 +00:00			`count_passage("processed_features", 100)`
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`return self.model.extract_features(data[0]), data[1]`
Working prototype 2023-02-02 01:56:37 +00:00
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`def _train_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`Train the model with the given training set.`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
			`if self.trained:`
			`raise AlreadyTrainedError()`

Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`dataset_1, dataset_2 = itertools.tee(dataset, 2)`

			`self._add_features(dataset_1)`
			`del dataset_1`
Working prototype 2023-02-02 01:56:37 +00:00
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`dataset_2 = map(self.__extract_features, dataset_2)`
			`self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)`
Working prototype 2023-02-02 01:56:37 +00:00			`self.trained = True`

Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`def _evaluate_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> dict:`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`"""`
			`Perform a model evaluation with the given test set.`
			`"""`
Working prototype 2023-02-02 01:56:37 +00:00			`if not self.trained:`
			`raise NotTrainedError()`

Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`dataset_1 = map(self.__extract_features, dataset)`
it works, but at what cost 2023-02-03 01:49:14 +00:00			`# FIXME: This won't work with streams :(`
			`return self.model.evaluate(list(dataset_1))`
Working prototype 2023-02-02 01:56:37 +00:00
Refactor things to work better 2023-02-02 16:24:11 +00:00			`def _use_from_tokenbag(self, tokens: TokenBag) -> Category:`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`Categorize the given token bag.`
Working prototype 2023-02-02 01:56:37 +00:00			`"""`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`if not self.trained:`
			`raise NotTrainedError()`
Working prototype 2023-02-02 01:56:37 +00:00
Refactor things to work better 2023-02-02 16:24:11 +00:00			`return self.model.classify(instance=tokens)`
Make some progress 2023-02-01 16:46:25 +00:00
Refactor things to work better 2023-02-02 16:24:11 +00:00			`def _extract_data(self, inp: Input) -> tuple[TokenBag, Category]:`
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`count_passage("processed_data", 100)`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`text, value = self.extractor(inp)`
			`return self.tokenizer(text), self.categorizer(value)`
Make some progress 2023-02-01 16:46:25 +00:00
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`def _extract_dataset(self, inp: t.Iterator[Input]) -> list[tuple[TokenBag, Category]]:`
			`return map(self._extract_data, inp)`
Use composition instead of inheritance 2023-02-02 15:03:07 +00:00
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`def train(self, training_set: t.Iterator[Input]) -> None:`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`dataset = self._extract_dataset(training_set)`
			`self._train_from_dataset(dataset)`
Use composition instead of inheritance 2023-02-02 15:03:07 +00:00
Fix VanillaSA to work with iterators 2023-02-03 01:10:00 +00:00			`def evaluate(self, test_set: t.Iterator[Input]) -> dict:`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`dataset = self._extract_dataset(test_set)`
			`return self._evaluate_from_dataset(dataset)`
Use composition instead of inheritance 2023-02-02 15:03:07 +00:00
Refactor things to work better 2023-02-02 16:24:11 +00:00			`def use(self, text: Input) -> Category:`
			`tokens = self.tokenizer(text)`
			`return self._use_from_tokenbag(tokens)`
Make some more progress for the night Many things still do not work properly 2023-02-02 04:01:31 +00:00

Make some progress 2023-02-01 16:46:25 +00:00			`__all__ = (`
Working prototype 2023-02-02 01:56:37 +00:00			`"VanillaSA",`
Make some progress 2023-02-01 16:46:25 +00:00			`)`