bda-6-steffo/unimore_bda_6/analysis/nltk_sentiment.py

import nltk
import nltk.classify
import nltk.sentiment
import nltk.sentiment.util
import logging
import typing as t
import itertools

from ..database import Text, Category, Review, CachedDatasetFunc
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
from ..log import count_passage
from ..tokenizer import BaseTokenizer

# Module-level logger, named after this module
log = logging.getLogger(__name__)

# A tokenized text: the list of string tokens returned by the tokenizer for a single text
TokenBag = list[str]
# The feature dictionary computed by the NLTK model for a single `TokenBag`, keyed by feature name
Features = dict[str, int]


class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
    """
    A sentiment analyzer structured like the one implemented in the classroom, built on the basic `nltk.sentiment.SentimentAnalyzer`.

    Texts are tokenized with the given tokenizer, converted to unigram features, and classified with a `nltk.classify.NaiveBayesClassifier`.

    An instance can be trained only once: calling `.train` again raises `AlreadyTrainedError`.
    """

    def __init__(self, *, tokenizer: BaseTokenizer) -> None:
        """
        Create a new, untrained analyzer.

        :param tokenizer: The tokenizer used to split texts into tokens. Its `supports_plain()` method must return a truthy value.
        :raises TypeError: If `tokenizer.supports_plain()` returns a falsy value.
        """
        # Reject incompatible tokenizers before anything else is initialized
        if not tokenizer.supports_plain():
            raise TypeError("Tokenizer does not support NLTK")

        super().__init__(tokenizer=tokenizer)

        # The NLTK model holding the feature extractors and, once trained, the classifier
        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
        # Whether `.train` has already completed successfully
        self.trained: bool = False
        # NOTE(review): the base class also receives the tokenizer; this assignment may be redundant — confirm against `BaseSentimentAnalyzer`
        self.tokenizer: BaseTokenizer = tokenizer

    def __repr__(self) -> str:
        return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"

    def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
        """
        Convert the `Text` of a `Review` to a `TokenBag`, keeping its `Category` unchanged.
        """
        # Report progress through the project's passage counter
        count_passage(log, "tokenize_datatuple", 100)
        return self.tokenizer.tokenize_and_split_plain(datatuple.text), datatuple.category

    def _add_feature_unigrams(self, dataset: t.Iterable[tuple[TokenBag, Category]]) -> None:
        """
        Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model.

        The given dataset is fully consumed to collect the words from which the unigrams are selected.
        """
        # Ignore the category and only access the tokens
        tokenbags = (entry[0] for entry in dataset)
        # Get all words in the documents
        all_words = self.model.all_words(tokenbags, labeled=False)
        # Create unigram `contains(*)` features from the previously gathered words, filtering out the rare ones
        unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
        # Add the feature extractor to the model
        self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)

    def _add_feature_extractors(self, dataset: t.Iterable[tuple[TokenBag, Category]]) -> None:
        """
        Register new feature extractors on the `.model`.

        Currently, only the unigrams feature extractor is registered.
        """
        # Add the unigrams feature
        self._add_feature_unigrams(dataset)

    def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
        """
        Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.

        Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
        """
        # Report progress through the project's passage counter
        count_passage(log, "extract_features", 100)
        tokens, category = data[0], data[1]
        return self.model.extract_features(tokens), category

    def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
        """
        Train the model with the reviews obtained from `training_dataset_func`.

        :param training_dataset_func: Called once, without arguments, to obtain the training reviews.
        :param validation_dataset_func: Not used by this analyzer.
        :raises AlreadyTrainedError: If the model has already been trained.
        """
        # Forbid retraining the model
        if self.trained:
            raise AlreadyTrainedError()

        # Lazily obtain and tokenize the training reviews
        reviews: t.Iterator[Review] = training_dataset_func()
        tokenized: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, reviews)

        # The tokenized dataset is needed twice: once to build the vocabulary, once to train the classifier.
        # `tee` allows tokenizing each review only once; since the first branch is exhausted before the second
        # one starts, the average memory footprint is reduced, but not the maximum one.
        vocabulary_pass, training_pass = itertools.tee(tokenized, 2)

        # Add the feature extractors to the model
        self._add_feature_extractors(vocabulary_pass)
        del vocabulary_pass  # Delete the exhausted iterator

        # Extract features from the dataset
        featuresets: t.Iterator[tuple[Features, Category]] = map(self.__extract_features, training_pass)

        # Train the classifier with the extracted features and category
        self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featuresets)

        # Toggle the trained flag only after the training has completed
        self.trained = True

    def use(self, text: Text) -> Category:
        """
        Classify the given text with the trained model.

        :param text: The text to classify.
        :return: The category selected by the model's classifier.
        :raises NotTrainedError: If the model has not been trained yet.
        """
        # Require the model to be trained
        if not self.trained:
            raise NotTrainedError()

        # Tokenize the input
        tokens = self.tokenizer.tokenize_and_split_plain(text)

        # Run the classification method
        return self.model.classify(instance=tokens)


# Names exported by a star-import of this module
__all__ = (
    "NLTKSentimentAnalyzer",
)
New version working nicely 2023-02-03 22:27:44 +00:00			`import nltk`
			`import nltk.classify`
			`import nltk.sentiment`
			`import nltk.sentiment.util`
			`import logging`
			`import typing as t`
			`import itertools`

enough 2023-02-08 18:46:05 +00:00			`from ..database import Text, Category, Review, CachedDatasetFunc`
stop here for now 2023-02-04 00:36:42 +00:00			`from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError`
New version working nicely 2023-02-03 22:27:44 +00:00			`from ..log import count_passage`
			`from ..tokenizer import BaseTokenizer`

			`log = logging.getLogger(__name__)`

			`TokenBag = list[str]`
			`Features = dict[str, int]`


			`class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):`
			`"""`
			`A sentiment analyzer resembling the one implemented in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.`
			`"""`

			`def __init__(self, *, tokenizer: BaseTokenizer) -> None:`
enough 2023-02-08 18:46:05 +00:00			`if not tokenizer.supports_plain():`
			`raise TypeError("Tokenizer does not support NLTK")`

			`super().__init__(tokenizer=tokenizer)`

New version working nicely 2023-02-03 22:27:44 +00:00			`self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()`
			`self.trained: bool = False`
Getting closer... 2023-02-04 05:14:24 +00:00			`self.tokenizer: BaseTokenizer = tokenizer`
New version working nicely 2023-02-03 22:27:44 +00:00
Getting closer... 2023-02-04 05:14:24 +00:00			`def __repr__(self):`
			`return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"`

Made good progress How does text vectorization in tensorflow work? 2023-02-05 16:40:22 +00:00			`def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:`
New version working nicely 2023-02-03 22:27:44 +00:00			`"""`
			Convert the `Text` of a `DataTuple` to a `TokenBag`.
			`"""`
			`count_passage(log, "tokenize_datatuple", 100)`
Fix some leftover bugs 2023-02-10 04:18:24 +00:00			`return self.tokenizer.tokenize_and_split_plain(datatuple.text), datatuple.category`
New version working nicely 2023-02-03 22:27:44 +00:00
			`def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:`
			`"""`
			Register the `nltk.sentiment.util.extract_unigram_feats` feature extrator on the model.
			`"""`
			`# Ignore the category and only access the tokens`
			`tokenbags = map(lambda d: d[0], dataset)`
			`# Get all words in the documents`
			`all_words = self.model.all_words(tokenbags, labeled=False)`
			# Create unigram `contains(*)` features from the previously gathered words
			`unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)`
			`# Add the feature extractor to the model`
			`self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)`

			`def _add_feature_extractors(self, dataset: t.Iterator[tuple[TokenBag, Category]]):`
			`"""`
			Register new feature extractors on the `.model`.
			`"""`
			`# Add the unigrams feature`
			`self._add_feature_unigrams(dataset)`

			`def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:`
			`"""`
			`Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.`

			Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
			`"""`
			`count_passage(log, "extract_features", 100)`
			`return self.model.extract_features(data[0]), data[1]`

enough 2023-02-08 18:46:05 +00:00			`def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:`
New version working nicely 2023-02-03 22:27:44 +00:00			`# Forbid retraining the model`
			`if self.trained:`
			`raise AlreadyTrainedError()`

Made good progress How does text vectorization in tensorflow work? 2023-02-05 16:40:22 +00:00			`# Get a generator`
enough 2023-02-08 18:46:05 +00:00			`dataset: t.Generator[Review] = training_dataset_func()`
Made good progress How does text vectorization in tensorflow work? 2023-02-05 16:40:22 +00:00
New version working nicely 2023-02-03 22:27:44 +00:00			`# Tokenize the dataset`
Made good progress How does text vectorization in tensorflow work? 2023-02-05 16:40:22 +00:00			`dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, dataset)`
New version working nicely 2023-02-03 22:27:44 +00:00
			`# Cleanly duplicate the dataset iterator`
			`# Reduce average memory footprint, but not maximum`
			`dataset_1, dataset_2 = itertools.tee(dataset, 2)`
			`dataset_1: t.Iterator[tuple[TokenBag, Category]]`
			`dataset_2: t.Iterator[tuple[TokenBag, Category]]`

			`# Add the feature extractors to the model`
			`self._add_feature_extractors(dataset_1)`
			`del dataset_1 # Delete exausted iterator`

			`# Extract features from the dataset`
			`dataset_2: t.Iterator[tuple[Features, Category]] = map(self.__extract_features, dataset_2)`

			`# Train the classifier with the extracted features and category`
			`self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)`

			`# Toggle the trained flag`
			`self.trained = True`

			`def use(self, text: Text) -> Category:`
			`# Require the model to be trained`
			`if not self.trained:`
			`raise NotTrainedError()`

			`# Tokenize the input`
fix and patch things 2023-02-11 03:32:17 +00:00			`tokens = self.tokenizer.tokenize_and_split_plain(text)`
New version working nicely 2023-02-03 22:27:44 +00:00
			`# Run the classification method`
			`return self.model.classify(instance=tokens)`


			`__all__ = (`
			`"NLTKSentimentAnalyzer",`
			`)`