2023-02-01 16:46:25 +00:00
|
|
|
import nltk
|
|
|
|
import nltk.classify
|
|
|
|
import nltk.sentiment
|
|
|
|
import nltk.sentiment.util
|
|
|
|
import logging
|
2023-02-02 01:56:37 +00:00
|
|
|
import typing as t
|
2023-02-03 01:10:00 +00:00
|
|
|
import itertools
|
2023-02-01 16:46:25 +00:00
|
|
|
|
2023-02-02 16:24:11 +00:00
|
|
|
from .base import Input, Category, BaseSA, AlreadyTrainedError, NotTrainedError
|
2023-02-03 01:10:00 +00:00
|
|
|
from ..log import count_passage
|
2023-02-02 16:24:11 +00:00
|
|
|
|
|
|
|
# A tokenized text: the list of string tokens that a tokenizer callable produces from a `str`.
TokenBag = list[str]
|
|
|
|
# Type variable for the value that the extractor returns alongside the text, and that the categorizer converts into a `Category`.
IntermediateValue = t.TypeVar("IntermediateValue")
|
2023-02-03 01:10:00 +00:00
|
|
|
# Mapping of feature names to feature values, as obtained by running the model's feature extractors on a token bag.
Features = dict[str, int]
|
2023-02-01 16:46:25 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
2023-02-02 16:24:11 +00:00
|
|
|
class VanillaSA(BaseSA):
    """
    A sentiment analyzer resembling in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
    """

    def __init__(self, *, extractor: t.Callable[[Input], tuple[str, IntermediateValue]], tokenizer: t.Callable[[str], TokenBag], categorizer: t.Callable[[IntermediateValue], Category]) -> None:
        """
        Create a new, untrained analyzer.

        :param extractor: Callable splitting an input into a `(text, value)` tuple.
        :param tokenizer: Callable converting the extracted text into a token bag.
        :param categorizer: Callable converting the extracted value into a category.
        """
        super().__init__()
        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
        self.trained: bool = False
        self.extractor: t.Callable[[Input], tuple[str, IntermediateValue]] = extractor
        self.tokenizer: t.Callable[[str], TokenBag] = tokenizer
        self.categorizer: t.Callable[[IntermediateValue], Category] = categorizer

    def __repr__(self) -> str:
        state = "trained" if self.trained else "untrained"
        # Every callable is shown under its own label; previously the extractor was displayed as `tokenizer=`.
        return f"<{self.__class__.__qualname__} {state} extractor={self.extractor!r} tokenizer={self.tokenizer!r} categorizer={self.categorizer!r}>"

    @staticmethod
    def __data_to_tokenbag(data: tuple[TokenBag, Category]) -> TokenBag:
        """
        Access the tokenbag of a data tuple.
        """
        return data[0]

    def __add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
        """
        Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model.

        The given iterator is consumed by this call.
        """
        tokenbags = map(self.__data_to_tokenbag, dataset)
        # `labeled=False` is passed explicitly, since the documents are already stripped of their category.
        all_words = self.model.all_words(tokenbags, labeled=False)
        unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
        self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)

    def _add_features(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
        """
        Register new feature extractors on the `.model`.
        """
        self.__add_feature_unigrams(dataset)

    def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
        """
        Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.

        Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
        """
        count_passage("processed_features", 100)
        return self.model.extract_features(data[0]), data[1]

    def _train_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
        """
        Train the model with the given training set.

        :raises AlreadyTrainedError: If the model has already been trained.
        """
        if self.trained:
            raise AlreadyTrainedError()

        # The dataset has to be traversed twice: once to register the features, once to train the classifier.
        # Since the first copy is exhausted before the second is read, `tee` buffers all the items in the meantime.
        dataset_1, dataset_2 = itertools.tee(dataset, 2)

        self._add_features(dataset_1)
        del dataset_1

        dataset_2 = map(self.__extract_features, dataset_2)
        self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
        self.trained = True

    def _evaluate_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> dict:
        """
        Perform a model evaluation with the given test set.

        :raises NotTrainedError: If the model has not been trained yet.
        """
        if not self.trained:
            raise NotTrainedError()

        dataset_1 = map(self.__extract_features, dataset)
        # FIXME: This won't work with streams :(
        # The whole test set is materialized in a list before being passed to the model.
        return self.model.evaluate(list(dataset_1))

    def _use_from_tokenbag(self, tokens: TokenBag) -> Category:
        """
        Categorize the given token bag.

        :raises NotTrainedError: If the model has not been trained yet.
        """
        if not self.trained:
            raise NotTrainedError()

        return self.model.classify(instance=tokens)

    def _extract_data(self, inp: Input) -> tuple[TokenBag, Category]:
        """
        Convert a single input into a (TokenBag, Category) tuple, using the extractor, the tokenizer and the categorizer.
        """
        count_passage("processed_data", 100)
        text, value = self.extractor(inp)
        return self.tokenizer(text), self.categorizer(value)

    def _extract_dataset(self, inp: t.Iterator[Input]) -> t.Iterator[tuple[TokenBag, Category]]:
        """
        Lazily convert an iterator of inputs into an iterator of (TokenBag, Category) tuples.
        """
        return map(self._extract_data, inp)

    def train(self, training_set: t.Iterator[Input]) -> None:
        """
        Train the analyzer with the given inputs.

        :raises AlreadyTrainedError: If the model has already been trained.
        """
        dataset = self._extract_dataset(training_set)
        self._train_from_dataset(dataset)

    def evaluate(self, test_set: t.Iterator[Input]) -> dict:
        """
        Evaluate the analyzer with the given inputs, returning the results produced by the model.

        :raises NotTrainedError: If the model has not been trained yet.
        """
        dataset = self._extract_dataset(test_set)
        return self._evaluate_from_dataset(dataset)

    def use(self, text: Input) -> Category:
        """
        Tokenize the given text and categorize it with the trained model.

        :raises NotTrainedError: If the model has not been trained yet.
        """
        # NOTE(review): the tokenizer is applied directly and the extractor is skipped here; confirm that callers pass plain text.
        tokens = self.tokenizer(text)
        return self._use_from_tokenbag(tokens)
|
2023-02-02 04:01:31 +00:00
|
|
|
|
|
|
|
|
2023-02-01 16:46:25 +00:00
|
|
|
# Names exported by `from <module> import *`.
__all__ = ("VanillaSA",)
|