1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-23 00:14:19 +00:00
bda-6-steffo/unimore_bda_6/analysis/nltk_sentiment.py

119 lines
4.3 KiB
Python
Raw Normal View History

2023-02-03 22:27:44 +00:00
import nltk
import nltk.classify
import nltk.sentiment
import nltk.sentiment.util
import logging
import typing as t
import itertools
2023-02-08 18:46:05 +00:00
from ..database import Text, Category, Review, CachedDatasetFunc
2023-02-04 00:36:42 +00:00
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
2023-02-03 22:27:44 +00:00
from ..log import count_passage
from ..tokenizer import BaseTokenizer
# Bag of tokens obtained by tokenizing a single text.
TokenBag = list[str]
# Mapping from feature name to feature value for a single text.
Features = dict[str, int]

# Logger scoped to this module.
log = logging.getLogger(__name__)
class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
    """
    A sentiment analyzer resembling in structure the one implemented in the classroom, using the basic sentiment
    analyzer of NLTK.

    The analyzer wraps a `nltk.sentiment.SentimentAnalyzer`, registers unigram features computed from the training
    dataset, and trains a `nltk.classify.NaiveBayesClassifier` on them. It can be trained only once.
    """

    def __init__(self, *, tokenizer: BaseTokenizer) -> None:
        """
        Create a new, untrained analyzer.

        :param tokenizer: The tokenizer used to convert texts into tokens; `tokenizer.supports_plain()` must be truthy.
        :raises TypeError: If the tokenizer does not support plain tokenization.
        """
        if not tokenizer.supports_plain():
            raise TypeError("Tokenizer does not support NLTK")

        super().__init__(tokenizer=tokenizer)

        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
        self.trained: bool = False
        self.tokenizer: BaseTokenizer = tokenizer

    def __repr__(self) -> str:
        return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"

    def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
        """
        Convert a `Review` to a (`TokenBag`, `Category`) tuple by tokenizing its text and keeping its category.
        """
        count_passage(log, "tokenize_datatuple", 100)
        return self.tokenizer.tokenize_plain(datatuple.text), datatuple.category

    def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
        """
        Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model.

        The given iterator is fully consumed to gather the words from which the unigrams are selected.
        """
        # Ignore the category and only access the tokens
        tokenbags = (item[0] for item in dataset)
        # Get all words in the documents; `labeled` is passed explicitly because the input is a one-shot iterator
        all_words = self.model.all_words(tokenbags, labeled=False)
        # Create unigram `contains(*)` features from the previously gathered words
        unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
        # Add the feature extractor to the model
        self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)

    def _add_feature_extractors(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
        """
        Register new feature extractors on the `.model`.

        Currently only the unigrams feature is registered; the given iterator is consumed in the process.
        """
        # Add the unigrams feature
        self._add_feature_unigrams(dataset)

    def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
        """
        Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.

        Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
        """
        count_passage(log, "extract_features", 100)
        return self.model.extract_features(data[0]), data[1]

    def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
        """
        Train the analyzer on the reviews produced by calling `training_dataset_func`.

        :param training_dataset_func: Callable returning an iterator of `Review` objects to train on.
        :param validation_dataset_func: Not used by this implementation.
        :raises AlreadyTrainedError: If the analyzer has already been trained.
        """
        # Forbid retraining the model
        if self.trained:
            raise AlreadyTrainedError()

        # Get an iterator over the training reviews
        reviews: t.Iterator[Review] = training_dataset_func()

        # Lazily tokenize the dataset
        tokenized: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, reviews)

        # Cleanly duplicate the tokenized iterator, so that every review is tokenized only once.
        # The first copy is exhausted before the second one is started, so `tee` has to buffer all tokenized items:
        # this reduces the average memory footprint, but not the maximum one.
        vocabulary_pass, training_pass = itertools.tee(tokenized, 2)

        # Add the feature extractors to the model
        self._add_feature_extractors(vocabulary_pass)
        del vocabulary_pass  # Delete exhausted iterator

        # Lazily extract features from the dataset
        featuresets: t.Iterator[tuple[Features, Category]] = map(self.__extract_features, training_pass)

        # Train the classifier with the extracted features and category
        self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featuresets)

        # Toggle the trained flag
        self.trained = True

    def use(self, text: Text) -> Category:
        """
        Classify the given text with the trained model.

        :param text: The text to classify.
        :return: The category predicted by the model.
        :raises NotTrainedError: If the analyzer has not been trained yet.
        """
        # Require the model to be trained
        if not self.trained:
            raise NotTrainedError()

        # Tokenize the input
        tokens = self.tokenizer.tokenize_plain(text)

        # Run the classification method
        return self.model.classify(instance=tokens)
# Names exported by `from ... import *`.
__all__ = ("NLTKSentimentAnalyzer",)