1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-21 23:44:19 +00:00

Fix PottsTokenizer failing to train NLTK model

This commit is contained in:
Steffo 2023-02-13 17:45:28 +01:00
parent 597020ac2d
commit c82352c7dc
Signed by: steffo
GPG key ID: 2A24051445686895
2 changed files with 15 additions and 10 deletions

View file

@ -43,19 +43,19 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
"""
Register new feature extractors on the `.model`.
"""
# Tokenize the reviews
# Tokenize the reviews and collect the iterator to avoid breaking NLTK
dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
# Add the unigrams feature
self._add_feature_unigrams(dataset)
def __extract_features(self, review: TextReview) -> tuple[Features, float]:
def __extract_features(self, review: TextReview) -> tuple[Features, str]:
"""
Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.
Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
"""
review: TokenizedReview = self.tokenizer.tokenize_review(review)
return self.model.extract_features(review.tokens), review.rating
return self.model.extract_features(review.tokens), str(review.rating)
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
# Forbid retraining the model
@ -66,7 +66,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
self._add_feature_extractors(training_dataset_func())
# Extract features from the dataset
featureset: t.Iterator[tuple[Features, float]] = map(self.__extract_features, training_dataset_func())
featureset: t.Iterator[tuple[Features, str]] = map(self.__extract_features, training_dataset_func())
# Train the classifier with the extracted features and category
self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
@ -83,7 +83,12 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
tokens = self.tokenizer.tokenize(text)
# Run the classification method
return self.model.classify(instance=tokens)
rating = self.model.classify(instance=tokens)
# Convert the class back into a float
rating = float(rating)
return rating
__all__ = (

View file

@ -118,6 +118,8 @@ class PottsTokenizer(BaseTokenizer):
tokens = self.words_re.findall(text)
# Possibly alter the case, but avoid changing emoticons like :D into :d:
tokens = map(self.lower_but_preserve_emoticons, tokens)
# Convert the iterator to a list (sigh)
tokens = list(tokens)
# Return the result
return tokens
@ -129,13 +131,11 @@ class PottsTokenizerWithNegation(PottsTokenizer):
def tokenize(self, text: str) -> t.Iterator[str]:
# Apply the base tokenization
words = super().tokenize(text)
# Convert the iterator to a list (sigh)
words = list(words)
tokens = super().tokenize(text)
# Use nltk to mark negation
nltk.sentiment.util.mark_negation(words, shallow=True)
nltk.sentiment.util.mark_negation(tokens, shallow=True)
# Return the result
return words
return tokens
__all__ = (