1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-21 15:34:18 +00:00

Fix PottsTokenizer failing to train NLTK model

This commit is contained in:
Steffo 2023-02-13 17:45:28 +01:00
parent 597020ac2d
commit c82352c7dc
Signed by: steffo
GPG key ID: 2A24051445686895
2 changed files with 15 additions and 10 deletions

View file

@@ -43,19 +43,19 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
""" """
Register new feature extractors on the `.model`. Register new feature extractors on the `.model`.
""" """
# Tokenize the reviews # Tokenize the reviews and collect the iterator to avoid breaking NLTK
dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset) dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
# Add the unigrams feature # Add the unigrams feature
self._add_feature_unigrams(dataset) self._add_feature_unigrams(dataset)
def __extract_features(self, review: TextReview) -> tuple[Features, float]: def __extract_features(self, review: TextReview) -> tuple[Features, str]:
""" """
Convert a (TokenBag, Category) tuple to a (Features, Category) tuple. Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.
Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators. Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
""" """
review: TokenizedReview = self.tokenizer.tokenize_review(review) review: TokenizedReview = self.tokenizer.tokenize_review(review)
return self.model.extract_features(review.tokens), review.rating return self.model.extract_features(review.tokens), str(review.rating)
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None: def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
# Forbid retraining the model # Forbid retraining the model
@@ -66,7 +66,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
self._add_feature_extractors(training_dataset_func()) self._add_feature_extractors(training_dataset_func())
# Extract features from the dataset # Extract features from the dataset
featureset: t.Iterator[tuple[Features, float]] = map(self.__extract_features, training_dataset_func()) featureset: t.Iterator[tuple[Features, str]] = map(self.__extract_features, training_dataset_func())
# Train the classifier with the extracted features and category # Train the classifier with the extracted features and category
self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset) self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
@@ -83,7 +83,12 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
tokens = self.tokenizer.tokenize(text) tokens = self.tokenizer.tokenize(text)
# Run the classification method # Run the classification method
return self.model.classify(instance=tokens) rating = self.model.classify(instance=tokens)
# Convert the class back into a float
rating = float(rating)
return rating
__all__ = ( __all__ = (

View file

@@ -118,6 +118,8 @@ class PottsTokenizer(BaseTokenizer):
tokens = self.words_re.findall(text) tokens = self.words_re.findall(text)
# Possible alter the case, but avoid changing emoticons like :D into :d: # Possible alter the case, but avoid changing emoticons like :D into :d:
tokens = map(self.lower_but_preserve_emoticons, tokens) tokens = map(self.lower_but_preserve_emoticons, tokens)
# Convert to a list (sigh) the iterator
tokens = list(tokens)
# Return the result # Return the result
return tokens return tokens
@@ -129,13 +131,11 @@ class PottsTokenizerWithNegation(PottsTokenizer):
def tokenize(self, text: str) -> t.Iterator[str]: def tokenize(self, text: str) -> t.Iterator[str]:
# Apply the base tokenization # Apply the base tokenization
words = super().tokenize(text) tokens = super().tokenize(text)
# Convert to a list (sigh) the iterator
words = list(words)
# Use nltk to mark negation # Use nltk to mark negation
nltk.sentiment.util.mark_negation(words, shallow=True) nltk.sentiment.util.mark_negation(tokens, shallow=True)
# Return the result # Return the result
return words return tokens
__all__ = ( __all__ = (