diff --git a/unimore_bda_6/analysis/nltk_sentiment.py b/unimore_bda_6/analysis/nltk_sentiment.py
index 25fc310..862b72c 100644
--- a/unimore_bda_6/analysis/nltk_sentiment.py
+++ b/unimore_bda_6/analysis/nltk_sentiment.py
@@ -43,19 +43,19 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
         """
         Register new feature extractors on the `.model`.
         """
-        # Tokenize the reviews
+        # Tokenize the reviews and collect the iterator to avoid breaking NLTK
         dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
         # Add the unigrams feature
         self._add_feature_unigrams(dataset)
 
-    def __extract_features(self, review: TextReview) -> tuple[Features, float]:
+    def __extract_features(self, review: TextReview) -> tuple[Features, str]:
         """
         Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.
 
         Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
         """
         review: TokenizedReview = self.tokenizer.tokenize_review(review)
-        return self.model.extract_features(review.tokens), review.rating
+        return self.model.extract_features(review.tokens), str(review.rating)
 
     def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
         # Forbid retraining the model
@@ -66,7 +66,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
         self._add_feature_extractors(training_dataset_func())
 
         # Extract features from the dataset
-        featureset: t.Iterator[tuple[Features, float]] = map(self.__extract_features, training_dataset_func())
+        featureset: t.Iterator[tuple[Features, str]] = map(self.__extract_features, training_dataset_func())
 
         # Train the classifier with the extracted features and category
         self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
@@ -83,7 +83,12 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
         tokens = self.tokenizer.tokenize(text)
 
         # Run the classification method
-        return self.model.classify(instance=tokens)
+        rating = self.model.classify(instance=tokens)
+
+        # Convert the class back into a float
+        rating = float(rating)
+
+        return rating
 
 
 __all__ = (
diff --git a/unimore_bda_6/tokenizer/potts.py b/unimore_bda_6/tokenizer/potts.py
index 414832c..8da0304 100644
--- a/unimore_bda_6/tokenizer/potts.py
+++ b/unimore_bda_6/tokenizer/potts.py
@@ -118,6 +118,8 @@ class PottsTokenizer(BaseTokenizer):
         tokens = self.words_re.findall(text)
         # Possible alter the case, but avoid changing emoticons like :D into :d:
         tokens = map(self.lower_but_preserve_emoticons, tokens)
+        # Convert to a list (sigh) the iterator
+        tokens = list(tokens)
         # Return the result
         return tokens
 
@@ -129,13 +131,11 @@ class PottsTokenizerWithNegation(PottsTokenizer):
 
     def tokenize(self, text: str) -> t.Iterator[str]:
         # Apply the base tokenization
-        words = super().tokenize(text)
-        # Convert to a list (sigh) the iterator
-        words = list(words)
+        tokens = super().tokenize(text)
         # Use nltk to mark negation
-        nltk.sentiment.util.mark_negation(words, shallow=True)
+        nltk.sentiment.util.mark_negation(tokens, shallow=True)
         # Return the result
-        return words
+        return tokens
 
 
 __all__ = (
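
Note on the label round-trip introduced above: the patch stringifies the numeric rating before training and, since NLTK's NaiveBayesClassifier.classify() returns the training label verbatim, parses it back into a float after classification. A minimal sketch of that round-trip, assuming made-up feature names and rating values (only NaiveBayesClassifier.train/classify are the real NLTK API):

import nltk.classify

# Hypothetical featuresets; labels are stringified ratings, as in the patch above
training_data = [
    ({"contains(great)": True}, "5.0"),
    ({"contains(awful)": True}, "1.0"),
]
classifier = nltk.classify.NaiveBayesClassifier.train(training_data)

# classify() returns the label exactly as it was given during training,
# so it must be parsed back into a float before being used as a rating
label = classifier.classify({"contains(great)": True})
rating = float(label)  # 5.0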