Fix PottsTokenizer failing to train NLTK model

2025-03-29 03:40:35 +00:00 · 2023-02-13 17:45:28 +01:00 · 2023-02-13 17:45:28 +01:00 · c82352c7dc
commit c82352c7dc
parent 597020ac2d
2 changed files with 15 additions and 10 deletions
--- a/unimore_bda_6/analysis/nltk_sentiment.py
+++ b/unimore_bda_6/analysis/nltk_sentiment.py
@@ -43,19 +43,19 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
        """
        Register new feature extractors on the `.model`.
        """
-        # Tokenize the reviews
+        # Tokenize the reviews lazily (`map` returns an iterator; the token lists are built by the tokenizer)
        dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
        # Add the unigrams feature
        self._add_feature_unigrams(dataset)

-    def __extract_features(self, review: TextReview) -> tuple[Features, float]:
+    def __extract_features(self, review: TextReview) -> tuple[Features, str]:
        """
        Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.

        Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
        """
        review: TokenizedReview = self.tokenizer.tokenize_review(review)
-        return self.model.extract_features(review.tokens), review.rating
+        return self.model.extract_features(review.tokens), str(review.rating)

    def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
        # Forbid retraining the model
@@ -66,7 +66,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
        self._add_feature_extractors(training_dataset_func())

        # Extract features from the dataset
-        featureset: t.Iterator[tuple[Features, float]] = map(self.__extract_features, training_dataset_func())
+        featureset: t.Iterator[tuple[Features, str]] = map(self.__extract_features, training_dataset_func())

        # Train the classifier with the extracted features and category
        self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
@@ -83,7 +83,12 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
        tokens = self.tokenizer.tokenize(text)

        # Run the classification method
-        return self.model.classify(instance=tokens)
+        rating = self.model.classify(instance=tokens)
+
+        # Convert the class back into a float
+        rating = float(rating)
+
+        return rating


 __all__ = (
--- a/unimore_bda_6/tokenizer/potts.py
+++ b/unimore_bda_6/tokenizer/potts.py
@@ -118,6 +118,8 @@ class PottsTokenizer(BaseTokenizer):
        tokens = self.words_re.findall(text)
        # Possible alter the case, but avoid changing emoticons like :D into :d:
        tokens = map(self.lower_but_preserve_emoticons, tokens)
+        # Convert the iterator to a list (sigh)
+        tokens = list(tokens)
        # Return the result
        return tokens

@@ -129,13 +131,11 @@ class PottsTokenizerWithNegation(PottsTokenizer):

    def tokenize(self, text: str) -> t.Iterator[str]:
        # Apply the base tokenization
-        words = super().tokenize(text)
-        # Convert to a list (sigh) the iterator
-        words = list(words)
+        tokens = super().tokenize(text)
        # Use nltk to mark negation
-        nltk.sentiment.util.mark_negation(words, shallow=True)
+        nltk.sentiment.util.mark_negation(tokens, shallow=True)
        # Return the result
-        return words
+        return tokens


 __all__ = (