mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-21 15:34:18 +00:00
Fix PottsTokenizer failing to train NLTK model
This commit is contained in:
parent
597020ac2d
commit
c82352c7dc
2 changed files with 15 additions and 10 deletions
|
@ -43,19 +43,19 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
"""
|
"""
|
||||||
Register new feature extractors on the `.model`.
|
Register new feature extractors on the `.model`.
|
||||||
"""
|
"""
|
||||||
# Tokenize the reviews
|
# Tokenize the reviews and collect the iterator to avoid breaking NLTK
|
||||||
dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
|
dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
|
||||||
# Add the unigrams feature
|
# Add the unigrams feature
|
||||||
self._add_feature_unigrams(dataset)
|
self._add_feature_unigrams(dataset)
|
||||||
|
|
||||||
def __extract_features(self, review: TextReview) -> tuple[Features, float]:
|
def __extract_features(self, review: TextReview) -> tuple[Features, str]:
|
||||||
"""
|
"""
|
||||||
Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.
|
Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.
|
||||||
|
|
||||||
Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
|
Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
|
||||||
"""
|
"""
|
||||||
review: TokenizedReview = self.tokenizer.tokenize_review(review)
|
review: TokenizedReview = self.tokenizer.tokenize_review(review)
|
||||||
return self.model.extract_features(review.tokens), review.rating
|
return self.model.extract_features(review.tokens), str(review.rating)
|
||||||
|
|
||||||
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
|
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
|
||||||
# Forbid retraining the model
|
# Forbid retraining the model
|
||||||
|
@ -66,7 +66,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
self._add_feature_extractors(training_dataset_func())
|
self._add_feature_extractors(training_dataset_func())
|
||||||
|
|
||||||
# Extract features from the dataset
|
# Extract features from the dataset
|
||||||
featureset: t.Iterator[tuple[Features, float]] = map(self.__extract_features, training_dataset_func())
|
featureset: t.Iterator[tuple[Features, str]] = map(self.__extract_features, training_dataset_func())
|
||||||
|
|
||||||
# Train the classifier with the extracted features and category
|
# Train the classifier with the extracted features and category
|
||||||
self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
|
self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
|
||||||
|
@ -83,7 +83,12 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
tokens = self.tokenizer.tokenize(text)
|
tokens = self.tokenizer.tokenize(text)
|
||||||
|
|
||||||
# Run the classification method
|
# Run the classification method
|
||||||
return self.model.classify(instance=tokens)
|
rating = self.model.classify(instance=tokens)
|
||||||
|
|
||||||
|
# Convert the class back into a float
|
||||||
|
rating = float(rating)
|
||||||
|
|
||||||
|
return rating
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
|
|
|
@ -118,6 +118,8 @@ class PottsTokenizer(BaseTokenizer):
|
||||||
tokens = self.words_re.findall(text)
|
tokens = self.words_re.findall(text)
|
||||||
# Possibly alter the case, but avoid changing emoticons like :D into :d:
|
# Possibly alter the case, but avoid changing emoticons like :D into :d:
|
||||||
tokens = map(self.lower_but_preserve_emoticons, tokens)
|
tokens = map(self.lower_but_preserve_emoticons, tokens)
|
||||||
|
# Convert the iterator to a list (sigh)
|
||||||
|
tokens = list(tokens)
|
||||||
# Return the result
|
# Return the result
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
@ -129,13 +131,11 @@ class PottsTokenizerWithNegation(PottsTokenizer):
|
||||||
|
|
||||||
def tokenize(self, text: str) -> t.Iterator[str]:
|
def tokenize(self, text: str) -> t.Iterator[str]:
|
||||||
# Apply the base tokenization
|
# Apply the base tokenization
|
||||||
words = super().tokenize(text)
|
tokens = super().tokenize(text)
|
||||||
# Convert the iterator to a list (sigh)
|
|
||||||
words = list(words)
|
|
||||||
# Use nltk to mark negation
|
# Use nltk to mark negation
|
||||||
nltk.sentiment.util.mark_negation(words, shallow=True)
|
nltk.sentiment.util.mark_negation(tokens, shallow=True)
|
||||||
# Return the result
|
# Return the result
|
||||||
return words
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
|
|
Loading…
Reference in a new issue