mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-21 23:44:19 +00:00
Fix VanillaSA to work with iterators
This commit is contained in:
parent
767a6087a8
commit
4e1a9f842f
12 changed files with 99 additions and 63 deletions
|
@ -5,6 +5,8 @@
|
||||||
<option name="ignoredErrors">
|
<option name="ignoredErrors">
|
||||||
<list>
|
<list>
|
||||||
<option value="E124" />
|
<option value="E124" />
|
||||||
|
<option value="E501" />
|
||||||
|
<option value="E221" />
|
||||||
</list>
|
</list>
|
||||||
</option>
|
</option>
|
||||||
</inspection_tool>
|
</inspection_tool>
|
||||||
|
|
|
@ -4,6 +4,9 @@
|
||||||
<option name="show" value="ASK" />
|
<option name="show" value="ASK" />
|
||||||
<option name="description" value="" />
|
<option name="description" value="" />
|
||||||
</component>
|
</component>
|
||||||
|
<component name="PWA">
|
||||||
|
<option name="wasEnabledAtLeastOnce" value="true" />
|
||||||
|
</component>
|
||||||
<component name="ProjectRootManager" version="2" languageLevel="JDK_19">
|
<component name="ProjectRootManager" version="2" languageLevel="JDK_19">
|
||||||
<output url="file://$PROJECT_DIR$/out" />
|
<output url="file://$PROJECT_DIR$/out" />
|
||||||
</component>
|
</component>
|
||||||
|
|
|
@ -4,8 +4,10 @@
|
||||||
<option name="INTERPRETER_OPTIONS" value="" />
|
<option name="INTERPRETER_OPTIONS" value="" />
|
||||||
<option name="PARENT_ENVS" value="true" />
|
<option name="PARENT_ENVS" value="true" />
|
||||||
<envs>
|
<envs>
|
||||||
|
<env name="DATA_SET_SIZE" value="10000" />
|
||||||
<env name="NLTK_DATA" value="./data/nltk" />
|
<env name="NLTK_DATA" value="./data/nltk" />
|
||||||
<env name="PYTHONUNBUFFERED" value="1" />
|
<env name="PYTHONUNBUFFERED" value="1" />
|
||||||
|
<env name="WORKING_SET_SIZE" value="1000000" />
|
||||||
</envs>
|
</envs>
|
||||||
<option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
|
<option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
|
||||||
<option name="SDK_NAME" value="Poetry (unimore-bda-6)" />
|
<option name="SDK_NAME" value="Poetry (unimore-bda-6)" />
|
||||||
|
|
4
.vscode/launch.json
vendored
4
.vscode/launch.json
vendored
|
@ -9,7 +9,7 @@
|
||||||
"type": "python",
|
"type": "python",
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
"module": "unimore_bda_6",
|
"module": "unimore_bda_6",
|
||||||
"justMyCode": true,
|
"justMyCode": false,
|
||||||
"env": {
|
"env": {
|
||||||
"NLTK_DATA": "./data/nltk",
|
"NLTK_DATA": "./data/nltk",
|
||||||
"DATA_SET_SIZE": "250",
|
"DATA_SET_SIZE": "250",
|
||||||
|
@ -17,4 +17,4 @@
|
||||||
"cwd": "${workspaceFolder}",
|
"cwd": "${workspaceFolder}",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
@ -64,35 +64,28 @@ def varied_categorizer(rating: float) -> str:
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
with mongo_reviews_collection_from_config() as reviews:
|
for dataset_func, categorizer in [
|
||||||
reviews_polar_training = dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
(dataset_polar, polar_categorizer),
|
||||||
reviews_polar_evaluation = dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
(dataset_varied, varied_categorizer),
|
||||||
|
]:
|
||||||
|
for tokenizer in all_tokenizers:
|
||||||
|
with mongo_reviews_collection_from_config() as reviews:
|
||||||
|
reviews_training = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
||||||
|
reviews_evaluation = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
||||||
|
|
||||||
for tokenizer in all_tokenizers:
|
model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=categorizer)
|
||||||
log.info("Training polar model with %s tokenizer", tokenizer)
|
log.info("Training model %s", model)
|
||||||
model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=polar_categorizer)
|
model.train(reviews_training)
|
||||||
model.train(reviews_polar_training)
|
log.info("Evaluating model %s", model)
|
||||||
log.info("Evaluating polar model with %s tokenizer", tokenizer)
|
evaluation = model.evaluate(reviews_evaluation)
|
||||||
evaluation = model.evaluate(reviews_polar_evaluation)
|
log.info("Results of model %s: %s", tokenizer, evaluation)
|
||||||
log.info("Polar model with %s results: %s", tokenizer, evaluation)
|
|
||||||
|
|
||||||
del reviews_polar_training
|
try:
|
||||||
del reviews_polar_evaluation
|
print("Model %s" % model)
|
||||||
|
while True:
|
||||||
with mongo_reviews_collection_from_config() as reviews:
|
print(model.use(input()))
|
||||||
reviews_varied_training = dataset_varied(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
except KeyboardInterrupt:
|
||||||
reviews_varied_evaluation = dataset_varied(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
pass
|
||||||
|
|
||||||
for tokenizer in all_tokenizers:
|
|
||||||
log.info("Training varied model with %s tokenizer", tokenizer)
|
|
||||||
model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=varied_categorizer)
|
|
||||||
model.train(reviews_varied_training)
|
|
||||||
log.info("Evaluating varied model with %s tokenizer", tokenizer)
|
|
||||||
evaluation = model.evaluate(reviews_varied_evaluation)
|
|
||||||
log.info("Varied model with %s results: %s", tokenizer, evaluation)
|
|
||||||
|
|
||||||
del reviews_varied_training
|
|
||||||
del reviews_varied_evaluation
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -12,7 +12,7 @@ class BaseSA(metaclass=abc.ABCMeta):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def train(self, training_set: list[tuple[Input, Category]]) -> None:
|
def train(self, training_set: t.Iterable[tuple[Input, Category]]) -> None:
|
||||||
"""
|
"""
|
||||||
Train the analyzer with the given training set.
|
Train the analyzer with the given training set.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -4,11 +4,14 @@ import nltk.sentiment
|
||||||
import nltk.sentiment.util
|
import nltk.sentiment.util
|
||||||
import logging
|
import logging
|
||||||
import typing as t
|
import typing as t
|
||||||
|
import itertools
|
||||||
|
|
||||||
from .base import Input, Category, BaseSA, AlreadyTrainedError, NotTrainedError
|
from .base import Input, Category, BaseSA, AlreadyTrainedError, NotTrainedError
|
||||||
|
from ..log import count_passage
|
||||||
|
|
||||||
TokenBag = list[str]
|
TokenBag = list[str]
|
||||||
IntermediateValue = t.TypeVar("IntermediateValue")
|
IntermediateValue = t.TypeVar("IntermediateValue")
|
||||||
|
Features = dict[str, int]
|
||||||
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
@ -19,51 +22,72 @@ class VanillaSA(BaseSA):
|
||||||
A sentiment analyzer resembling in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
|
A sentiment analyzer resembling in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, *, extractor: t.Callable[[Input], tuple[str, Category]], tokenizer: t.Callable[[str], TokenBag], categorizer: t.Callable[[Input], Category]) -> None:
|
def __init__(self, *, extractor: t.Callable[[Input], tuple[str, IntermediateValue]], tokenizer: t.Callable[[str], TokenBag], categorizer: t.Callable[[IntermediateValue], Category]) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
|
self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
|
||||||
self.trained: bool = False
|
self.trained: bool = False
|
||||||
|
|
||||||
self.extractor: t.Callable[[Input], tuple[str, IntermediateValue]] = extractor
|
self.extractor: t.Callable[[Input], tuple[str, IntermediateValue]] = extractor
|
||||||
self.tokenizer: t.Callable[[str], TokenBag] = tokenizer
|
self.tokenizer: t.Callable[[str], TokenBag] = tokenizer
|
||||||
self.categorizer: t.Callable[[IntermediateValue], Category] = categorizer
|
self.categorizer: t.Callable[[IntermediateValue], Category] = categorizer
|
||||||
|
|
||||||
def __add_feature_unigrams(self, training_set: list[tuple[TokenBag, Category]]) -> None:
|
def __repr__(self):
|
||||||
|
return f"<{self.__class__.__qualname__} {'trained' if self.trained else 'untrained'} tokenizer={self.extractor!r} categorizer={self.categorizer!r}>"
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def __data_to_tokenbag(data: tuple[TokenBag, Category]) -> TokenBag:
|
||||||
"""
|
"""
|
||||||
Add the `nltk.sentiment.util.extract_unigram_feats` feature to the model.
|
Access the tokenbag of a data tuple.
|
||||||
"""
|
"""
|
||||||
all_words = self.model.all_words(training_set, labeled=True)
|
return data[0]
|
||||||
|
|
||||||
|
def __add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
|
||||||
|
"""
|
||||||
|
Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model.
|
||||||
|
"""
|
||||||
|
tokenbags = map(self.__data_to_tokenbag, dataset)
|
||||||
|
all_words = self.model.all_words(tokenbags, labeled=False)
|
||||||
unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
|
unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
|
||||||
self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
|
self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
|
||||||
|
|
||||||
def _add_features(self, training_set: list[tuple[TokenBag, Category]]):
|
def _add_features(self, dataset: t.Iterator[tuple[TokenBag, Category]]):
|
||||||
"""
|
"""
|
||||||
Add new features to the sentiment analyzer.
|
Register new feature extractors on the `.model`.
|
||||||
"""
|
"""
|
||||||
self.__add_feature_unigrams(training_set)
|
self.__add_feature_unigrams(dataset)
|
||||||
|
|
||||||
def _train_from_dataset(self, dataset: list[tuple[TokenBag, Category]]) -> None:
|
def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
|
||||||
|
"""
|
||||||
|
Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.
|
||||||
|
|
||||||
|
Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
|
||||||
|
"""
|
||||||
|
return self.model.extract_features(data[0]), data[1]
|
||||||
|
|
||||||
|
def _train_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
|
||||||
"""
|
"""
|
||||||
Train the model with the given training set.
|
Train the model with the given training set.
|
||||||
"""
|
"""
|
||||||
if self.trained:
|
if self.trained:
|
||||||
raise AlreadyTrainedError()
|
raise AlreadyTrainedError()
|
||||||
|
|
||||||
self.__add_feature_unigrams(dataset)
|
dataset_1, dataset_2 = itertools.tee(dataset, 2)
|
||||||
training_set_with_features = self.model.apply_features(dataset, labeled=True)
|
|
||||||
|
|
||||||
self.model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set_with_features)
|
self._add_features(dataset_1)
|
||||||
|
del dataset_1
|
||||||
|
|
||||||
|
dataset_2 = map(self.__extract_features, dataset_2)
|
||||||
|
self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
|
||||||
self.trained = True
|
self.trained = True
|
||||||
|
|
||||||
def _evaluate_from_dataset(self, dataset: list[tuple[TokenBag, Category]]) -> dict:
|
def _evaluate_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> dict:
|
||||||
"""
|
"""
|
||||||
Perform a model evaluation with the given test set.
|
Perform a model evaluation with the given test set.
|
||||||
"""
|
"""
|
||||||
if not self.trained:
|
if not self.trained:
|
||||||
raise NotTrainedError()
|
raise NotTrainedError()
|
||||||
|
|
||||||
test_set_with_features = self.model.apply_features(dataset, labeled=True)
|
dataset_1 = map(self.__extract_features, dataset)
|
||||||
return self.model.evaluate(test_set_with_features)
|
return self.model.evaluate(dataset_1)
|
||||||
|
|
||||||
def _use_from_tokenbag(self, tokens: TokenBag) -> Category:
|
def _use_from_tokenbag(self, tokens: TokenBag) -> Category:
|
||||||
"""
|
"""
|
||||||
|
@ -75,17 +99,18 @@ class VanillaSA(BaseSA):
|
||||||
return self.model.classify(instance=tokens)
|
return self.model.classify(instance=tokens)
|
||||||
|
|
||||||
def _extract_data(self, inp: Input) -> tuple[TokenBag, Category]:
|
def _extract_data(self, inp: Input) -> tuple[TokenBag, Category]:
|
||||||
|
count_passage("processed_data", 100)
|
||||||
text, value = self.extractor(inp)
|
text, value = self.extractor(inp)
|
||||||
return self.tokenizer(text), self.categorizer(value)
|
return self.tokenizer(text), self.categorizer(value)
|
||||||
|
|
||||||
def _extract_dataset(self, inp: list[Input]) -> list[tuple[TokenBag, Category]]:
|
def _extract_dataset(self, inp: t.Iterator[Input]) -> list[tuple[TokenBag, Category]]:
|
||||||
return list(map(self._extract_data, inp))
|
return map(self._extract_data, inp)
|
||||||
|
|
||||||
def train(self, training_set: list[Input]) -> None:
|
def train(self, training_set: t.Iterator[Input]) -> None:
|
||||||
dataset = self._extract_dataset(training_set)
|
dataset = self._extract_dataset(training_set)
|
||||||
self._train_from_dataset(dataset)
|
self._train_from_dataset(dataset)
|
||||||
|
|
||||||
def evaluate(self, test_set: list[tuple[Input, Category]]) -> None:
|
def evaluate(self, test_set: t.Iterator[Input]) -> dict:
|
||||||
dataset = self._extract_dataset(test_set)
|
dataset = self._extract_dataset(test_set)
|
||||||
return self._evaluate_from_dataset(dataset)
|
return self._evaluate_from_dataset(dataset)
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@ import pymongo.collection
|
||||||
import contextlib
|
import contextlib
|
||||||
import bson
|
import bson
|
||||||
import logging
|
import logging
|
||||||
import random
|
import itertools
|
||||||
|
|
||||||
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
|
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
|
||||||
|
|
||||||
|
@ -55,7 +55,7 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
|
||||||
yield collection
|
yield collection
|
||||||
|
|
||||||
|
|
||||||
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
|
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
|
||||||
"""
|
"""
|
||||||
Get ``amount`` random reviews from the ``reviews`` collection.
|
Get ``amount`` random reviews from the ``reviews`` collection.
|
||||||
"""
|
"""
|
||||||
|
@ -67,7 +67,7 @@ def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Ite
|
||||||
])
|
])
|
||||||
|
|
||||||
|
|
||||||
def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterable[Review]:
|
def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
|
||||||
"""
|
"""
|
||||||
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
|
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
|
||||||
"""
|
"""
|
||||||
|
@ -80,7 +80,7 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
|
||||||
])
|
])
|
||||||
|
|
||||||
|
|
||||||
def dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
|
def dataset_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
|
||||||
"""
|
"""
|
||||||
Get a list of the same amount of 1-star and 5-star reviews.
|
Get a list of the same amount of 1-star and 5-star reviews.
|
||||||
"""
|
"""
|
||||||
|
@ -91,12 +91,12 @@ def dataset_polar(collection: pymongo.collection.Collection, amount: int) -> lis
|
||||||
negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
|
negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
|
||||||
|
|
||||||
# Randomness here does not matter, so just merge the lists
|
# Randomness here does not matter, so just merge the lists
|
||||||
both = [*positive, *negative]
|
both = itertools.chain(positive, negative)
|
||||||
|
|
||||||
return both
|
return both
|
||||||
|
|
||||||
|
|
||||||
def dataset_varied(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
|
def dataset_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
|
||||||
"""
|
"""
|
||||||
Get a list of the same amount of reviews for each rating.
|
Get a list of the same amount of reviews for each rating.
|
||||||
"""
|
"""
|
||||||
|
@ -109,8 +109,7 @@ def dataset_varied(collection: pymongo.collection.Collection, amount: int) -> li
|
||||||
positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
|
positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
|
||||||
great = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
|
great = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
|
||||||
|
|
||||||
# Randomness here does not matter, so just merge the lists
|
full = itertools.chain(terrible, negative, mixed, positive, great)
|
||||||
full = [*terrible, *negative, *mixed, *positive, *great]
|
|
||||||
|
|
||||||
return full
|
return full
|
||||||
|
|
||||||
|
@ -122,4 +121,5 @@ __all__ = (
|
||||||
"sample_reviews",
|
"sample_reviews",
|
||||||
"sample_reviews_by_rating",
|
"sample_reviews_by_rating",
|
||||||
"dataset_polar",
|
"dataset_polar",
|
||||||
|
"dataset_varied",
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import collections
|
||||||
import logging
|
import logging
|
||||||
import coloredlogs
|
import coloredlogs
|
||||||
|
|
||||||
|
@ -34,6 +35,16 @@ def install_log_handler(loggers: list[logging.Logger] = None):
|
||||||
log.debug("Installed custom log handler on: %s", logger)
|
log.debug("Installed custom log handler on: %s", logger)
|
||||||
|
|
||||||
|
|
||||||
|
_passage_counts = collections.defaultdict(lambda: 0)
|
||||||
|
|
||||||
|
|
||||||
|
def count_passage(key: str, mod: int):
|
||||||
|
_passage_counts[key] += 1
|
||||||
|
if not _passage_counts[key] % mod:
|
||||||
|
log.debug("%s - %d calls", key, _passage_counts[key])
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
"install_log_handler",
|
"install_log_handler",
|
||||||
|
"count_passage",
|
||||||
)
|
)
|
||||||
|
|
|
@ -3,8 +3,8 @@ from . import potts_based
|
||||||
|
|
||||||
|
|
||||||
all_tokenizers = [
|
all_tokenizers = [
|
||||||
nltk_based.tokenizer,
|
nltk_based.nltk_tokenizer,
|
||||||
potts_based.tokenizer,
|
potts_based.potts_tokenizer,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@ import nltk
|
||||||
import nltk.sentiment.util
|
import nltk.sentiment.util
|
||||||
|
|
||||||
|
|
||||||
def tokenizer(text: str) -> list[str]:
|
def nltk_tokenizer(text: str) -> list[str]:
|
||||||
"""
|
"""
|
||||||
Convert a text string into a list of tokens.
|
Convert a text string into a list of tokens.
|
||||||
"""
|
"""
|
||||||
|
@ -12,5 +12,5 @@ def tokenizer(text: str) -> list[str]:
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
"tokenizer",
|
"nltk_tokenizer",
|
||||||
)
|
)
|
||||||
|
|
|
@ -143,7 +143,7 @@ amp = "&"
|
||||||
######################################################################
|
######################################################################
|
||||||
|
|
||||||
|
|
||||||
def tokenizer(text: str) -> t.Iterable[str]:
|
def potts_tokenizer(text: str) -> t.Iterable[str]:
|
||||||
"""
|
"""
|
||||||
Argument: s -- any string object
|
Argument: s -- any string object
|
||||||
Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
|
Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
|
||||||
|
@ -187,5 +187,5 @@ def __html2string(html: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
"tokenizer",
|
"potts_tokenizer",
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue