Mirror of https://github.com/Steffo99/unimore-bda-6.git (synced 2024-11-21 23:44:19 +00:00)

Commit 965cea692a (parent 4c3f892038): Refactor things to work better

12 changed files with 239 additions and 234 deletions

.editorconfig (new file, +12)
@@ -0,0 +1,12 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+indent_style = space
+indent_size = 4
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true

.vscode/launch.json (vendored, 2 changed lines)
@@ -12,7 +12,7 @@
             "justMyCode": true,
             "env": {
                 "NLTK_DATA": "./data/nltk",
-                "DATA_SET_SIZE": "100",
+                "DATA_SET_SIZE": "250",
             },
             "cwd": "${workspaceFolder}",
         }

@@ -1,36 +1,98 @@
 import logging
 
 from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
-from .analysis.vanilla import VanillaReviewSA, polar_categorizer, stars_categorizer
-from .analysis.potts import PottsReviewSA
+from .database import Review, mongo_reviews_collection_from_config, dataset_polar, dataset_varied
+from .analysis.vanilla import VanillaSA
+from .tokenization import all_tokenizers
 from .log import install_log_handler
 
 log = logging.getLogger(__name__)
 
 
+def review_vanilla_extractor(review: Review) -> tuple[str, float]:
+    """
+    Extract review text and rating from a `Review`.
+    """
+    return review["reviewText"], review["overall"]
+
+
+def polar_categorizer(rating: float) -> str:
+    """
+    Return the polar label corresponding to the given rating.
+
+    Possible categories are:
+
+    * negative (1.0, 2.0)
+    * positive (3.0, 4.0, 5.0)
+    * unknown (everything else)
+    """
+    match rating:
+        case 1.0 | 2.0:
+            return "negative"
+        case 3.0 | 4.0 | 5.0:
+            return "positive"
+        case _:
+            return "unknown"
+
+
+def varied_categorizer(rating: float) -> str:
+    """
+    Return the "stars" label corresponding to the given rating.
+
+    Possible categories are:
+
+    * terrible (1.0)
+    * negative (2.0)
+    * mixed (3.0)
+    * positive (4.0)
+    * great (5.0)
+    * unknown (everything else)
+    """
+    match rating:
+        case 1.0:
+            return "terrible"
+        case 2.0:
+            return "negative"
+        case 3.0:
+            return "mixed"
+        case 4.0:
+            return "positive"
+        case 5.0:
+            return "great"
+        case _:
+            return "unknown"
+
+
 def main():
     with mongo_reviews_collection_from_config() as reviews:
-        reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-        reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-        reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-        reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_polar_training = dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_polar_evaluation = dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
 
-    vanilla_polar = VanillaReviewSA(categorizer=polar_categorizer)
-    vanilla_polar.train(reviews_polar_training)
-    log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
+    for tokenizer in all_tokenizers:
+        log.info("Training polar model with %s tokenizer", tokenizer)
+        model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=polar_categorizer)
+        model.train(reviews_polar_training)
+        log.info("Evaluating polar model with %s tokenizer", tokenizer)
+        evaluation = model.evaluate(reviews_polar_evaluation)
+        log.info("Polar model with %s results: %s", tokenizer, evaluation)
 
-    potts_polar = PottsReviewSA()
-    potts_polar.train(reviews_polar_training)
-    log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))
+    del reviews_polar_training
+    del reviews_polar_evaluation
 
-    vanilla_uniform = VanillaReviewSA(categorizer=stars_categorizer)
-    vanilla_uniform.train(reviews_uniform_training)
-    log.info("Vanilla uniform evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
+    with mongo_reviews_collection_from_config() as reviews:
+        reviews_varied_training = dataset_varied(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_varied_evaluation = dataset_varied(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
 
-    while True:
-        print(vanilla_uniform.use(input("> ")))
+    for tokenizer in all_tokenizers:
+        log.info("Training varied model with %s tokenizer", tokenizer)
+        model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=varied_categorizer)
+        model.train(reviews_varied_training)
+        log.info("Evaluating varied model with %s tokenizer", tokenizer)
+        evaluation = model.evaluate(reviews_varied_evaluation)
+        log.info("Varied model with %s results: %s", tokenizer, evaluation)
+
+    del reviews_varied_training
+    del reviews_varied_evaluation
 
 
 if __name__ == "__main__":
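As an illustration of the new helpers above, a minimal sketch of the data flow from a review document to a label; the sample review dict is made up and only mirrors the keys used by review_vanilla_extractor.

fake_review = {"reviewText": "Great pedals, very sturdy!", "overall": 5.0}  # made-up document

text, rating = review_vanilla_extractor(fake_review)  # -> ("Great pedals, very sturdy!", 5.0)
polar_categorizer(rating)    # -> "positive"
varied_categorizer(rating)   # -> "great"
varied_categorizer(3.5)      # -> "unknown" (no case matches a half-star rating)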

@@ -0,0 +1,6 @@
+from .vanilla import BaseSA
+
+
+__all__ = (
+    "BaseSA",
+)

@@ -1,4 +1,9 @@
 import abc
+import typing as t
+
+
+Input = t.TypeVar("Input")
+Category = t.TypeVar("Category")
 
 
 class BaseSA(metaclass=abc.ABCMeta):
@@ -6,30 +11,15 @@ class BaseSA(metaclass=abc.ABCMeta):
     Abstract base class for sentiment analyzers implemented in this project.
     """
 
-    def __init__(self) -> None:
-        """
-        Create the empty shell of the sentiment analyzer.
-        """
-
-        self.trained = False
-        "If :meth:`train` has been called at least once, and the analyzer is ready or not to be evaluated or used."
-
     @abc.abstractmethod
-    def train(self, training_set) -> None:
+    def train(self, training_set: list[tuple[Input, Category]]) -> None:
         """
         Train the analyzer with the given training set.
         """
         raise NotImplementedError()
 
     @abc.abstractmethod
-    def evaluate(self, test_set) -> None:
-        """
-        Evaluate the analyzer with the given test set.
-        """
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def use(self, text: str) -> str:
+    def use(self, text: Input) -> Category:
         """
         Use the sentiment analyzer.
         """
@@ -48,6 +38,8 @@ class NotTrainedError(Exception):
 
 
 __all__ = (
+    "Input",
+    "Category",
     "BaseSA",
     "AlreadyTrainedError",
     "NotTrainedError",
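To show what the slimmed-down abstract contract now requires, here is a hypothetical subclass; after this change BaseSA only demands train and use, and the keyword-counting logic below is invented purely for illustration.

class KeywordSA(BaseSA):
    """Hypothetical analyzer: labels text by counting known positive words."""

    def __init__(self) -> None:
        self.positive_words: set[str] = set()

    def train(self, training_set: list[tuple[str, str]]) -> None:
        # Remember every word seen in a positively-labelled sample.
        for text, category in training_set:
            if category == "positive":
                self.positive_words.update(text.split())

    def use(self, text: str) -> str:
        # Classify as positive if any remembered word appears.
        hits = sum(word in self.positive_words for word in text.split())
        return "positive" if hits else "negative"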

@@ -1,30 +0,0 @@
-from ..vendor.potts import Tokenizer
-from .vanilla import VanillaSA, VanillaReviewSA
-
-
-class PottsSA(VanillaSA):
-    """
-    A sentiment analyzer using Potts' tokenizer.
-    """
-
-    def __init__(self) -> None:
-        super().__init__()
-
-    def _tokenize_text(self, text: str) -> list[str]:
-        """
-        Convert a text string into a list of tokens, using the language of the model.
-        """
-        tokenizer: Tokenizer = Tokenizer(preserve_case=False)
-        return list(tokenizer.tokenize(text))
-
-
-class PottsReviewSA(VanillaReviewSA, PottsSA):
-    """
-    A `PottsSA` to be used with `Review`s.
-    """
-
-
-__all__ = (
-    "PottsSA",
-    "PottsReviewSA",
-)

@@ -1,4 +1,3 @@
-import abc
 import nltk
 import nltk.classify
 import nltk.sentiment
@@ -6,31 +5,30 @@ import nltk.sentiment.util
 import logging
 import typing as t
 
-from ..database import Review
-from .base import BaseSA, AlreadyTrainedError, NotTrainedError
+from .base import Input, Category, BaseSA, AlreadyTrainedError, NotTrainedError
+
+TokenBag = list[str]
+IntermediateValue = t.TypeVar("IntermediateValue")
 
 
 log = logging.getLogger(__name__)
 
 
-class VanillaSA(BaseSA, metaclass=abc.ABCMeta):
+class VanillaSA(BaseSA):
     """
     A sentiment analyzer resembling the one implemented in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
     """
 
-    def __init__(self) -> None:
+    def __init__(self, *, extractor: t.Callable[[Input], tuple[str, Category]], tokenizer: t.Callable[[str], TokenBag], categorizer: t.Callable[[Input], Category]) -> None:
         super().__init__()
         self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
+        self.trained: bool = False
 
-    def _tokenize_text(self, text: str) -> list[str]:
-        """
-        Convert a text string into a list of tokens.
-        """
-        tokens = nltk.word_tokenize(text)
-        nltk.sentiment.util.mark_negation(tokens, shallow=True)
-        return tokens
+        self.extractor: t.Callable[[Input], tuple[str, IntermediateValue]] = extractor
+        self.tokenizer: t.Callable[[str], TokenBag] = tokenizer
+        self.categorizer: t.Callable[[IntermediateValue], Category] = categorizer
 
-    def __add_feature_unigrams(self, training_set: list[tuple[list[str], str]]) -> None:
+    def __add_feature_unigrams(self, training_set: list[tuple[TokenBag, Category]]) -> None:
         """
         Add the `nltk.sentiment.util.extract_unigram_feats` feature to the model.
         """
@@ -38,116 +36,64 @@ class VanillaSA(BaseSA, metaclass=abc.ABCMeta):
         unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
         self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
 
-    def _featurize_documents(self, documents: list[tuple[list[str], str]]):
+    def _add_features(self, training_set: list[tuple[TokenBag, Category]]):
         """
-        Apply features to a document.
+        Add new features to the sentiment analyzer.
         """
-        return self.model.apply_features(documents, labeled=True)
+        self.__add_feature_unigrams(training_set)
 
-    def _train_with_set(self, training_set: list[tuple[list[str], str]]) -> None:
+    def _train_from_dataset(self, dataset: list[tuple[TokenBag, Category]]) -> None:
         """
-        Train the model with the given **pre-classified but not pre-tokenized** training set.
+        Train the model with the given training set.
         """
         if self.trained:
             raise AlreadyTrainedError()
 
-        self.__add_feature_unigrams(training_set)
-        training_set_with_features = self._featurize_documents(training_set)
+        self.__add_feature_unigrams(dataset)
+        training_set_with_features = self.model.apply_features(dataset, labeled=True)
 
         self.model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set_with_features)
         self.trained = True
 
-    def _evaluate_with_set(self, test_set: list[tuple[list[str], str]]) -> dict:
+    def _evaluate_from_dataset(self, dataset: list[tuple[TokenBag, Category]]) -> dict:
+        """
+        Perform a model evaluation with the given test set.
+        """
         if not self.trained:
             raise NotTrainedError()
 
-        test_set_with_features = self._featurize_documents(test_set)
+        test_set_with_features = self.model.apply_features(dataset, labeled=True)
         return self.model.evaluate(test_set_with_features)
 
-    def _use_with_tokens(self, tokens: list[str]) -> str:
+    def _use_from_tokenbag(self, tokens: TokenBag) -> Category:
+        """
+        Categorize the given token bag.
+        """
         if not self.trained:
             raise NotTrainedError()
 
         return self.model.classify(instance=tokens)
 
+    def _extract_data(self, inp: Input) -> tuple[TokenBag, Category]:
+        text, value = self.extractor(inp)
+        return self.tokenizer(text), self.categorizer(value)
 
-class VanillaReviewSA(VanillaSA):
-    """
-    A `VanillaSA` to be used with `Review`s.
-    """
+    def _extract_dataset(self, inp: list[Input]) -> list[tuple[TokenBag, Category]]:
+        return list(map(self._extract_data, inp))
 
-    def __init__(self, categorizer: t.Callable[[Review], str]) -> None:
-        super().__init__()
-        self.categorizer: t.Callable[[Review], str] = categorizer
+    def train(self, training_set: list[Input]) -> None:
+        dataset = self._extract_dataset(training_set)
+        self._train_from_dataset(dataset)
 
-    def _review_to_data_set(self, review: Review) -> tuple[list[str], str]:
-        """
-        Convert a review to a NLTK-compatible dataset.
-        """
-        return self._tokenize_text(text=review["reviewText"]), self.categorizer(rating=review["overall"])
+    def evaluate(self, test_set: list[tuple[Input, Category]]) -> None:
+        dataset = self._extract_dataset(test_set)
        return self._evaluate_from_dataset(dataset)
 
-    def train(self, reviews: t.Iterable[Review]) -> None:
-        data_set = list(map(self._review_to_data_set, reviews))
-        self._train_with_set(data_set)
+    def use(self, text: Input) -> Category:
+        tokens = self.tokenizer(text)
+        return self._use_from_tokenbag(tokens)
 
-    def evaluate(self, reviews: t.Iterable[Review]):
-        data_set = list(map(self._review_to_data_set, reviews))
-        return self._evaluate_with_set(data_set)
-
-    def use(self, text: str) -> str:
-        return self._use_with_tokens(self._tokenize_text(text))
-
-
-def polar_categorizer(rating: float) -> str:
-    """
-    Return the polar label corresponding to the given rating.
-
-    Possible categories are:
-
-    * negative (1.0, 2.0)
-    * positive (3.0, 4.0, 5.0)
-    * unknown (everything else)
-    """
-    match rating:
-        case 1.0 | 2.0:
-            return "negative"
-        case 3.0 | 4.0 | 5.0:
-            return "positive"
-        case _:
-            return "unknown"
-
-
-def stars_categorizer(rating: float) -> str:
-    """
-    Return the "stars" label corresponding to the given rating.
-
-    Possible categories are:
-
-    * terrible (1.0)
-    * negative (2.0)
-    * mixed (3.0)
-    * positive (4.0)
-    * great (5.0)
-    * unknown (everything else)
-    """
-    match rating:
-        case 1.0:
-            return "terrible"
-        case 2.0:
-            return "negative"
-        case 3.0:
-            return "mixed"
-        case 4.0:
-            return "positive"
-        case 5.0:
-            return "great"
-        case _:
-            return "unknown"
-
 
 __all__ = (
     "VanillaSA",
-    "VanillaReviewSA",
-    "polar_categorizer",
-    "stars_categorizer",
 )
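A minimal usage sketch of the reworked VanillaSA outside of MongoDB; the import paths follow the package layout shown in this commit, the toy reviews are invented, and NLTK's tokenizer data must already be downloaded (see NLTK_DATA in launch.json). With only two samples the unigram extractor (min_freq=4) finds no features, so this demonstrates the wiring rather than a useful model.

from unimore_bda_6.analysis.vanilla import VanillaSA
from unimore_bda_6.tokenization import nltk_based

toy_reviews = [  # invented documents, same shape as the MongoDB reviews
    {"reviewText": "Loved it, works perfectly", "overall": 5.0},
    {"reviewText": "Broke after a week, do not buy", "overall": 1.0},
]

model = VanillaSA(
    extractor=lambda review: (review["reviewText"], review["overall"]),      # Input -> (text, rating)
    tokenizer=nltk_based.tokenizer,                                          # text -> TokenBag
    categorizer=lambda rating: "positive" if rating >= 3.0 else "negative",  # rating -> Category
)
model.train(toy_reviews)
print(model.use("works perfectly"))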

@@ -80,11 +80,11 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
     ])
 
 
-def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
+def dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
     """
-    Get a list of shuffled 1-star and 5-star reviews.
+    Get a list of the same amount of 1-star and 5-star reviews.
     """
-    log.info("Building dataset with %d polar reviews...", amount * 2)
+    log.info("Building polar dataset with %d reviews...", amount * 2)
 
     # Sample the required reviews
     positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
@@ -93,18 +93,14 @@ def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount:
     # Randomness here does not matter, so just merge the lists
     both = [*positive, *negative]
 
-    # Shuffle the dataset, just in case it affects the performance
-    # TODO: does it actually?
-    random.shuffle(both)
-
     return both
 
 
-def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
+def dataset_varied(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
     """
-    Get a list of shuffled reviews of any rating.
+    Get a list of the same amount of reviews for each rating.
     """
-    log.info("Building dataset with %d uniform reviews...", amount * 5)
+    log.info("Building varied dataset with %d reviews...", amount * 5)
 
     # Sample the required reviews
     terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
@@ -116,10 +112,6 @@ def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amoun
     # Randomness here does not matter, so just merge the lists
     full = [*terrible, *negative, *mixed, *positive, *great]
 
-    # Shuffle the dataset, just in case it affects the performance
-    # TODO: does it actually?
-    random.shuffle(full)
-
     return full
 
 
@@ -129,5 +121,5 @@ __all__ = (
     "mongo_reviews_collection_from_config",
     "sample_reviews",
     "sample_reviews_by_rating",
-    "get_reviews_dataset_polar",
+    "dataset_polar",
 )
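For reference, a sketch of how the renamed dataset helpers are meant to be called; the MongoDB URI, database and collection names below are placeholders, not taken from the project configuration.

import pymongo

from unimore_bda_6.database import dataset_polar, dataset_varied

client = pymongo.MongoClient("mongodb://localhost:27017")  # placeholder URI
collection = client["reviews_db"]["reviews"]               # placeholder names

polar = dataset_polar(collection, amount=250)    # 250 one-star + 250 five-star reviews
varied = dataset_varied(collection, amount=250)  # 250 reviews per star level, 1250 in total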

unimore_bda_6/tokenization/__init__.py (new file, +15)
@@ -0,0 +1,15 @@
+from . import nltk_based
+from . import potts_based
+
+
+all_tokenizers = [
+    nltk_based.tokenizer,
+    potts_based.tokenizer,
+]
+
+
+__all__ = (
+    "nltk_based",
+    "potts_based",
+    "all_tokenizers",
+)

unimore_bda_6/tokenization/nltk_based.py (new file, +16)
@@ -0,0 +1,16 @@
+import nltk
+import nltk.sentiment.util
+
+
+def tokenizer(text: str) -> list[str]:
+    """
+    Convert a text string into a list of tokens.
+    """
+    tokens = nltk.word_tokenize(text)
+    nltk.sentiment.util.mark_negation(tokens, shallow=True)
+    return tokens
+
+
+__all__ = (
+    "tokenizer",
+)
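A quick look at what this tokenizer produces; the sentence is made up, NLTK's punkt data must be available, and the token list in the comment is approximate.

from unimore_bda_6.tokenization import nltk_based

tokens = nltk_based.tokenizer("I did not like this product. It broke immediately.")
# mark_negation(shallow=True) tags the tokens that follow a negation, up to the next
# punctuation, roughly: ['I', 'did', 'not', 'like_NEG', 'this_NEG', 'product_NEG',
#                        '.', 'It', 'broke', 'immediately', '.']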

@@ -142,47 +142,50 @@ amp = "&"
 
 ######################################################################
 
-class Tokenizer:
-    def __init__(self, preserve_case=False):
-        self.preserve_case = preserve_case
-
-    def tokenize(self, s: str) -> t.Iterable[str]:
-        """
-        Argument: s -- any string object
-        Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False
-        """
-        # Fix HTML character entitites:
-        s = self.__html2string(s)
-        # Tokenize:
-        words = word_re.findall(s)
-        # Possible alter the case, but avoid changing emoticons like :D into :d:
-        if not self.preserve_case:
-            words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
-        return words
-
-    def __html2string(self, s: str) -> str:
-        """
-        Internal metod that seeks to replace all the HTML entities in
-        s with their corresponding unicode characters.
-        """
-        # First the digits:
-        ents = set(html_entity_digit_re.findall(s))
-        if len(ents) > 0:
-            for ent in ents:
-                entnum = ent[2:-1]
-                try:
-                    entnum = int(entnum)
-                    s = s.replace(ent, chr(entnum))
-                except:
-                    pass
-        # Now the alpha versions:
-        ents = set(html_entity_alpha_re.findall(s))
-        ents = filter((lambda x : x != amp), ents)
-        for ent in ents:
-            entname = ent[1:-1]
-            try:
-                s = s.replace(ent, chr(html.entities.name2codepoint[entname]))
-            except:
-                pass
-        s = s.replace(amp, " and ")
-        return s
+def tokenizer(text: str) -> t.Iterable[str]:
+    """
+    Argument: s -- any string object
+    Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False
+    """
+    # Fix HTML character entitites:
+    s = __html2string(text)
+    # Tokenize:
+    words = word_re.findall(s)
+    # Possible alter the case, but avoid changing emoticons like :D into :d:
+    words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
+    # Return the results
+    return words
+
+
+def __html2string(html: str) -> str:
+    """
+    Internal metod that seeks to replace all the HTML entities in
+    s with their corresponding unicode characters.
+    """
+    # First the digits:
+    ents = set(html_entity_digit_re.findall(html))
+    if len(ents) > 0:
+        for ent in ents:
+            entnum = ent[2:-1]
+            try:
+                entnum = int(entnum)
+                html = html.replace(ent, chr(entnum))
+            except:
+                pass
+    # Now the alpha versions:
+    ents = set(html_entity_alpha_re.findall(html))
+    ents = filter((lambda x : x != amp), ents)
+    for ent in ents:
+        entname = ent[1:-1]
+        try:
+            html = html.replace(ent, chr(html.entities.name2codepoint[entname]))
+        except:
+            pass
+    html = html.replace(amp, " and ")
+    return html
+
+
+__all__ = (
+    "tokenizer",
+)
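For comparison with the class-based version it replaces, a sketch of calling the now module-level Potts tokenizer; the module path follows the vendor layout shown elsewhere in this commit, and the output in the comment is approximate.

from unimore_bda_6.vendor.potts import tokenizer

tokens = list(tokenizer("I LOVE this :D"))
# Case is lowered for ordinary words but preserved for emoticons,
# roughly: ['i', 'love', 'this', ':D']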

unimore_bda_6/vendor/__init__.py (vendored, -9)
@@ -1,9 +0,0 @@
-"""
-This module contains modules downloaded from the Internet and adapted for the project.
-
-Edits to the respective modules are released under the same license as the modules themselves.
-
-Currently:
-
-* the adaptation of :mod:`potts` is released under the CC BY-NC-SA 3.0 license.
-"""