Mirror of https://github.com/Steffo99/unimore-bda-6.git

Refactor things to work better

Steffo 2023-02-02 17:24:11 +01:00
parent 4c3f892038
commit 965cea692a
Signed by: steffo
GPG key ID: 2A24051445686895
12 changed files with 239 additions and 234 deletions

.editorconfig Normal file
View file

@@ -0,0 +1,12 @@
# EditorConfig is awesome: https://EditorConfig.org
# top-most EditorConfig file
root = true
[*]
indent_style = space
indent_size = 4
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

.vscode/launch.json vendored
View file

@@ -12,7 +12,7 @@
             "justMyCode": true,
             "env": {
                 "NLTK_DATA": "./data/nltk",
-                "DATA_SET_SIZE": "100",
+                "DATA_SET_SIZE": "250",
             },
             "cwd": "${workspaceFolder}",
         }
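For context: `DATA_SET_SIZE` reaches the program through the environment (set here for the debugger launch) and is consumed in the main module below via a `.__wrapped__` attribute. The configuration machinery itself is not part of this diff, so the snippet that follows is only a guess at the general shape of such a lazily-resolved value; `LazyConfigValue` and everything inside it are illustrative, not code from the repository.

import os


class LazyConfigValue:
    """Hypothetical wrapper that resolves an environment variable only when accessed."""

    def __init__(self, key: str, default: str):
        self.key = key
        self.default = default

    @property
    def __wrapped__(self) -> int:
        # Read and convert the value at access time, so the env var can be set late.
        return int(os.environ.get(self.key, self.default))


# Roughly how a value like the one used by main() might be declared.
DATA_SET_SIZE = LazyConfigValue("DATA_SET_SIZE", "250")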

View file

@@ -1,36 +1,98 @@
 import logging
 
 from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
-from .analysis.vanilla import VanillaReviewSA, polar_categorizer, stars_categorizer
-from .analysis.potts import PottsReviewSA
+from .database import Review, mongo_reviews_collection_from_config, dataset_polar, dataset_varied
+from .analysis.vanilla import VanillaSA
+from .tokenization import all_tokenizers
 from .log import install_log_handler
 
 log = logging.getLogger(__name__)
 
 
+def review_vanilla_extractor(review: Review) -> tuple[str, float]:
+    """
+    Extract review text and rating from a `Review`.
+    """
+    return review["reviewText"], review["overall"]
+
+
+def polar_categorizer(rating: float) -> str:
+    """
+    Return the polar label corresponding to the given rating.
+
+    Possible categories are:
+
+    * negative (1.0, 2.0)
+    * positive (3.0, 4.0, 5.0)
+    * unknown (everything else)
+    """
+    match rating:
+        case 1.0 | 2.0:
+            return "negative"
+        case 3.0 | 4.0 | 5.0:
+            return "positive"
+        case _:
+            return "unknown"
+
+
+def varied_categorizer(rating: float) -> str:
+    """
+    Return the "stars" label corresponding to the given rating.
+
+    Possible categories are:
+
+    * terrible (1.0)
+    * negative (2.0)
+    * mixed (3.0)
+    * positive (4.0)
+    * great (5.0)
+    * unknown (everything else)
+    """
+    match rating:
+        case 1.0:
+            return "terrible"
+        case 2.0:
+            return "negative"
+        case 3.0:
+            return "mixed"
+        case 4.0:
+            return "positive"
+        case 5.0:
+            return "great"
+        case _:
+            return "unknown"
+
+
 def main():
     with mongo_reviews_collection_from_config() as reviews:
-        reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-        reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-        reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-        reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_polar_training = dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_polar_evaluation = dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
 
-    vanilla_polar = VanillaReviewSA(categorizer=polar_categorizer)
-    vanilla_polar.train(reviews_polar_training)
-    log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
+    for tokenizer in all_tokenizers:
+        log.info("Training polar model with %s tokenizer", tokenizer)
+        model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=polar_categorizer)
+        model.train(reviews_polar_training)
+        log.info("Evaluating polar model with %s tokenizer", tokenizer)
+        evaluation = model.evaluate(reviews_polar_evaluation)
+        log.info("Polar model with %s results: %s", tokenizer, evaluation)
 
-    potts_polar = PottsReviewSA()
-    potts_polar.train(reviews_polar_training)
-    log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))
+    del reviews_polar_training
+    del reviews_polar_evaluation
 
-    vanilla_uniform = VanillaReviewSA(categorizer=stars_categorizer)
-    vanilla_uniform.train(reviews_uniform_training)
-    log.info("Vanilla uniform evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
+    with mongo_reviews_collection_from_config() as reviews:
+        reviews_varied_training = dataset_varied(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_varied_evaluation = dataset_varied(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
 
-    while True:
-        print(vanilla_uniform.use(input("> ")))
+    for tokenizer in all_tokenizers:
+        log.info("Training varied model with %s tokenizer", tokenizer)
+        model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=varied_categorizer)
+        model.train(reviews_varied_training)
+        log.info("Evaluating varied model with %s tokenizer", tokenizer)
+        evaluation = model.evaluate(reviews_varied_evaluation)
+        log.info("Varied model with %s results: %s", tokenizer, evaluation)
+
+    del reviews_varied_training
+    del reviews_varied_evaluation
 
 
 if __name__ == "__main__":
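A quick, illustrative check of the helpers introduced above (not part of the commit, and assuming the functions above are in scope): the extractor pulls the two fields the analyzers need out of a review document, and the categorizers map the rating to a label. The sample dict below only mimics those two fields.

# Hypothetical review document with the two fields read by review_vanilla_extractor.
sample_review = {"reviewText": "Great value, would buy again!", "overall": 5.0}

text, rating = review_vanilla_extractor(sample_review)
print(text)                        # "Great value, would buy again!"
print(polar_categorizer(rating))   # "positive"
print(varied_categorizer(rating))  # "great"
print(varied_categorizer(3.5))     # "unknown" - only whole-star floats are mapped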

View file

@@ -0,0 +1,6 @@
from .vanilla import BaseSA

__all__ = (
    "BaseSA",
)

View file

@@ -1,4 +1,9 @@
 import abc
+import typing as t
+
+
+Input = t.TypeVar("Input")
+Category = t.TypeVar("Category")
 
 
 class BaseSA(metaclass=abc.ABCMeta):
@@ -6,30 +11,15 @@ class BaseSA(metaclass=abc.ABCMeta):
     Abstract base class for sentiment analyzers implemented in this project.
     """
 
-    def __init__(self) -> None:
-        """
-        Create the empty shell of the sentiment analyzer.
-        """
-        self.trained = False
-        "If :meth:`train` has been called at least once, and the analyzer is ready or not to be evaluated or used."
-
     @abc.abstractmethod
-    def train(self, training_set) -> None:
+    def train(self, training_set: list[tuple[Input, Category]]) -> None:
         """
         Train the analyzer with the given training set.
         """
         raise NotImplementedError()
 
     @abc.abstractmethod
-    def evaluate(self, test_set) -> None:
-        """
-        Evaluate the analyzer with the given test set.
-        """
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def use(self, text: str) -> str:
+    def use(self, text: Input) -> Category:
         """
         Use the sentiment analyzer.
         """
@@ -48,6 +38,8 @@ class NotTrainedError(Exception):
 
 __all__ = (
+    "Input",
+    "Category",
     "BaseSA",
     "AlreadyTrainedError",
    "NotTrainedError",

View file

@ -1,30 +0,0 @@
from ..vendor.potts import Tokenizer
from .vanilla import VanillaSA, VanillaReviewSA
class PottsSA(VanillaSA):
"""
A sentiment analyzer using Potts' tokenizer.
"""
def __init__(self) -> None:
super().__init__()
def _tokenize_text(self, text: str) -> list[str]:
"""
Convert a text string into a list of tokens, using the language of the model.
"""
tokenizer: Tokenizer = Tokenizer(preserve_case=False)
return list(tokenizer.tokenize(text))
class PottsReviewSA(VanillaReviewSA, PottsSA):
"""
A `PottsSA` to be used with `Review`s.
"""
__all__ = (
"PottsSA",
"PottsReviewSA",
)

View file

@@ -1,4 +1,3 @@
-import abc
 import nltk
 import nltk.classify
 import nltk.sentiment
@@ -6,31 +5,30 @@ import nltk.sentiment.util
 import logging
 import typing as t
 
-from ..database import Review
-from .base import BaseSA, AlreadyTrainedError, NotTrainedError
+from .base import Input, Category, BaseSA, AlreadyTrainedError, NotTrainedError
+
+
+TokenBag = list[str]
+IntermediateValue = t.TypeVar("IntermediateValue")
 
 
 log = logging.getLogger(__name__)
 
 
-class VanillaSA(BaseSA, metaclass=abc.ABCMeta):
+class VanillaSA(BaseSA):
     """
     A sentiment analyzer resembling the one implemented in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
     """
 
-    def __init__(self) -> None:
+    def __init__(self, *, extractor: t.Callable[[Input], tuple[str, Category]], tokenizer: t.Callable[[str], TokenBag], categorizer: t.Callable[[Input], Category]) -> None:
         super().__init__()
         self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
+        self.trained: bool = False
 
-    def _tokenize_text(self, text: str) -> list[str]:
-        """
-        Convert a text string into a list of tokens.
-        """
-        tokens = nltk.word_tokenize(text)
-        nltk.sentiment.util.mark_negation(tokens, shallow=True)
-        return tokens
+        self.extractor: t.Callable[[Input], tuple[str, IntermediateValue]] = extractor
+        self.tokenizer: t.Callable[[str], TokenBag] = tokenizer
+        self.categorizer: t.Callable[[IntermediateValue], Category] = categorizer
 
-    def __add_feature_unigrams(self, training_set: list[tuple[list[str], str]]) -> None:
+    def __add_feature_unigrams(self, training_set: list[tuple[TokenBag, Category]]) -> None:
         """
         Add the `nltk.sentiment.util.extract_unigram_feats` feature to the model.
         """
@@ -38,116 +36,64 @@ class VanillaSA(BaseSA, metaclass=abc.ABCMeta):
         unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
         self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
 
-    def _featurize_documents(self, documents: list[tuple[list[str], str]]):
+    def _add_features(self, training_set: list[tuple[TokenBag, Category]]):
         """
-        Apply features to a document.
+        Add new features to the sentiment analyzer.
         """
-        return self.model.apply_features(documents, labeled=True)
+        self.__add_feature_unigrams(training_set)
 
-    def _train_with_set(self, training_set: list[tuple[list[str], str]]) -> None:
+    def _train_from_dataset(self, dataset: list[tuple[TokenBag, Category]]) -> None:
         """
-        Train the model with the given **pre-classified but not pre-tokenized** training set.
+        Train the model with the given training set.
        """
         if self.trained:
             raise AlreadyTrainedError()
 
-        self.__add_feature_unigrams(training_set)
-        training_set_with_features = self._featurize_documents(training_set)
+        self.__add_feature_unigrams(dataset)
+        training_set_with_features = self.model.apply_features(dataset, labeled=True)
         self.model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set_with_features)
         self.trained = True
 
-    def _evaluate_with_set(self, test_set: list[tuple[list[str], str]]) -> dict:
-        """
-        Perform a model evaluation with the given test set.
-        """
+    def _evaluate_from_dataset(self, dataset: list[tuple[TokenBag, Category]]) -> dict:
         if not self.trained:
             raise NotTrainedError()
 
-        test_set_with_features = self._featurize_documents(test_set)
+        test_set_with_features = self.model.apply_features(dataset, labeled=True)
         return self.model.evaluate(test_set_with_features)
 
-    def _use_with_tokens(self, tokens: list[str]) -> str:
-        """
-        Categorize the given token bag.
-        """
+    def _use_from_tokenbag(self, tokens: TokenBag) -> Category:
         if not self.trained:
             raise NotTrainedError()
 
         return self.model.classify(instance=tokens)
 
+    def _extract_data(self, inp: Input) -> tuple[TokenBag, Category]:
+        text, value = self.extractor(inp)
+        return self.tokenizer(text), self.categorizer(value)
 
-class VanillaReviewSA(VanillaSA):
-    """
-    A `VanillaSA` to be used with `Review`s.
-    """
+    def _extract_dataset(self, inp: list[Input]) -> list[tuple[TokenBag, Category]]:
+        return list(map(self._extract_data, inp))
 
-    def __init__(self, categorizer: t.Callable[[Review], str]) -> None:
-        super().__init__()
-        self.categorizer: t.Callable[[Review], str] = categorizer
+    def train(self, training_set: list[Input]) -> None:
+        dataset = self._extract_dataset(training_set)
+        self._train_from_dataset(dataset)
 
-    def _review_to_data_set(self, review: Review) -> tuple[list[str], str]:
-        """
-        Convert a review to a NLTK-compatible dataset.
-        """
-        return self._tokenize_text(text=review["reviewText"]), self.categorizer(rating=review["overall"])
+    def evaluate(self, test_set: list[tuple[Input, Category]]) -> None:
+        dataset = self._extract_dataset(test_set)
+        return self._evaluate_from_dataset(dataset)
 
-    def train(self, reviews: t.Iterable[Review]) -> None:
-        data_set = list(map(self._review_to_data_set, reviews))
-        self._train_with_set(data_set)
-
-    def evaluate(self, reviews: t.Iterable[Review]):
-        data_set = list(map(self._review_to_data_set, reviews))
-        return self._evaluate_with_set(data_set)
-
-    def use(self, text: str) -> str:
-        return self._use_with_tokens(self._tokenize_text(text))
-
-
-def polar_categorizer(rating: float) -> str:
-    """
-    Return the polar label corresponding to the given rating.
-
-    Possible categories are:
-
-    * negative (1.0, 2.0)
-    * positive (3.0, 4.0, 5.0)
-    * unknown (everything else)
-    """
-    match rating:
-        case 1.0 | 2.0:
-            return "negative"
-        case 3.0 | 4.0 | 5.0:
-            return "positive"
-        case _:
-            return "unknown"
-
-
-def stars_categorizer(rating: float) -> str:
-    """
-    Return the "stars" label corresponding to the given rating.
-
-    Possible categories are:
-
-    * terrible (1.0)
-    * negative (2.0)
-    * mixed (3.0)
-    * positive (4.0)
-    * great (5.0)
-    * unknown (everything else)
-    """
-    match rating:
-        case 1.0:
-            return "terrible"
-        case 2.0:
-            return "negative"
-        case 3.0:
-            return "mixed"
-        case 4.0:
-            return "positive"
-        case 5.0:
-            return "great"
-        case _:
-            return "unknown"
+    def use(self, text: Input) -> Category:
+        tokens = self.tokenizer(text)
+        return self._use_from_tokenbag(tokens)
 
 
 __all__ = (
     "VanillaSA",
-    "VanillaReviewSA",
-    "polar_categorizer",
-    "stars_categorizer",
 )
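After the refactor, `VanillaSA` is assembled from three callables and never touches review-specific fields itself. The sketch below shows the wiring only; the lambdas and the toy data are placeholders, not code or data from the repository, and a real run would use `review_vanilla_extractor`, one of the tokenizers from `.tokenization`, and one of the categorizers from the main module.

# Illustrative wiring of the refactored VanillaSA.
training_reviews = [
    {"reviewText": "Great value, works perfectly", "overall": 5.0},
    {"reviewText": "Broke after two days, awful", "overall": 1.0},
]

model = VanillaSA(
    extractor=lambda review: (review["reviewText"], review["overall"]),  # input -> (text, rating)
    tokenizer=lambda text: text.lower().split(),                         # stand-in for the NLTK/Potts tokenizers
    categorizer=lambda rating: "positive" if rating >= 3.0 else "negative",
)
model.train(training_reviews)            # extract -> tokenize -> categorize -> fit NaiveBayes
print(model.use("did not work at all"))  # classifies a raw string through the same tokenizer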

View file

@@ -80,11 +80,11 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
     ])
 
 
-def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
+def dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
     """
-    Get a list of shuffled 1-star and 5-star reviews.
+    Get a list of the same amount of 1-star and 5-star reviews.
     """
-    log.info("Building dataset with %d polar reviews...", amount * 2)
+    log.info("Building polar dataset with %d reviews...", amount * 2)
 
     # Sample the required reviews
     positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
@@ -93,18 +93,14 @@ def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount:
     # Randomness here does not matter, so just merge the lists
     both = [*positive, *negative]
 
-    # Shuffle the dataset, just in case it affects the performance
-    # TODO: does it actually?
-    random.shuffle(both)
-
     return both
 
 
-def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
+def dataset_varied(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
     """
-    Get a list of shuffled reviews of any rating.
+    Get a list of the same amount of reviews for each rating.
     """
-    log.info("Building dataset with %d uniform reviews...", amount * 5)
+    log.info("Building varied dataset with %d reviews...", amount * 5)
 
     # Sample the required reviews
     terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
@@ -116,10 +112,6 @@ def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amoun
     # Randomness here does not matter, so just merge the lists
     full = [*terrible, *negative, *mixed, *positive, *great]
 
-    # Shuffle the dataset, just in case it affects the performance
-    # TODO: does it actually?
-    random.shuffle(full)
-
     return full
@@ -129,5 +121,5 @@ __all__ = (
     "mongo_reviews_collection_from_config",
     "sample_reviews",
     "sample_reviews_by_rating",
-    "get_reviews_dataset_polar",
+    "dataset_polar",
 )
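Both dataset builders lean on `sample_reviews_by_rating`, whose body lies outside this hunk; only its signature and the closing `])` of what looks like an aggregation pipeline are visible. A hedged guess at that shape, using MongoDB's `$match` and `$sample` stages (the function name is suffixed to make clear it is not the actual implementation):

import pymongo.collection


def sample_reviews_by_rating_sketch(reviews: pymongo.collection.Collection, rating: float, amount: int):
    # Assumed shape only: keep reviews with the requested "overall" rating, then sample at random.
    return reviews.aggregate([
        {"$match": {"overall": rating}},
        {"$sample": {"size": amount}},
    ])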

View file

@@ -0,0 +1,15 @@
from . import nltk_based
from . import potts_based


all_tokenizers = [
    nltk_based.tokenizer,
    potts_based.tokenizer,
]


__all__ = (
    "nltk_based",
    "potts_based",
    "all_tokenizers",
)
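`all_tokenizers` exists so the main module can train one model per tokenizer; it also makes a quick side-by-side comparison easy. A small illustrative check (assuming the package above is importable and the NLTK data is available):

# Compare the two tokenizers on the same sentence; list() normalizes the Potts map object.
sentence = "I didn't like it :("
for tok in all_tokenizers:
    print(tok.__module__, list(tok(sentence)))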

View file

@@ -0,0 +1,16 @@
import nltk
import nltk.sentiment.util


def tokenizer(text: str) -> list[str]:
    """
    Convert a text string into a list of tokens.
    """
    tokens = nltk.word_tokenize(text)
    nltk.sentiment.util.mark_negation(tokens, shallow=True)
    return tokens


__all__ = (
    "tokenizer",
)
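For reference, `nltk.word_tokenize` needs the punkt tokenizer data (hence the `NLTK_DATA` path set in launch.json), and `mark_negation` appends a `_NEG` suffix to tokens following a negation until the next punctuation. The output shown in the comment is indicative, not an exact transcript:

tokens = tokenizer("I did not like this product at all.")
print(tokens)
# Indicatively: ['I', 'did', 'not', 'like_NEG', 'this_NEG', 'product_NEG', 'at_NEG', 'all_NEG', '.']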

View file

@@ -74,7 +74,7 @@ emoticon_string = r"""
       [<>]?
       [:;=8] # eyes
       [\-o\*\']? # optional nose
       [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
       |
       [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
       [\-o\*\']? # optional nose
@@ -90,20 +90,20 @@ regex_strings = (
     (?: # (international)
       \+?[01]
       [\-\s.]*
     )?
     (?: # (area code)
       [\(]?
       \d{3}
       [\-\s.\)]*
     )?
     \d{3} # exchange
     [\-\s.]*
     \d{4} # base
     )"""
     ,
     # Emoticons:
     emoticon_string
     ,
     # HTML tags:
     r"""<[^>]+>"""
     ,
@@ -121,7 +121,7 @@ regex_strings = (
     |
     (?:[\w_]+) # Words without apostrophes or dashes.
     |
     (?:\.(?:\s*\.){1,}) # Ellipsis dots.
     |
     (?:\S) # Everything else that isn't whitespace.
     """
@@ -129,7 +129,7 @@ regex_strings = (
 ######################################################################
 # This is the core tokenizing regex:
 
 word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
 
 # The emoticon string gets its own regex so that we can preserve case for them as needed:
@@ -142,47 +142,50 @@ amp = "&amp;"
 ######################################################################
 
-class Tokenizer:
-    def __init__(self, preserve_case=False):
-        self.preserve_case = preserve_case
-
-    def tokenize(self, s: str) -> t.Iterable[str]:
-        """
-        Argument: s -- any string object
-        Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False
-        """
-        # Fix HTML character entitites:
-        s = self.__html2string(s)
-        # Tokenize:
-        words = word_re.findall(s)
-        # Possible alter the case, but avoid changing emoticons like :D into :d:
-        if not self.preserve_case:
-            words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
-        return words
-
-    def __html2string(self, s: str) -> str:
-        """
-        Internal metod that seeks to replace all the HTML entities in
-        s with their corresponding unicode characters.
-        """
-        # First the digits:
-        ents = set(html_entity_digit_re.findall(s))
-        if len(ents) > 0:
-            for ent in ents:
-                entnum = ent[2:-1]
-                try:
-                    entnum = int(entnum)
-                    s = s.replace(ent, chr(entnum))
-                except:
-                    pass
-        # Now the alpha versions:
-        ents = set(html_entity_alpha_re.findall(s))
-        ents = filter((lambda x : x != amp), ents)
-        for ent in ents:
-            entname = ent[1:-1]
-            try:
-                s = s.replace(ent, chr(html.entities.name2codepoint[entname]))
-            except:
-                pass
-        s = s.replace(amp, " and ")
-        return s
+def tokenizer(text: str) -> t.Iterable[str]:
+    """
+    Argument: s -- any string object
+    Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False
+    """
+    # Fix HTML character entitites:
+    s = __html2string(text)
+    # Tokenize:
+    words = word_re.findall(s)
+    # Possible alter the case, but avoid changing emoticons like :D into :d:
+    words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
+    # Return the results
+    return words
+
+
+def __html2string(html: str) -> str:
+    """
+    Internal metod that seeks to replace all the HTML entities in
+    s with their corresponding unicode characters.
+    """
+    # First the digits:
+    ents = set(html_entity_digit_re.findall(html))
+    if len(ents) > 0:
+        for ent in ents:
+            entnum = ent[2:-1]
+            try:
+                entnum = int(entnum)
+                html = html.replace(ent, chr(entnum))
+            except:
+                pass
+    # Now the alpha versions:
+    ents = set(html_entity_alpha_re.findall(html))
+    ents = filter((lambda x : x != amp), ents)
+    for ent in ents:
+        entname = ent[1:-1]
+        try:
+            html = html.replace(ent, chr(html.entities.name2codepoint[entname]))
+        except:
+            pass
+    html = html.replace(amp, " and ")
+    return html
+
+
+__all__ = (
+    "tokenizer",
+)
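With the class gone, the Potts tokenizer is now a plain function returning an iterator of lowercased tokens, with emoticon case preserved and HTML entities decoded. A small illustrative call (the sample string is arbitrary, and the expected output is indicative):

print(list(tokenizer("I LOVED it :D &amp; would buy again!")))
# Indicatively: ['i', 'loved', 'it', ':D', 'and', 'would', 'buy', 'again', '!']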

View file

@@ -1,9 +0,0 @@
"""
This module contains modules downloaded from the Internet and adapted for the project.
Edits to the respective modules are released under the same license as the modules themselves.
Currently:
* the adaptation of :mod:`potts` is released under the CC BY-NC-SA 3.0 license.
"""