From ab5f12f8fc20a4ee99fcdbb7d3fcdf74231f441f Mon Sep 17 00:00:00 2001 From: Stefano Pigozzi Date: Thu, 2 Feb 2023 04:34:05 +0100 Subject: [PATCH] Implement basic Potts sentiment analyzer --- unimore_bda_6/__main__.py | 21 ++++++++++++--------- unimore_bda_6/analysis/potts.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 9 deletions(-) create mode 100644 unimore_bda_6/analysis/potts.py diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py index 73f03b9..2ac2ec3 100644 --- a/unimore_bda_6/__main__.py +++ b/unimore_bda_6/__main__.py @@ -1,23 +1,26 @@ +import logging + from .config import config from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews from .analysis.vanilla import VanillaReviewSA +from .analysis.potts import PottsReviewSA from .log import install_log_handler +log = logging.getLogger(__name__) + def main(): with mongo_reviews_collection_from_config() as reviews: training_reviews = get_training_reviews(collection=reviews) test_reviews = get_test_reviews(collection=reviews) - model = VanillaReviewSA() - model.train(training_reviews) - - evaluation = model.evaluate(test_reviews) - print(evaluation) - - while True: - classification = model.use(input()) - print(classification) + vanilla = VanillaReviewSA() + vanilla.train(training_reviews) + log.info("Vanilla evaluation results: %s", vanilla.evaluate(test_reviews)) + + potts = PottsReviewSA() + potts.train(training_reviews) + log.info("Potts evaluation results: %s", potts.evaluate(test_reviews)) if __name__ == "__main__": diff --git a/unimore_bda_6/analysis/potts.py b/unimore_bda_6/analysis/potts.py new file mode 100644 index 0000000..611e206 --- /dev/null +++ b/unimore_bda_6/analysis/potts.py @@ -0,0 +1,30 @@ +from ..vendor.potts import Tokenizer +from .vanilla import VanillaSA, VanillaReviewSA + + +class PottsSA(VanillaSA): + """ + A sentiment analyzer using Potts' tokenizer. + """ + + def __init__(self) -> None: + super().__init__() + + def _tokenize_text(self, text: str) -> list[str]: + """ + Convert a text string into a list of tokens, using the language of the model. + """ + tokenizer: Tokenizer = Tokenizer(preserve_case=False) + return list(tokenizer.tokenize(text)) + + +class PottsReviewSA(VanillaReviewSA, PottsSA): + """ + A `PottsSA` to be used with `Review`s. + """ + + +__all__ = ( + "PottsSA", + "PottsReviewSA", +) \ No newline at end of file