From ab5f12f8fc20a4ee99fcdbb7d3fcdf74231f441f Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi <me@steffo.eu>
Date: Thu, 2 Feb 2023 04:34:05 +0100
Subject: [PATCH] Implement basic Potts sentiment analyzer

---
 unimore_bda_6/__main__.py       | 21 ++++++++++++---------
 unimore_bda_6/analysis/potts.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 9 deletions(-)
 create mode 100644 unimore_bda_6/analysis/potts.py

diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index 73f03b9..2ac2ec3 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -1,23 +1,26 @@
+import logging
+
 from .config import config
 from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
 from .analysis.vanilla import VanillaReviewSA
+from .analysis.potts import PottsReviewSA
 from .log import install_log_handler
 
+log = logging.getLogger(__name__)
+
 
 def main():
     with mongo_reviews_collection_from_config() as reviews:
         training_reviews = get_training_reviews(collection=reviews)
         test_reviews = get_test_reviews(collection=reviews)
 
-    model = VanillaReviewSA()
-    model.train(training_reviews)
-    
-    evaluation = model.evaluate(test_reviews)
-    print(evaluation)
-    
-    while True:
-        classification = model.use(input())
-        print(classification)
+    vanilla = VanillaReviewSA()  # train and evaluate the vanilla analyzer first
+    vanilla.train(training_reviews)
+    log.info("Vanilla evaluation results: %s", vanilla.evaluate(test_reviews))
+
+    potts = PottsReviewSA()  # then the Potts variant, on the same review sets
+    potts.train(training_reviews)  # NOTE(review): 2nd pass; confirm training_reviews is re-iterable
+    log.info("Potts evaluation results: %s", potts.evaluate(test_reviews))
 
 
 if __name__ == "__main__":
diff --git a/unimore_bda_6/analysis/potts.py b/unimore_bda_6/analysis/potts.py
new file mode 100644
index 0000000..611e206
--- /dev/null
+++ b/unimore_bda_6/analysis/potts.py
@@ -0,0 +1,30 @@
+from ..vendor.potts import Tokenizer
+from .vanilla import VanillaSA, VanillaReviewSA
+
+
+class PottsSA(VanillaSA):
+    """
+    A `VanillaSA` that tokenizes text with the vendored Potts `Tokenizer`.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()  # no extra state; only defers to the parent initializer
+
+    def _tokenize_text(self, text: str) -> list[str]:
+        """
+        Convert a text string into a list of tokens with Potts' `Tokenizer` (`preserve_case=False`).
+        """
+        tokenizer: Tokenizer = Tokenizer(preserve_case=False)  # a new tokenizer is built on every call
+        return list(tokenizer.tokenize(text))
+
+
+class PottsReviewSA(VanillaReviewSA, PottsSA):
+    """
+    A `PottsSA` for `Review`s; assumes `VanillaReviewSA` does not override `_tokenize_text` (TODO confirm).
+    """
+
+
+__all__ = (  # public names of this module, i.e. what a star-import exposes
+    "PottsSA",
+    "PottsReviewSA",
+)
\ No newline at end of file