Refactor sentiment analysis to use pluggable extractor, tokenizer and categorizer functions

2024-11-21 15:34:18 +00:00 · 2023-02-02 17:24:11 +01:00 · 2023-02-02 17:24:11 +01:00 · 965cea692a
commit 965cea692a
parent 4c3f892038
12 changed files with 239 additions and 234 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -0,0 +1,12 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+indent_style = space
+indent_size = 4
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -12,7 +12,7 @@
            "justMyCode": true,
            "env": {
                "NLTK_DATA": "./data/nltk",
-                "DATA_SET_SIZE": "100",
+                "DATA_SET_SIZE": "250",
            },
            "cwd": "${workspaceFolder}",
        }
--- a/unimore_bda_6/main.py
+++ b/unimore_bda_6/main.py
@ -1,36 +1,98 @@
 import logging

 from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
-from .analysis.vanilla import VanillaReviewSA, polar_categorizer, stars_categorizer
-from .analysis.potts import PottsReviewSA
+from .database import Review, mongo_reviews_collection_from_config, dataset_polar, dataset_varied
+from .analysis.vanilla import VanillaSA
+from .tokenization import all_tokenizers
 from .log import install_log_handler

 log = logging.getLogger(__name__)


+def review_vanilla_extractor(review: Review) -> tuple[str, float]:
+    """
+    Pull out of a `Review` the two fields the analyzer works on: its text and its rating.
+    """
+    text = review["reviewText"]
+    rating = review["overall"]
+    return text, rating
+
+
+def polar_categorizer(rating: float) -> str:
+    """
+    Map a rating to a two-way sentiment label.
+
+    The mapping is:
+
+    * 1.0 or 2.0: negative
+    * 3.0, 4.0 or 5.0: positive
+    * any other value: unknown
+    """
+    if rating == 1.0 or rating == 2.0:
+        return "negative"
+    if rating == 3.0 or rating == 4.0 or rating == 5.0:
+        return "positive"
+    return "unknown"
+
+
+def varied_categorizer(rating: float) -> str:
+    """
+    Map a rating to one of five sentiment labels, one for each star.
+
+    The mapping is:
+
+    * 1.0: terrible
+    * 2.0: negative
+    * 3.0: mixed
+    * 4.0: positive
+    * 5.0: great
+    * any other value: unknown
+    """
+    labels_by_stars = (
+        (1.0, "terrible"),
+        (2.0, "negative"),
+        (3.0, "mixed"),
+        (4.0, "positive"),
+        (5.0, "great"),
+    )
+    for stars, label in labels_by_stars:
+        if rating == stars:
+            return label
+    return "unknown"
+
+
 def main():
+    """
+    Train and evaluate a `VanillaSA` model for each tokenizer in `all_tokenizers`, first on the polar dataset and then on the varied one, logging every evaluation result.
+    """
    with mongo_reviews_collection_from_config() as reviews:
-        reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-        reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-        reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-        reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        # The training and the evaluation sets come from two separate calls with the same amount.
+        reviews_polar_training = dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_polar_evaluation = dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)

-    vanilla_polar = VanillaReviewSA(categorizer=polar_categorizer)
-    vanilla_polar.train(reviews_polar_training)
-    log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
+    # A new model is created for each tokenizer, since a `VanillaSA` raises `AlreadyTrainedError` if trained twice.
+    for tokenizer in all_tokenizers:
+        log.info("Training polar model with %s tokenizer", tokenizer)
+        model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=polar_categorizer)
+        model.train(reviews_polar_training)
+        log.info("Evaluating polar model with %s tokenizer", tokenizer)
+        evaluation = model.evaluate(reviews_polar_evaluation)
+        log.info("Polar model with %s results: %s", tokenizer, evaluation)

-    potts_polar = PottsReviewSA()
-    potts_polar.train(reviews_polar_training)
-    log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))
+    # Drop the references to the polar datasets before the varied ones are loaded.
+    del reviews_polar_training
+    del reviews_polar_evaluation

-    vanilla_uniform = VanillaReviewSA(categorizer=stars_categorizer)
-    vanilla_uniform.train(reviews_uniform_training)
-    log.info("Vanilla uniform evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
+    with mongo_reviews_collection_from_config() as reviews:
+        reviews_varied_training = dataset_varied(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+        reviews_varied_evaluation = dataset_varied(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)

-    while True:
-        print(vanilla_uniform.use(input("> ")))
+    # Same procedure as above, but with the five-way categorizer.
+    for tokenizer in all_tokenizers:
+        log.info("Training varied model with %s tokenizer", tokenizer)
+        model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=varied_categorizer)
+        model.train(reviews_varied_training)
+        log.info("Evaluating varied model with %s tokenizer", tokenizer)
+        evaluation =  model.evaluate(reviews_varied_evaluation)
+        log.info("Varied model with %s results: %s", tokenizer, evaluation)

+    del reviews_varied_training
+    del reviews_varied_evaluation


 if __name__ == "__main__":
--- a/unimore_bda_6/analysis/init.py
+++ b/unimore_bda_6/analysis/init.py
@ -0,0 +1,6 @@
+# Import the abstract base class from the module that defines it, instead of going through `.vanilla`.
+from .base import BaseSA
+
+
+__all__ = (
+    "BaseSA",
+)
--- a/unimore_bda_6/analysis/base.py
+++ b/unimore_bda_6/analysis/base.py
@ -1,4 +1,9 @@
 import abc
+import typing as t
+
+
+Input = t.TypeVar("Input")
+Category = t.TypeVar("Category")


 class BaseSA(metaclass=abc.ABCMeta):
@ -6,30 +11,15 @@ class BaseSA(metaclass=abc.ABCMeta):
    Abstract base class for sentiment analyzers implemented in this project.
    """

-    def __init__(self) -> None:
-        """
-        Create the empty shell of the sentiment analyzer.
-        """
-
-        self.trained = False
-        "If :meth:`train` has been called at least once, and the analyzer is ready or not to be evaluated or used."
-
    @abc.abstractmethod
-    def train(self, training_set) -> None:
+    def train(self, training_set: list[tuple[Input, Category]]) -> None:
        """
        Train the analyzer with the given training set.
        """
        raise NotImplementedError()

    @abc.abstractmethod
-    def evaluate(self, test_set) -> None:
-        """
-        Evaluate the analyzer with the given test set.
-        """
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def use(self, text: str) -> str:
+    def use(self, text: Input) -> Category:
        """
        Use the sentiment analyzer.
        """
@ -48,6 +38,8 @@ class NotTrainedError(Exception):


 __all__ = (
+    "Input",
+    "Category",
    "BaseSA",
    "AlreadyTrainedError",
    "NotTrainedError",
--- a/unimore_bda_6/analysis/potts.py
+++ b/unimore_bda_6/analysis/potts.py
@ -1,30 +0,0 @@
-from ..vendor.potts import Tokenizer
-from .vanilla import VanillaSA, VanillaReviewSA
-
-
-class PottsSA(VanillaSA):
-    """
-    A sentiment analyzer using Potts' tokenizer.
-    """
-
-    def __init__(self) -> None:
-        super().__init__()
-
-    def _tokenize_text(self, text: str) -> list[str]:
-        """
-        Convert a text string into a list of tokens, using the language of the model.
-        """
-        tokenizer: Tokenizer = Tokenizer(preserve_case=False)
-        return list(tokenizer.tokenize(text))
-
-
-class PottsReviewSA(VanillaReviewSA, PottsSA):
-    """
-    A `PottsSA` to be used with `Review`s.
-    """
-
-
-__all__ = (
-    "PottsSA",
-    "PottsReviewSA",
-)
--- a/unimore_bda_6/analysis/vanilla.py
+++ b/unimore_bda_6/analysis/vanilla.py
@ -1,4 +1,3 @@
-import abc
 import nltk
 import nltk.classify
 import nltk.sentiment
@ -6,31 +5,30 @@ import nltk.sentiment.util
 import logging
 import typing as t

-from ..database import Review
-from .base import BaseSA, AlreadyTrainedError, NotTrainedError
+from .base import Input, Category, BaseSA, AlreadyTrainedError, NotTrainedError
+
+TokenBag = list[str]
+IntermediateValue = t.TypeVar("IntermediateValue")


 log = logging.getLogger(__name__)


-class VanillaSA(BaseSA, metaclass=abc.ABCMeta):
+class VanillaSA(BaseSA):
    """
    A sentiment analyzer resembling the one implemented in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
    """

-    def __init__(self) -> None:
+    def __init__(self, *, extractor: t.Callable[[Input], tuple[str, IntermediateValue]], tokenizer: t.Callable[[str], TokenBag], categorizer: t.Callable[[IntermediateValue], Category]) -> None:
+        """
+        Create an untrained analyzer.
+
+        :param extractor: Function taking an input item and returning its text together with an intermediate value.
+        :param tokenizer: Function splitting the extracted text into a token bag.
+        :param categorizer: Function turning the intermediate value returned by the extractor into a category.
+        """
        super().__init__()
        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
+        self.trained: bool = False

-    def _tokenize_text(self, text: str) -> list[str]:
-        """
-        Convert a text string into a list of tokens.
-        """
-        tokens = nltk.word_tokenize(text)
-        nltk.sentiment.util.mark_negation(tokens, shallow=True)
-        return tokens
+        self.extractor: t.Callable[[Input], tuple[str, IntermediateValue]] = extractor
+        self.tokenizer: t.Callable[[str], TokenBag] = tokenizer
+        self.categorizer: t.Callable[[IntermediateValue], Category] = categorizer

-    def __add_feature_unigrams(self, training_set: list[tuple[list[str], str]]) -> None:
+    def __add_feature_unigrams(self, training_set: list[tuple[TokenBag, Category]]) -> None:
        """
        Add the `nltk.sentiment.util.extract_unigram_feats` feature to the model.
        """
@ -38,116 +36,64 @@ class VanillaSA(BaseSA, metaclass=abc.ABCMeta):
        unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
        self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)

-    def _featurize_documents(self, documents: list[tuple[list[str], str]]):
+    def _add_features(self, training_set: list[tuple[TokenBag, Category]]) -> None:
        """
-        Apply features to a document.
+        Add new features to the sentiment analyzer.
+
+        Currently the only feature added is the unigram one, through `__add_feature_unigrams`.
        """
-        return self.model.apply_features(documents, labeled=True)
+        self.__add_feature_unigrams(training_set)

-    def _train_with_set(self, training_set: list[tuple[list[str], str]]) -> None:
+    def _train_from_dataset(self, dataset: list[tuple[TokenBag, Category]]) -> None:
        """
-        Train the model with the given **pre-classified but not pre-tokenized** training set.
+        Train the model with the given training set.
+
+        :param dataset: Token bags, each paired with its category.
+        :raises AlreadyTrainedError: If the model has already been trained.
        """
        if self.trained:
            raise AlreadyTrainedError()

-        self.__add_feature_unigrams(training_set)
-        training_set_with_features = self._featurize_documents(training_set)
+        # Go through the `_add_features` hook, so that every feature it registers is used by the training.
+        self._add_features(dataset)
+        training_set_with_features = self.model.apply_features(dataset, labeled=True)

        self.model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set_with_features)
        self.trained = True

-    def _evaluate_with_set(self, test_set: list[tuple[list[str], str]]) -> dict:
+    def _evaluate_from_dataset(self, dataset: list[tuple[TokenBag, Category]]) -> dict:
+        """
+        Perform a model evaluation with the given test set.
+
+        :param dataset: Token bags, each paired with its expected category.
+        :returns: The value returned by the `evaluate` method of the underlying NLTK model.
+        :raises NotTrainedError: If the model has not been trained yet.
+        """
        if not self.trained:
            raise NotTrainedError()
-        
-        test_set_with_features = self._featurize_documents(test_set)
+
+        test_set_with_features = self.model.apply_features(dataset, labeled=True)
        return self.model.evaluate(test_set_with_features)

-    def _use_with_tokens(self, tokens: list[str]) -> str:
+    def _use_from_tokenbag(self, tokens: TokenBag) -> Category:
+        """
+        Categorize the given token bag.
+
+        :param tokens: The token bag to classify.
+        :returns: The value returned by the `classify` method of the underlying NLTK model.
+        :raises NotTrainedError: If the model has not been trained yet.
+        """
        if not self.trained:
            raise NotTrainedError()
-        
+
        return self.model.classify(instance=tokens)

+    def _extract_data(self, inp: Input) -> tuple[TokenBag, Category]:
+        """
+        Convert a single input item into a (token bag, category) pair, using the extractor, the tokenizer and the categorizer.
+        """
+        text, value = self.extractor(inp)
+        tokens = self.tokenizer(text)
+        category = self.categorizer(value)
+        return tokens, category

-class VanillaReviewSA(VanillaSA):
-    """
-    A `VanillaSA` to be used with `Review`s.
-    """
+    def _extract_dataset(self, inp: list[Input]) -> list[tuple[TokenBag, Category]]:
+        """
+        Convert every input item into a (token bag, category) pair.
+        """
+        return [self._extract_data(item) for item in inp]

-    def __init__(self, categorizer: t.Callable[[Review], str]) -> None:
-        super().__init__()
-        self.categorizer: t.Callable[[Review], str] = categorizer
+    def train(self, training_set: list[Input]) -> None:
+        """
+        Extract, tokenize and categorize the given items, then train the model with the result.
+        """
+        self._train_from_dataset(self._extract_dataset(training_set))

-    def _review_to_data_set(self, review: Review) -> tuple[list[str], str]:
-        """
-        Convert a review to a NLTK-compatible dataset.
-        """
-        return self._tokenize_text(text=review["reviewText"]), self.categorizer(rating=review["overall"])
-        
-    def train(self, reviews: t.Iterable[Review]) -> None:
-        data_set = list(map(self._review_to_data_set, reviews))
-        self._train_with_set(data_set)
+    def evaluate(self, test_set: list[Input]) -> dict:
+        """
+        Extract, tokenize and categorize the given items, then evaluate the model against the result.
+
+        :param test_set: The items to evaluate the model with; they are processed in the same way as the ones passed to `train`.
+        :returns: The evaluation results, as returned by `_evaluate_from_dataset`.
+        """
+        dataset = self._extract_dataset(test_set)
+        return self._evaluate_from_dataset(dataset)

-    def evaluate(self, reviews: t.Iterable[Review]):
-        data_set = list(map(self._review_to_data_set, reviews))
-        return self._evaluate_with_set(data_set)
-
-    def use(self, text: str) -> str:
-        return self._use_with_tokens(self._tokenize_text(text))
-
-
-def polar_categorizer(rating: float) -> str:
-    """
-    Return the polar label corresponding to the given rating.
-
-    Possible categories are:
-    
-    * negative (1.0, 2.0)
-    * positive (3.0, 4.0, 5.0)
-    * unknown (everything else)
-    """
-    match rating:
-        case 1.0 | 2.0:
-            return "negative"
-        case 3.0 | 4.0 | 5.0:
-            return "positive"
-        case _:
-            return "unknown"
-
-
-def stars_categorizer(rating: float) -> str:
-    """
-    Return the "stars" label corresponding to the given rating.
-
-    Possible categories are:
-    
-    * terrible (1.0)
-    * negative (2.0)
-    * mixed (3.0)
-    * positive (4.0)
-    * great (5.0)
-    * unknown (everything else)
-    """
-    match rating:
-        case 1.0:
-            return "terrible"
-        case 2.0:
-            return "negative"
-        case 3.0:
-            return "mixed"
-        case 4.0:
-            return "positive"
-        case 5.0:
-            return "great"
-        case _:
-            return "unknown"
+    def use(self, text: Input) -> Category:
+        """
+        Categorize the given text.
+
+        The text is passed directly to the tokenizer, without going through the extractor.
+        """
+        return self._use_from_tokenbag(self.tokenizer(text))


 __all__ = (
    "VanillaSA",
-    "VanillaReviewSA",
-    "polar_categorizer",
-    "stars_categorizer",
 )
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@ -80,11 +80,11 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
    ])


-def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
+def dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
-    Get a list of shuffled 1-star and 5-star reviews.
+    Get a list of the same amount of 1-star and 5-star reviews.
    """
-    log.info("Building dataset with %d polar reviews...", amount * 2)
+    log.info("Building polar dataset with %d reviews...", amount * 2)

    # Sample the required reviews
    positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
@ -93,18 +93,14 @@ def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount:
    # Randomness here does not matter, so just merge the lists
    both = [*positive, *negative]

-    # Shuffle the dataset, just in case it affects the performance
-    # TODO: does it actually?
-    random.shuffle(both)
-
    return both


-def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
+def dataset_varied(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
-    Get a list of shuffled reviews of any rating.
+    Get a list of the same amount of reviews for each rating.
    """
-    log.info("Building dataset with %d uniform reviews...", amount * 5)
+    log.info("Building varied dataset with %d reviews...", amount * 5)

    # Sample the required reviews
    terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
@ -116,10 +112,6 @@ def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amoun
    # Randomness here does not matter, so just merge the lists
    full = [*terrible, *negative, *mixed, *positive, *great]

-    # Shuffle the dataset, just in case it affects the performance
-    # TODO: does it actually?
-    random.shuffle(full)
-
    return full


@ -129,5 +121,5 @@ __all__ = (
    "mongo_reviews_collection_from_config",
    "sample_reviews",
    "sample_reviews_by_rating",
-    "get_reviews_dataset_polar",
+    "dataset_polar",
 )
--- a/unimore_bda_6/tokenization/init.py
+++ b/unimore_bda_6/tokenization/init.py
@ -0,0 +1,15 @@
+from . import nltk_based
+from . import potts_based
+
+
+# The tokenizer functions provided by the modules of this package.
+all_tokenizers = [
+    nltk_based.tokenizer,
+    potts_based.tokenizer,
+]
+
+
+__all__ = (
+    "nltk_based",
+    "potts_based",
+    "all_tokenizers",
+)
--- a/unimore_bda_6/tokenization/nltk_based.py
+++ b/unimore_bda_6/tokenization/nltk_based.py
@ -0,0 +1,16 @@
+import nltk
+import nltk.sentiment.util
+
+
+def tokenizer(text: str) -> list[str]:
+    """
+    Split a text string into words using NLTK, then apply NLTK's negation marking to them.
+    """
+    words = nltk.word_tokenize(text)
+    # With `shallow=True` the list is modified in place, so the return value is not needed.
+    nltk.sentiment.util.mark_negation(words, shallow=True)
+    return words
+
+
+__all__ = (
+    "tokenizer",
+)
--- a/unimore_bda_6/tokenization/potts_based.py
+++ b/unimore_bda_6/tokenization/potts_based.py
@ -74,7 +74,7 @@ emoticon_string = r"""
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
-      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth      
+      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
@ -90,20 +90,20 @@ regex_strings = (
      (?:            # (international)
        \+?[01]
        [\-\s.]*
-      )?            
+      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
-      )?    
+      )?
      \d{3}          # exchange
-      [\-\s.]*   
+      [\-\s.]*
      \d{4}          # base
    )"""
    ,
    # Emoticons:
    emoticon_string
-    ,    
+    ,
    # HTML tags:
     r"""<[^>]+>"""
    ,
@ -121,7 +121,7 @@ regex_strings = (
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
-    (?:\.(?:\s*\.){1,})            # Ellipsis dots. 
+    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """
@ -129,7 +129,7 @@ regex_strings = (

 ######################################################################
 # This is the core tokenizing regex:
-    
+
 word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)

 # The emoticon string gets its own regex so that we can preserve case for them as needed:
@ -142,47 +142,50 @@ amp = "&amp;"

 ######################################################################

-class Tokenizer:
-    def __init__(self, preserve_case=False):
-        self.preserve_case = preserve_case

-    def tokenize(self, s: str) -> t.Iterable[str]:
-        """
-        Argument: s -- any string object
-        Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False
-        """
-        # Fix HTML character entitites:
-        s = self.__html2string(s)
-        # Tokenize:
-        words = word_re.findall(s)
-        # Possible alter the case, but avoid changing emoticons like :D into :d:
-        if not self.preserve_case:            
-            words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
-        return words
+def tokenizer(text: str) -> list[str]:
+    """
+    Convert a text string into a list of tokens, using Potts' tokenizer.
+
+    HTML entities are decoded first; then all tokens are lowercased, except emoticons, so that `:D` does not become `:d`.
+
+    :param text: The string to tokenize.
+    :returns: The tokens, as a list, so that they can be iterated more than once.
+    """
+    # Fix HTML character entities:
+    s = __html2string(text)
+    # Tokenize:
+    words = word_re.findall(s)
+    # Alter the case, but avoid changing emoticons like :D into :d.
+    # A list is built instead of a lazy `map`, which would be empty after its first iteration:
+    return [x if emoticon_re.search(x) else x.lower() for x in words]

-    def __html2string(self, s: str) -> str:
-        """
-        Internal metod that seeks to replace all the HTML entities in
-        s with their corresponding unicode characters.
-        """
-        # First the digits:
-        ents = set(html_entity_digit_re.findall(s))
-        if len(ents) > 0:
-            for ent in ents:
-                entnum = ent[2:-1]
-                try:
-                    entnum = int(entnum)
-                    s = s.replace(ent, chr(entnum))	
-                except:
-                    pass
-        # Now the alpha versions:
-        ents = set(html_entity_alpha_re.findall(s))
-        ents = filter((lambda x : x != amp), ents)
+
+def __html2string(html: str) -> str:
+    """
+    Internal function that seeks to replace all the HTML entities in the given string with their corresponding unicode characters.
+
+    Entities that cannot be converted are left untouched; `&amp;` is replaced with the word " and ".
+
+    :param html: The string possibly containing HTML entities.
+    :returns: The string with the recognized entities replaced.
+    """
+    # The `html` parameter shadows the `html` module inside this function, so the entity table is imported under its own name:
+    from html.entities import name2codepoint
+
+    # First the digits:
+    for ent in set(html_entity_digit_re.findall(html)):
+        try:
+            html = html.replace(ent, chr(int(ent[2:-1])))
+        except (ValueError, OverflowError):
+            # Not a number, or not a valid code point: keep the entity as it is
+            pass
+    # Now the alpha versions:
+    for ent in set(html_entity_alpha_re.findall(html)):
+        if ent == amp:
+            continue
+        try:
+            html = html.replace(ent, chr(name2codepoint[ent[1:-1]]))
+        except KeyError:
+            # Unknown entity name: keep the entity as it is
+            pass
+    # `&amp;` was skipped above because it becomes a word rather than a symbol; replace it even when no other entity is present:
+    html = html.replace(amp, " and ")
+    return html
+
+
+__all__ = (
+    "tokenizer",
+)
--- a/unimore_bda_6/vendor/init.py
+++ b/unimore_bda_6/vendor/init.py
@ -1,9 +0,0 @@
-"""
-This module contains modules downloaded from the Internet and adapted for the project.
-
-Edits to the respective modules are released under the same license as the modules themselves.
-
-Currently:
-
-* the adaptation of :mod:`potts` is released under the CC BY-NC-SA 3.0 license.
-"""