New version working nicely

2024-11-22 07:54:19 +00:00 · 2023-02-03 23:27:44 +01:00 · 2023-02-03 23:27:44 +01:00 · 6ef81c1c19
commit 6ef81c1c19
parent 379cbdd13a
15 changed files with 371 additions and 316 deletions
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@ -1,12 +1,44 @@
 <component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
+    <inspection_tool class="HttpUrlsUsage" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredUrls">
+        <list>
+          <option value="http://localhost" />
+          <option value="http://127.0.0.1" />
+          <option value="http://0.0.0.0" />
+          <option value="http://www.w3.org/" />
+          <option value="http://json-schema.org/draft" />
+          <option value="http://java.sun.com/" />
+          <option value="http://xmlns.jcp.org/" />
+          <option value="http://javafx.com/javafx/" />
+          <option value="http://javafx.com/fxml" />
+          <option value="http://maven.apache.org/xsd/" />
+          <option value="http://maven.apache.org/POM/" />
+          <option value="http://www.springframework.org/schema/" />
+          <option value="http://www.springframework.org/tags" />
+          <option value="http://www.springframework.org/security/tags" />
+          <option value="http://www.thymeleaf.org" />
+          <option value="http://www.jboss.org/j2ee/schema/" />
+          <option value="http://www.jboss.com/xml/ns/" />
+          <option value="http://www.ibm.com/webservices/xsd" />
+          <option value="http://activemq.apache.org/schema/" />
+          <option value="http://schema.cloudfoundry.org/spring/" />
+          <option value="http://schemas.xmlsoap.org/" />
+          <option value="http://cxf.apache.org/schemas/" />
+          <option value="http://primefaces.org/ui" />
+          <option value="http://tiles.apache.org/" />
+          <option value="http://sentiment.christopherpotts.net" />
+        </list>
+      </option>
+    </inspection_tool>
    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="E124" />
          <option value="E501" />
          <option value="E221" />
+          <option value="E203" />
        </list>
      </option>
    </inspection_tool>
--- a/README.md
+++ b/README.md
@ -76,11 +76,11 @@ $ mongosh < ./data/scripts/index-db.js

 <!-- TODO -->

-## `base`: Costruzione dell'impalcatura necessaria al confronto
+## `.analysis.base`: Costruzione dell'impalcatura necessaria al confronto

 <!-- TODO -->

-## `vanilla`: Ricostruzione e ottimizzazione del modello basato su `nltk.sentiment` realizzato a lezione
+## `.analysis.nltk_sentiment`: Ricostruzione e ottimizzazione del modello basato su `nltk.sentiment` realizzato a lezione

 Per avere un modello baseline con cui effettuare un confronto, si è ricostruito un modello basato su `nltk.sentiment` ispirato a quello realizzato a lezione.

--- a/unimore_bda_6/main.py
+++ b/unimore_bda_6/main.py
@ -1,91 +1,38 @@
 import logging

 from .config import config, DATA_SET_SIZE
-from .database import Review, mongo_reviews_collection_from_config, dataset_polar, dataset_varied
-from .analysis.vanilla import VanillaSA
-from .tokenization import all_tokenizers
+from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
+from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
+from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
 from .log import install_log_handler

 log = logging.getLogger(__name__)


-def review_vanilla_extractor(review: Review) -> tuple[str, float]:
-    """
-    Extract review text and rating from a `Review`.
-    """
-    return review["reviewText"], review["overall"]
-
-
-def polar_categorizer(rating: float) -> str:
-    """
-    Return the polar label corresponding to the given rating.
-
-    Possible categories are:
-
-    * negative (1.0, 2.0)
-    * positive (3.0, 4.0, 5.0)
-    * unknown (everything else)
-    """
-    match rating:
-        case 1.0 | 2.0:
-            return "negative"
-        case 3.0 | 4.0 | 5.0:
-            return "positive"
-        case _:
-            return "unknown"
-
-
-def varied_categorizer(rating: float) -> str:
-    """
-    Return the "stars" label corresponding to the given rating.
-
-    Possible categories are:
-
-    * terrible (1.0)
-    * negative (2.0)
-    * mixed (3.0)
-    * positive (4.0)
-    * great (5.0)
-    * unknown (everything else)
-    """
-    match rating:
-        case 1.0:
-            return "terrible"
-        case 2.0:
-            return "negative"
-        case 3.0:
-            return "mixed"
-        case 4.0:
-            return "positive"
-        case 5.0:
-            return "great"
-        case _:
-            return "unknown"
-
-
 def main():
-    for dataset_func, categorizer in [
-        (dataset_polar, polar_categorizer),
-        (dataset_varied, varied_categorizer),
-    ]:
-        for tokenizer in all_tokenizers:
+    for dataset_func in [polar_dataset, varied_dataset]:
+        for SentimentAnalyzer in [NLTKSentimentAnalyzer]:
+            for Tokenizer in [NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation]:
+                tokenizer = Tokenizer()
+                model = SentimentAnalyzer(tokenizer=tokenizer)
+
                with mongo_reviews_collection_from_config() as reviews:
                    reviews_training = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
                    reviews_evaluation = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)

-                model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=categorizer)
                    log.info("Training model %s", model)
                    model.train(reviews_training)
                    log.info("Evaluating model %s", model)
-                evaluation = model.evaluate(reviews_evaluation)
-                log.info("Results of model %s: %s", tokenizer, evaluation)
+                    correct, evaluated = model.evaluate(reviews_evaluation)
+                    log.info("%d evaluated, %d correct, %0.2f %% accuracy", evaluated, correct, correct / evaluated * 100 if evaluated else 0.0)  # Avoid ZeroDivisionError on an empty evaluation set

-            try:
-                print("Model %s" % model)
-                while inp := input():
-                    print(model.use(inp))
-            except KeyboardInterrupt:
-                pass
+                # try:
+                #     print("Manual testing for %s" % model)
+                #     print("Input an empty string to continue to the next model.")
+                #     while inp := input():
+                #         print(model.use(inp))
+                # except KeyboardInterrupt:
+                #     pass


 if __name__ == "__main__":
--- a/unimore_bda_6/analysis/init.py
+++ b/unimore_bda_6/analysis/init.py
@ -1,6 +1,8 @@
-from .vanilla import BaseSA
+from .base import BaseSentimentAnalyzer
+from .nltk_sentiment import NLTKSentimentAnalyzer


 __all__ = (
-    "BaseSA",
+    "BaseSentimentAnalyzer",
+    "NLTKSentimentAnalyzer",
 )
--- a/unimore_bda_6/analysis/base.py
+++ b/unimore_bda_6/analysis/base.py
@ -1,47 +1,49 @@
 import abc
-import typing as t
+import logging
+
+from ..database import DataSet, Text, Category
+
+log = logging.getLogger(__name__)


-Input = t.TypeVar("Input")
-Category = t.TypeVar("Category")
-
-
-class BaseSA(metaclass=abc.ABCMeta):
+class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
    """
    Abstract base class for sentiment analyzers implemented in this project.
    """

    @abc.abstractmethod
-    def train(self, training_set: t.Iterable[tuple[Input, Category]]) -> None:
+    def train(self, training_set: DataSet) -> None:
        """
-        Train the analyzer with the given training set.
+        Train the analyzer with the given training dataset.
        """
        raise NotImplementedError()

+    def evaluate(self, test_set: DataSet) -> tuple[int, int]:
+        """
+        Evaluate the model by calling `.use` on every text of the test dataset and comparing the resulting category with the expected one.
+
+        Returns a `(correct, evaluated)` tuple: the number of matching results and the number of evaluated results.
+        """
+        evaluated: int = 0
+        correct: int   = 0
+
+        for text, expected_category in test_set:
+            resulting_category = self.use(text)
+            evaluated += 1
+            correct += 1 if resulting_category == expected_category else 0
+            if not evaluated % 100:  # Log progress every 100 evaluated texts
+                log.debug("%d evaluated, %d correct, %0.2f %% accuracy", evaluated, correct, correct / evaluated * 100)
+
+        return correct, evaluated
+
    @abc.abstractmethod
-    def use(self, text: Input) -> Category:
+    def use(self, text: Text) -> Category:
        """
-        Use the sentiment analyzer.
+        Run the model on the given input.
        """
        raise NotImplementedError()


-class AlreadyTrainedError(Exception):
-    """
-    This model has already been trained and cannot be trained again.
-    """
-
-
-class NotTrainedError(Exception):
-    """
-    This model has not been trained yet.
-    """
-
-
 __all__ = (
-    "Input",
-    "Category",
-    "BaseSA",
-    "AlreadyTrainedError",
-    "NotTrainedError",
+    "BaseSentimentAnalyzer",
 )
--- a/unimore_bda_6/analysis/nltk_sentiment.py
+++ b/unimore_bda_6/analysis/nltk_sentiment.py
@ -0,0 +1,123 @@
+import nltk
+import nltk.classify
+import nltk.sentiment
+import nltk.sentiment.util
+import logging
+import typing as t
+import itertools
+
+from ..database import Text, Category, DataTuple, DataSet
+from .base import BaseSentimentAnalyzer
+from ..log import count_passage
+from ..tokenizer import BaseTokenizer
+
+log = logging.getLogger(__name__)
+
+TokenBag = list[str]
+Features = dict[str, int]
+
+
+class AlreadyTrainedError(Exception):
+    """
+    This model has already been trained and cannot be trained again.
+    """
+
+
+class NotTrainedError(Exception):
+    """
+    This model has not been trained yet.
+    """
+
+
+class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
+    """
+    A sentiment analyzer resembling in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
+    """
+
+    def __init__(self, *, tokenizer: BaseTokenizer) -> None:
+        super().__init__()
+        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
+        self.trained: bool = False
+        self.tokenizer: BaseTokenizer = tokenizer
+
+    def __repr__(self):
+        return f"<{self.__class__.__qualname__} {'trained' if self.trained else 'untrained'} tokenizer={self.tokenizer!r}>"
+
+    def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]:
+        """
+        Convert the `Text` of a `DataTuple` to a `TokenBag`.
+        """
+        count_passage(log, "tokenize_datatuple", 100)
+        return self.tokenizer.tokenize(datatuple[0]), datatuple[1]
+
+    def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
+        """
+        Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model.
+        """
+        # Ignore the category and only access the tokens
+        tokenbags = map(lambda d: d[0], dataset)
+        # Get all words in the documents
+        all_words = self.model.all_words(tokenbags, labeled=False)
+        # Create unigram `contains(*)` features from the previously gathered words
+        unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
+        # Add the feature extractor to the model
+        self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
+
+    def _add_feature_extractors(self, dataset: t.Iterator[tuple[TokenBag, Category]]):
+        """
+        Register new feature extractors on the `.model`.
+        """
+        # Add the unigrams feature
+        self._add_feature_unigrams(dataset)
+
+    def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
+        """
+        Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.
+
+        Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
+        """
+        count_passage(log, "extract_features", 100)
+        return self.model.extract_features(data[0]), data[1]
+
+    def train(self, dataset: DataSet) -> None:
+        # Forbid retraining the model
+        if self.trained:
+            raise AlreadyTrainedError()
+
+        # Tokenize the dataset
+        dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_datatuple, dataset)
+
+        # Cleanly duplicate the dataset iterator
+        # Reduce average memory footprint, but not maximum
+        dataset_1, dataset_2 = itertools.tee(dataset, 2)
+        dataset_1: t.Iterator[tuple[TokenBag, Category]]
+        dataset_2: t.Iterator[tuple[TokenBag, Category]]
+
+        # Add the feature extractors to the model
+        self._add_feature_extractors(dataset_1)
+        del dataset_1  # Delete exhausted iterator
+
+        # Extract features from the dataset
+        dataset_2: t.Iterator[tuple[Features, Category]] = map(self.__extract_features, dataset_2)
+
+        # Train the classifier with the extracted features and category
+        self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
+
+        # Toggle the trained flag
+        self.trained = True
+
+    def use(self, text: Text) -> Category:
+        # Require the model to be trained
+        if not self.trained:
+            raise NotTrainedError()
+
+        # Tokenize the input
+        tokens = self.tokenizer.tokenize(text)
+
+        # Run the classification method
+        return self.model.classify(instance=tokens)
+
+
+__all__ = (
+    "NLTKSentimentAnalyzer",
+)
--- a/unimore_bda_6/analysis/vanilla.py
+++ b/unimore_bda_6/analysis/vanilla.py
@ -1,126 +0,0 @@
-import nltk
-import nltk.classify
-import nltk.sentiment
-import nltk.sentiment.util
-import logging
-import typing as t
-import itertools
-
-from .base import Input, Category, BaseSA, AlreadyTrainedError, NotTrainedError
-from ..log import count_passage
-
-TokenBag = list[str]
-IntermediateValue = t.TypeVar("IntermediateValue")
-Features = dict[str, int]
-
-
-log = logging.getLogger(__name__)
-
-
-class VanillaSA(BaseSA):
-    """
-    A sentiment analyzer resembling the one implemented in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
-    """
-
-    def __init__(self, *, extractor: t.Callable[[Input], tuple[str, IntermediateValue]], tokenizer: t.Callable[[str], TokenBag], categorizer: t.Callable[[IntermediateValue], Category]) -> None:
-        super().__init__()
-        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
-        self.trained: bool = False
-        self.extractor: t.Callable[[Input], tuple[str, IntermediateValue]] = extractor
-        self.tokenizer: t.Callable[[str], TokenBag] = tokenizer
-        self.categorizer: t.Callable[[IntermediateValue], Category] = categorizer
-
-    def __repr__(self):
-        return f"<{self.__class__.__qualname__} {'trained' if self.trained else 'untrained'} tokenizer={self.extractor!r} categorizer={self.categorizer!r}>"
-
-    @staticmethod
-    def __data_to_tokenbag(data: tuple[TokenBag, Category]) -> TokenBag:
-        """
-        Access the tokenbag of a data tuple.
-        """
-        return data[0]
-
-    def __add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
-        """
-        Register the `nltk.sentiment.util.extract_unigram_feats` feature extrator on the model.
-        """
-        tokenbags = map(self.__data_to_tokenbag, dataset)
-        all_words = self.model.all_words(tokenbags, labeled=False)
-        unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
-        self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
-
-    def _add_features(self, dataset: t.Iterator[tuple[TokenBag, Category]]):
-        """
-        Register new feature extractors on the `.model`.
-        """
-        self.__add_feature_unigrams(dataset)
-
-    def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
-        """
-        Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.
-
-        Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
-        """
-        count_passage("processed_features", 100)
-        return self.model.extract_features(data[0]), data[1]
-
-    def _train_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
-        """
-        Train the model with the given training set.
-        """
-        if self.trained:
-            raise AlreadyTrainedError()
-
-        dataset_1, dataset_2 = itertools.tee(dataset, 2)
-
-        self._add_features(dataset_1)
-        del dataset_1
-
-        dataset_2 = map(self.__extract_features, dataset_2)
-        self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
-        self.trained = True
-
-    def _evaluate_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> dict:
-        """
-        Perform a model evaluation with the given test set.
-        """
-        if not self.trained:
-            raise NotTrainedError()
-
-        dataset_1 = map(self.__extract_features, dataset)
-        # FIXME: This won't work with streams :(
-        return self.model.evaluate(list(dataset_1))
-
-    def _use_from_tokenbag(self, tokens: TokenBag) -> Category:
-        """
-        Categorize the given token bag.
-        """
-        if not self.trained:
-            raise NotTrainedError()
-
-        return self.model.classify(instance=tokens)
-
-    def _extract_data(self, inp: Input) -> tuple[TokenBag, Category]:
-        count_passage("processed_data", 100)
-        text, value = self.extractor(inp)
-        return self.tokenizer(text), self.categorizer(value)
-
-    def _extract_dataset(self, inp: t.Iterator[Input]) -> list[tuple[TokenBag, Category]]:
-        return map(self._extract_data, inp)
-
-    def train(self, training_set: t.Iterator[Input]) -> None:
-        dataset = self._extract_dataset(training_set)
-        self._train_from_dataset(dataset)
-
-    def evaluate(self, test_set: t.Iterator[Input]) -> dict:
-        dataset = self._extract_dataset(test_set)
-        return self._evaluate_from_dataset(dataset)
-
-    def use(self, text: Input) -> Category:
-        tokens = self.tokenizer(text)
-        return self._use_from_tokenbag(tokens)
-
-
-__all__ = (
-    "VanillaSA",
-)
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@ -6,7 +6,7 @@ import bson
 import logging
 import itertools

-from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
+from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE

 log = logging.getLogger(__name__)

@ -24,6 +24,12 @@ class Review(t.TypedDict):
    reviewTime: str


+Text = str
+Category = str
+DataTuple = tuple[Text, Category]
+DataSet = t.Iterable[DataTuple]
+
+
@contextlib.contextmanager
 def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
    """
@ -80,7 +86,40 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
    ])


-def dataset_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def review_to_datatuple(review: Review) -> tuple[Text, Category]:
+    """
+    Convert a review into a `(text, category)` tuple, where the category is the label corresponding to the review's rating.
+
+    Possible categories are:
+
+    * terrible (1.0)
+    * negative (2.0)
+    * mixed (3.0)
+    * positive (4.0)
+    * great (5.0)
+    * unknown (everything else)
+    """
+    text = review["reviewText"]
+    rating = review["overall"]
+
+    match rating:
+        case 1.0:
+            category = "terrible"
+        case 2.0:
+            category = "negative"
+        case 3.0:
+            category = "mixed"
+        case 4.0:
+            category = "positive"
+        case 5.0:
+            category = "great"
+        case _:
+            category = "unknown"
+
+    return text, category
+
+
+def polar_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
    """
    Get a list of the same amount of 1-star and 5-star reviews.
    """
@ -90,13 +129,16 @@ def dataset_polar(collection: pymongo.collection.Collection, amount: int) -> t.I
    positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
    negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)

-    # Randomness here does not matter, so just merge the lists
-    both = itertools.chain(positive, negative)
+    # Chain the iterators
+    full = itertools.chain(positive, negative)

-    return both
+    # Convert reviews to datatuples
+    full = map(review_to_datatuple, full)
+
+    return full


-def dataset_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def varied_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
    """
    Get a list of the same amount of reviews for each rating.
    """
@ -109,17 +151,25 @@ def dataset_varied(collection: pymongo.collection.Collection, amount: int) -> t.
    positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
    great    = sample_reviews_by_rating(collection, rating=5.0, amount=amount)

+    # Chain the iterators
    full = itertools.chain(terrible, negative, mixed, positive, great)

+    # Convert reviews to datatuples
+    full = map(review_to_datatuple, full)
+
    return full


 __all__ = (
    "Review",
+    "Text",
+    "Category",
+    "DataTuple",
+    "DataSet",
    "mongo_client_from_config",
    "mongo_reviews_collection_from_config",
    "sample_reviews",
    "sample_reviews_by_rating",
-    "dataset_polar",
-    "dataset_varied",
+    "polar_dataset",
+    "varied_dataset",
 )
--- a/unimore_bda_6/log.py
+++ b/unimore_bda_6/log.py
@ -2,7 +2,7 @@ import collections
 import logging
 import coloredlogs

-log = logging.getLogger(__name__)
+this_log = logging.getLogger(__name__)


 def install_log_handler(loggers: list[logging.Logger] = None):
@ -32,13 +32,13 @@ def install_log_handler(loggers: list[logging.Logger] = None):
            ),
            isatty=True,
        )
-        log.debug("Installed custom log handler on: %s", logger)
+        this_log.debug("Installed custom log handler on: %s", logger)


 _passage_counts = collections.defaultdict(lambda: 0)


-def count_passage(key: str, mod: int):
+def count_passage(log: logging.Logger, key: str, mod: int):
    _passage_counts[key] += 1
    if not _passage_counts[key] % mod:
        log.debug("%s - %d calls", key, _passage_counts[key])
--- a/unimore_bda_6/tokenization/init.py
+++ b/unimore_bda_6/tokenization/init.py
@ -1,15 +0,0 @@
-from . import nltk_based
-from . import potts_based
-
-
-all_tokenizers = [
-    nltk_based.nltk_tokenizer,
-    potts_based.potts_tokenizer,
-]
-
-
-__all__ = (
-    "nltk_based",
-    "potts_based",
-    "all_tokenizers",
-)
--- a/unimore_bda_6/tokenization/nltk_based.py
+++ b/unimore_bda_6/tokenization/nltk_based.py
@ -1,16 +0,0 @@
-import nltk
-import nltk.sentiment.util
-
-
-def nltk_tokenizer(text: str) -> list[str]:
-    """
-    Convert a text string into a list of tokens.
-    """
-    tokens = nltk.word_tokenize(text)
-    nltk.sentiment.util.mark_negation(tokens, shallow=True)
-    return tokens
-
-
-__all__ = (
-    "nltk_tokenizer",
-)
--- a/unimore_bda_6/tokenizer/init.py
+++ b/unimore_bda_6/tokenizer/init.py
@ -0,0 +1,10 @@
+from .base import BaseTokenizer
+from .nltk_word_tokenize import NLTKWordTokenizer
+from .potts import PottsTokenizer, PottsTokenizerWithNegation
+
+
+__all__ = (
+    "BaseTokenizer",
+    "NLTKWordTokenizer",
+    "PottsTokenizer",
+    "PottsTokenizerWithNegation",
+)
--- a/unimore_bda_6/tokenizer/base.py
+++ b/unimore_bda_6/tokenizer/base.py
@ -0,0 +1,17 @@
+import abc
+
+
+class BaseTokenizer(metaclass=abc.ABCMeta):
+    """
+    The base for all tokenizers in this project.
+    """
+
+    def __repr__(self):
+        return f"{self.__class__.__qualname__}()"
+
+    @abc.abstractmethod
+    def tokenize(self, text: str) -> list[str]:
+        """
+        Convert a text string into a list of tokens.
+        """
+        raise NotImplementedError()
--- a/unimore_bda_6/tokenizer/nltk_word_tokenize.py
+++ b/unimore_bda_6/tokenizer/nltk_word_tokenize.py
@ -0,0 +1,21 @@
+import nltk
+import nltk.sentiment.util
+import typing as t
+
+from .base import BaseTokenizer
+
+
+class NLTKWordTokenizer(BaseTokenizer):
+    """
+    Tokenizer based on `nltk.word_tokenize`.
+    """
+
+    def tokenize(self, text: str) -> t.Iterable[str]:
+        tokens = nltk.word_tokenize(text)
+        nltk.sentiment.util.mark_negation(tokens, shallow=True)
+        return tokens
+
+
+__all__ = (
+    "NLTKWordTokenizer",
+)
--- a/unimore_bda_6/tokenization/potts_based.py
+++ b/unimore_bda_6/tokenization/potts_based.py
@ -1,8 +1,4 @@
 """
-This file is a vendored version of `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, which the project's specifications require to use.
-
-It has been altered to be used with Python 3.10, but the code is mostly the same.
-
 =========================
 Original module docstring
 =========================
@ -54,6 +50,9 @@ __email__ = "See the author's website"
 import re
 import html.entities
 import typing as t
+import nltk.sentiment.util
+
+from .base import BaseTokenizer

 ######################################################################
 # The following strings are components in the regular expression
@ -143,49 +142,58 @@ amp = "&amp;"
 ######################################################################


-def potts_tokenizer(text: str) -> t.Iterable[str]:
+class PottsTokenizer(BaseTokenizer):
    """
-    Argument: s -- any string object
-    Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False
+    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_.
    """
-    # Fix HTML character entitites:
-    s = __html2string(text)
-    # Tokenize:
-    words = word_re.findall(s)
-    # Possible alter the case, but avoid changing emoticons like :D into :d:
-    words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
-    # Return the results
-    return words

-
-def __html2string(html: str) -> str:
+    @staticmethod
+    def __html2string(s: str) -> str:
        """
        Internal metod that seeks to replace all the HTML entities in
        s with their corresponding unicode characters.
        """
        # First the digits:
-    ents = set(html_entity_digit_re.findall(html))
+        ents = set(html_entity_digit_re.findall(s))
        if len(ents) > 0:
            for ent in ents:
                entnum = ent[2:-1]
                try:
                    entnum = int(entnum)
-                html = html.replace(ent, chr(entnum))
-            except:
+                    s = s.replace(ent, chr(entnum))
+                except (ValueError, KeyError):
                    pass
        # Now the alpha versions:
-    ents = set(html_entity_alpha_re.findall(html))
+        ents = set(html_entity_alpha_re.findall(s))
        ents = filter((lambda x : x != amp), ents)
        for ent in ents:
            entname = ent[1:-1]
            try:
-            html = html.replace(ent, chr(html.entities.name2codepoint[entname]))
-        except:
+                s = s.replace(ent, chr(html.entities.name2codepoint[entname]))
+            except (ValueError, KeyError):
                pass
-        html = html.replace(amp, " and ")
-    return html
+        s = s.replace(amp, " and ")  # Run once after the loop so the amp entity is replaced even when no other named entity is present
+        return s
+
+    def tokenize(self, text: str) -> t.Iterable[str]:
+        # Fix HTML character entities:
+        s = self.__html2string(text)
+        # Tokenize:
+        words = word_re.findall(s)
+        # Possibly alter the case, but avoid changing emoticons like :D into :d:
+        words = list(map(lambda x: x if emoticon_re.search(x) else x.lower(), words))
+        # Return the results
+        return words
+
+
+class PottsTokenizerWithNegation(PottsTokenizer):
+    def tokenize(self, text: str) -> t.Iterable[str]:
+        words = super().tokenize(text)
+        nltk.sentiment.util.mark_negation(words, shallow=True)
+        return words


 __all__ = (
-    "potts_tokenizer",
+    "PottsTokenizer",
+    "PottsTokenizerWithNegation",
 )