From dcfc4fbc3b7be413dfe75f84f47ff6cfccad44f9 Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi
Date: Sat, 4 Feb 2023 06:14:24 +0100
Subject: [PATCH] Getting closer...

---
 unimore-bda-6.iml                        |   1 +
 unimore_bda_6/__main__.py                |  63 +++++----
 unimore_bda_6/analysis/base.py           |  40 +++---
 unimore_bda_6/analysis/nltk_sentiment.py |  14 ++-
 unimore_bda_6/analysis/tf_text.py        |  29 +----
 unimore_bda_6/database.py                | 149 +++++++++++++--------
 6 files changed, 170 insertions(+), 126 deletions(-)

diff --git a/unimore-bda-6.iml b/unimore-bda-6.iml
index 1312514..80da260 100644
--- a/unimore-bda-6.iml
+++ b/unimore-bda-6.iml
@@ -7,6 +7,7 @@
+
diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index f5c00f4..9bbd0ba 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -2,7 +2,7 @@ import logging
 import tensorflow
 
 from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
+from .database import mongo_reviews_collection_from_config, sample_reviews_polar, sample_reviews_varied
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
 from .analysis.tf_text import TensorflowSentimentAnalyzer
 from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
@@ -17,37 +17,44 @@ def main():
     else:
         log.debug("Tensorflow successfully found GPU acceleration!")
 
-    for dataset_func in [polar_dataset, varied_dataset]:
-        for SentimentAnalyzer in [
-            NLTKSentimentAnalyzer,
-            # TensorflowSentimentAnalyzer,
+    for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
+        # Tensorflow-based
+        for Tokenizer in [
+            LowercaseTokenizer
         ]:
-            for Tokenizer in [
-                NLTKWordTokenizer,
-                PottsTokenizer,
-                PottsTokenizerWithNegation,
-                LowercaseTokenizer,
-            ]:
-                tokenizer = Tokenizer()
-                model = SentimentAnalyzer(tokenizer=tokenizer)
+            tokenizer = Tokenizer()
+            model = TensorflowSentimentAnalyzer()
 
-                with mongo_reviews_collection_from_config() as reviews:
-                    reviews_training = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-                    reviews_evaluation = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+            with mongo_reviews_collection_from_config() as collection:
+                ...
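+                # TODO: train and evaluate the Tensorflow model here, like in the NLTK-based loop below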
- log.info("Training model %s", model) - model.train(reviews_training) - log.info("Evaluating model %s", model) - correct, evaluated = model.evaluate(reviews_evaluation) - log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100) + # NLTK-based + for Tokenizer in [ + NLTKWordTokenizer, + PottsTokenizer, + PottsTokenizerWithNegation, + LowercaseTokenizer, + ]: + tokenizer = Tokenizer() + model = NLTKSentimentAnalyzer(tokenizer=tokenizer) - # try: - # print("Manual testing for %s" % model) - # print("Input an empty string to continue to the next model.") - # while inp := input(): - # print(model.use(inp)) - # except KeyboardInterrupt: - # pass + with mongo_reviews_collection_from_config() as collection: + reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__) + reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__) + + log.info("Training model %s", model) + model.train(reviews_training) + log.info("Evaluating model %s", model) + correct, evaluated = model.evaluate(reviews_evaluation) + log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100) + + # try: + # print("Manual testing for %s" % model) + # print("Input an empty string to continue to the next model.") + # while inp := input(): + # print(model.use(inp)) + # except KeyboardInterrupt: + # pass if __name__ == "__main__": diff --git a/unimore_bda_6/analysis/base.py b/unimore_bda_6/analysis/base.py index 57f94ec..b9c3900 100644 --- a/unimore_bda_6/analysis/base.py +++ b/unimore_bda_6/analysis/base.py @@ -1,47 +1,55 @@ import abc import logging +import typing as t +import dataclasses -from ..database import DataSet, Text, Category -from ..tokenizer import BaseTokenizer +from ..database import Text, Category, Review, DatasetFunc log = logging.getLogger(__name__) +@dataclasses.dataclass +class EvaluationResults: + correct: int + evaluated: int + + def __repr__(self): + return f"" + + def __str__(self): + return f"{self.correct} / {self.evaluated} - {self.correct / self.evaluated * 100:.2f} %" + + class BaseSentimentAnalyzer(metaclass=abc.ABCMeta): """ Abstract base class for sentiment analyzers implemented in this project. """ - def __init__(self, *, tokenizer: BaseTokenizer): - self.tokenizer: BaseTokenizer = tokenizer - - def __repr__(self): - return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>" - @abc.abstractmethod - def train(self, training_set: DataSet) -> None: + def train(self, dataset_func: DatasetFunc) -> None: """ Train the analyzer with the given training dataset. """ raise NotImplementedError() - def evaluate(self, test_set: DataSet) -> tuple[int, int]: + def evaluate(self, dataset_func: DatasetFunc) -> EvaluationResults: """ Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category. Returns a tuple with the number of correct results and the number of evaluated results. 
""" - evaluated: int = 0 - correct: int = 0 - for text, expected_category in test_set: - resulting_category = self.use(text) + evaluated: int = 0 + correct: int = 0 + + for review in dataset_func(): + resulting_category = self.use(review.text) evaluated += 1 - correct += 1 if resulting_category == expected_category else 0 + correct += 1 if resulting_category == review.category else 0 if not evaluated % 100: log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100) - return correct, evaluated + return EvaluationResults(correct=correct, evaluated=evaluated) @abc.abstractmethod def use(self, text: Text) -> Category: diff --git a/unimore_bda_6/analysis/nltk_sentiment.py b/unimore_bda_6/analysis/nltk_sentiment.py index 6f7d03e..a1fd2ea 100644 --- a/unimore_bda_6/analysis/nltk_sentiment.py +++ b/unimore_bda_6/analysis/nltk_sentiment.py @@ -6,7 +6,7 @@ import logging import typing as t import itertools -from ..database import Text, Category, DataTuple, DataSet +from ..database import Text, Category, Review from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError from ..log import count_passage from ..tokenizer import BaseTokenizer @@ -23,16 +23,20 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer): """ def __init__(self, *, tokenizer: BaseTokenizer) -> None: - super().__init__(tokenizer=tokenizer) + super().__init__() self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer() self.trained: bool = False + self.tokenizer: BaseTokenizer = tokenizer - def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]: + def __repr__(self): + return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>" + + def __tokenize_datatuple(self, datatuple: Review) -> tuple[TokenBag, Category]: """ Convert the `Text` of a `DataTuple` to a `TokenBag`. """ count_passage(log, "tokenize_datatuple", 100) - return self.tokenizer.tokenize_builtins(datatuple[0]), datatuple[1] + return self.tokenizer.tokenize_builtins(datatuple.text), datatuple.category def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None: """ @@ -63,7 +67,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer): count_passage(log, "extract_features", 100) return self.model.extract_features(data[0]), data[1] - def train(self, dataset: DataSet) -> None: + def train(self, dataset: t.Iterator[Review]) -> None: # Forbid retraining the model if self.trained: raise AlreadyTrainedError() diff --git a/unimore_bda_6/analysis/tf_text.py b/unimore_bda_6/analysis/tf_text.py index df65f1d..fc4780a 100644 --- a/unimore_bda_6/analysis/tf_text.py +++ b/unimore_bda_6/analysis/tf_text.py @@ -2,42 +2,25 @@ import tensorflow import itertools import typing as t -from ..database import DataSet, Text, Category +from ..database import Text, Category, Review from ..tokenizer import BaseTokenizer from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer): - def __init__(self, *, tokenizer: BaseTokenizer): - super().__init__(tokenizer=tokenizer) + def __init__(self): + super().__init__() self.trained = False self.text_vectorization_layer = None self.neural_network: tensorflow.keras.Sequential | None = None - @staticmethod - def __infinite_dataset_generator_factory(dataset: DataSet): - """ - A generator of infinite copies of dataset. - - .. todo:: Loads the whole dataset in memory. What a waste! Can we perform multiple MongoDB queries instead? 
- """ - dataset = map(lambda text, category: (tensorflow.convert_to_tensor(text, dtype=tensorflow.string), tensorflow.convert_to_tensor(category, dtype=tensorflow.string)), dataset) - - def generator(): - while True: - nonlocal dataset - dataset, result = itertools.tee(dataset, 2) - yield result - - return generator - @classmethod - def __bda_dataset_to_tf_dataset(cls, dataset: DataSet) -> tensorflow.data.Dataset: + def __bda_dataset_to_tf_dataset(cls, dataset_func: t.Callable[[], t.Iterator[Review]]) -> tensorflow.data.Dataset: """ Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`. """ return tensorflow.data.Dataset.from_generator( - cls.__infinite_dataset_generator_factory(dataset), + dataset_func, output_signature=( tensorflow.TensorSpec(shape=(), dtype=tensorflow.string), tensorflow.TensorSpec(shape=(), dtype=tensorflow.string), @@ -48,7 +31,7 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer): EMBEDDING_DIM = 16 EPOCHS = 10 - def train(self, training_set: DataSet) -> None: + def train(self, training_set: t.Iterator[Review]) -> None: if self.trained: raise AlreadyTrainedError() diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py index 7536670..9828fd1 100644 --- a/unimore_bda_6/database.py +++ b/unimore_bda_6/database.py @@ -4,14 +4,19 @@ import pymongo.collection import contextlib import bson import logging -import itertools +import tensorflow from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE log = logging.getLogger(__name__) -class Review(t.TypedDict): +class MongoReview(t.TypedDict): + """ + A review as it is stored on MongoDB. + + .. warning:: Do not instantiate: this is only for type hints! + """ _id: bson.ObjectId reviewerID: str asin: str @@ -28,13 +33,13 @@ Text = str Category = float -class DataTuple: - def __init__(self, text, category): +class Review: + def __init__(self, text: Text, category: Category): self.text: Text = text self.category: Category = category @classmethod - def from_review(cls, review): + def from_mongoreview(cls, review: MongoReview): return cls( text=review["reviewText"], category=review["overall"], @@ -44,15 +49,15 @@ class DataTuple: return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>" def __getitem__(self, item): - if item == 0: + if item == 0 or item == "text": return self.text - elif item == 1: + elif item == 1 or item == "category": return self.category else: raise KeyError(item) - -DataSet = t.Iterable[DataTuple] + def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]: + return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string), tensorflow.convert_to_tensor(self.category, dtype=tensorflow.string) @contextlib.contextmanager @@ -65,7 +70,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]: host=MONGO_HOST.__wrapped__, port=MONGO_PORT.__wrapped__, ) - log.info("Opened connection to MongoDB at %s!", client.address) + log.info("Opened connection to MongoDB!") yield client @@ -75,7 +80,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]: @contextlib.contextmanager -def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Review]: +def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[MongoReview]: """ Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it. 
""" @@ -86,82 +91,118 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi yield collection -def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: +class DatasetFunc(t.Protocol): + def __call__(self) -> t.Iterator[Review]: + pass + + +def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: """ Get ``amount`` random reviews from the ``reviews`` collection. """ log.debug("Getting a sample of %d reviews...", amount) - return reviews.aggregate([ + cursor = collection.aggregate([ {"$limit": WORKING_SET_SIZE.__wrapped__}, {"$sample": {"size": amount}}, ]) + cursor = map(Review.from_mongoreview, cursor) + return cursor -def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]: + +def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]: """ Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection. """ log.debug("Getting a sample of %d reviews with %d stars...", amount, rating) - return reviews.aggregate([ + cursor = collection.aggregate([ {"$limit": WORKING_SET_SIZE.__wrapped__}, {"$match": {"overall": rating}}, {"$sample": {"size": amount}}, ]) - -def polar_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]: - """ - Get a list of the same amount of 1-star and 5-star reviews. - """ - log.info("Building polar dataset with %d reviews...", amount * 2) - - # Sample the required reviews - positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount) - negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount) - - # Chain the iterators - full = itertools.chain(positive, negative) - - # Convert reviews to datatuples - full = map(DataTuple.from_review, full) - - return full + cursor = map(Review.from_mongoreview, cursor) + return cursor -def varied_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]: - """ - Get a list of the same amount of reviews for each rating. 
- """ - log.info("Building varied dataset with %d reviews...", amount * 5) +def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: + log.debug("Getting a sample of %d polar reviews...", amount * 2) - # Sample the required reviews - terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount) - negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount) - mixed = sample_reviews_by_rating(collection, rating=3.0, amount=amount) - positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount) - great = sample_reviews_by_rating(collection, rating=5.0, amount=amount) + cursor = collection.aggregate([ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 1.0}}, + {"$sample": {"size": amount}}, + {"$unionWith": { + "coll": collection.name, + "pipeline": [ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 5.0}}, + {"$sample": {"size": amount}}, + ], + }} + ]) - # Chain the iterators - full = itertools.chain(terrible, negative, mixed, positive, great) + cursor = map(Review.from_mongoreview, cursor) + return cursor - # Convert reviews to datatuples - full = map(DataTuple.from_review, full) - return full +def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: + log.debug("Getting a sample of %d varied reviews...", amount * 5) + + # Wow, this is ugly. + cursor = collection.aggregate([ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 1.0}}, + {"$sample": {"size": amount}}, + {"$unionWith": { + "coll": collection.name, + "pipeline": [ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 2.0}}, + {"$sample": {"size": amount}}, + {"$unionWith": { + "coll": collection.name, + "pipeline": [ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 3.0}}, + {"$sample": {"size": amount}}, + {"$unionWith": { + "coll": collection.name, + "pipeline": [ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 4.0}}, + {"$sample": {"size": amount}}, + {"$unionWith": { + "coll": collection.name, + "pipeline": [ + {"$limit": WORKING_SET_SIZE.__wrapped__}, + {"$match": {"overall": 5.0}}, + {"$sample": {"size": amount}}, + ], + }} + ], + }} + ], + }} + ], + }} + ]) + + cursor = map(Review.from_mongoreview, cursor) + return cursor __all__ = ( - "Review", "Text", "Category", - "DataTuple", - "DataSet", + "Review", + "DatasetFunc", "mongo_client_from_config", "mongo_reviews_collection_from_config", "sample_reviews", "sample_reviews_by_rating", - "polar_dataset", - "varied_dataset", + "sample_reviews_polar", + "sample_reviews_varied", )