diff --git a/unimore-bda-6.iml b/unimore-bda-6.iml
index 1312514..80da260 100644
--- a/unimore-bda-6.iml
+++ b/unimore-bda-6.iml
@@ -7,6 +7,7 @@
+
diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index f5c00f4..9bbd0ba 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -2,7 +2,7 @@ import logging
import tensorflow
from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
+from .database import mongo_reviews_collection_from_config, sample_reviews_polar, sample_reviews_varied
from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
from .analysis.tf_text import TensorflowSentimentAnalyzer
from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
@@ -17,37 +17,44 @@ def main():
else:
log.debug("Tensorflow successfully found GPU acceleration!")
- for dataset_func in [polar_dataset, varied_dataset]:
- for SentimentAnalyzer in [
- NLTKSentimentAnalyzer,
- # TensorflowSentimentAnalyzer,
+ for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
+ # Tensorflow-based
+ for Tokenizer in [
+ LowercaseTokenizer
]:
- for Tokenizer in [
- NLTKWordTokenizer,
- PottsTokenizer,
- PottsTokenizerWithNegation,
- LowercaseTokenizer,
- ]:
- tokenizer = Tokenizer()
- model = SentimentAnalyzer(tokenizer=tokenizer)
+ tokenizer = Tokenizer()
+ model = TensorflowSentimentAnalyzer()
- with mongo_reviews_collection_from_config() as reviews:
- reviews_training = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
- reviews_evaluation = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+ with mongo_reviews_collection_from_config() as collection:
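+                # Training and evaluation of the Tensorflow model are not wired up yet.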
+ ...
- log.info("Training model %s", model)
- model.train(reviews_training)
- log.info("Evaluating model %s", model)
- correct, evaluated = model.evaluate(reviews_evaluation)
- log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
+ # NLTK-based
+ for Tokenizer in [
+ NLTKWordTokenizer,
+ PottsTokenizer,
+ PottsTokenizerWithNegation,
+ LowercaseTokenizer,
+ ]:
+ tokenizer = Tokenizer()
+ model = NLTKSentimentAnalyzer(tokenizer=tokenizer)
- # try:
- # print("Manual testing for %s" % model)
- # print("Input an empty string to continue to the next model.")
- # while inp := input():
- # print(model.use(inp))
- # except KeyboardInterrupt:
- # pass
+            with mongo_reviews_collection_from_config() as collection:
+                reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+
+                log.info("Training model %s", model)
+                model.train(reviews_training)
+                log.info("Evaluating model %s", model)
+                # evaluate() now takes a DatasetFunc, so pass a callable that samples a fresh evaluation set
+                results = model.evaluate(lambda: dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__))
+                log.info("Evaluation results of %s: %s", model, results)
+
+ # try:
+ # print("Manual testing for %s" % model)
+ # print("Input an empty string to continue to the next model.")
+ # while inp := input():
+ # print(model.use(inp))
+ # except KeyboardInterrupt:
+ # pass
if __name__ == "__main__":
diff --git a/unimore_bda_6/analysis/base.py b/unimore_bda_6/analysis/base.py
index 57f94ec..b9c3900 100644
--- a/unimore_bda_6/analysis/base.py
+++ b/unimore_bda_6/analysis/base.py
@@ -1,47 +1,55 @@
import abc
import logging
+import typing as t
+import dataclasses
-from ..database import DataSet, Text, Category
-from ..tokenizer import BaseTokenizer
+from ..database import Text, Category, Review, DatasetFunc
log = logging.getLogger(__name__)
+@dataclasses.dataclass
+class EvaluationResults:
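+    """
+    Container for the results of a model evaluation: the number of evaluated texts and how many were categorized correctly.
+    """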
+ correct: int
+ evaluated: int
+
+ def __repr__(self):
+        return f"<EvaluationResults: {self.correct} / {self.evaluated}>"
+
+ def __str__(self):
+ return f"{self.correct} / {self.evaluated} - {self.correct / self.evaluated * 100:.2f} %"
+
+
class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
"""
Abstract base class for sentiment analyzers implemented in this project.
"""
- def __init__(self, *, tokenizer: BaseTokenizer):
- self.tokenizer: BaseTokenizer = tokenizer
-
- def __repr__(self):
- return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
-
@abc.abstractmethod
- def train(self, training_set: DataSet) -> None:
+ def train(self, dataset_func: DatasetFunc) -> None:
"""
Train the analyzer with the given training dataset.
"""
raise NotImplementedError()
- def evaluate(self, test_set: DataSet) -> tuple[int, int]:
+ def evaluate(self, dataset_func: DatasetFunc) -> EvaluationResults:
"""
-        Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
-        Returns a tuple with the number of correct results and the number of evaluated results.
+        Perform a model evaluation by repeatedly calling `.use` on every text of the test dataset, comparing the resulting category with the expected one.
+        Returns an `EvaluationResults` with the number of evaluated and correctly categorized texts.
"""
- evaluated: int = 0
- correct: int = 0
- for text, expected_category in test_set:
- resulting_category = self.use(text)
+ evaluated: int = 0
+ correct: int = 0
+
+        for review in dataset_func():
+            resulting_category = self.use(review.text)
evaluated += 1
- correct += 1 if resulting_category == expected_category else 0
+ correct += 1 if resulting_category == review.category else 0
if not evaluated % 100:
log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
- return correct, evaluated
+ return EvaluationResults(correct=correct, evaluated=evaluated)
@abc.abstractmethod
def use(self, text: Text) -> Category:
diff --git a/unimore_bda_6/analysis/nltk_sentiment.py b/unimore_bda_6/analysis/nltk_sentiment.py
index 6f7d03e..a1fd2ea 100644
--- a/unimore_bda_6/analysis/nltk_sentiment.py
+++ b/unimore_bda_6/analysis/nltk_sentiment.py
@@ -6,7 +6,7 @@ import logging
import typing as t
import itertools
-from ..database import Text, Category, DataTuple, DataSet
+from ..database import Text, Category, Review
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
from ..log import count_passage
from ..tokenizer import BaseTokenizer
@@ -23,16 +23,20 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
"""
def __init__(self, *, tokenizer: BaseTokenizer) -> None:
- super().__init__(tokenizer=tokenizer)
+ super().__init__()
self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
self.trained: bool = False
+ self.tokenizer: BaseTokenizer = tokenizer
- def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]:
+ def __repr__(self):
+ return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
+
+ def __tokenize_datatuple(self, datatuple: Review) -> tuple[TokenBag, Category]:
"""
-        Convert the `Text` of a `DataTuple` to a `TokenBag`.
+        Convert the `Text` of a `Review` to a `TokenBag`.
"""
count_passage(log, "tokenize_datatuple", 100)
- return self.tokenizer.tokenize_builtins(datatuple[0]), datatuple[1]
+ return self.tokenizer.tokenize_builtins(datatuple.text), datatuple.category
def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
"""
@@ -63,7 +67,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
count_passage(log, "extract_features", 100)
return self.model.extract_features(data[0]), data[1]
- def train(self, dataset: DataSet) -> None:
+ def train(self, dataset: t.Iterator[Review]) -> None:
# Forbid retraining the model
if self.trained:
raise AlreadyTrainedError()
diff --git a/unimore_bda_6/analysis/tf_text.py b/unimore_bda_6/analysis/tf_text.py
index df65f1d..fc4780a 100644
--- a/unimore_bda_6/analysis/tf_text.py
+++ b/unimore_bda_6/analysis/tf_text.py
@@ -2,42 +2,25 @@ import tensorflow
import itertools
import typing as t
-from ..database import DataSet, Text, Category
+from ..database import Text, Category, Review
from ..tokenizer import BaseTokenizer
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
- def __init__(self, *, tokenizer: BaseTokenizer):
- super().__init__(tokenizer=tokenizer)
+ def __init__(self):
+ super().__init__()
self.trained = False
self.text_vectorization_layer = None
self.neural_network: tensorflow.keras.Sequential | None = None
- @staticmethod
- def __infinite_dataset_generator_factory(dataset: DataSet):
- """
- A generator of infinite copies of dataset.
-
- .. todo:: Loads the whole dataset in memory. What a waste! Can we perform multiple MongoDB queries instead?
- """
- dataset = map(lambda text, category: (tensorflow.convert_to_tensor(text, dtype=tensorflow.string), tensorflow.convert_to_tensor(category, dtype=tensorflow.string)), dataset)
-
- def generator():
- while True:
- nonlocal dataset
- dataset, result = itertools.tee(dataset, 2)
- yield result
-
- return generator
-
@classmethod
- def __bda_dataset_to_tf_dataset(cls, dataset: DataSet) -> tensorflow.data.Dataset:
+ def __bda_dataset_to_tf_dataset(cls, dataset_func: t.Callable[[], t.Iterator[Review]]) -> tensorflow.data.Dataset:
"""
-        Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
+        Convert a function yielding `Review`s to a "real" `tensorflow.data.Dataset`.
"""
return tensorflow.data.Dataset.from_generator(
- cls.__infinite_dataset_generator_factory(dataset),
+            lambda: map(Review.to_tensor_tuple, dataset_func()),
output_signature=(
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
@@ -48,7 +31,7 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
EMBEDDING_DIM = 16
EPOCHS = 10
- def train(self, training_set: DataSet) -> None:
+ def train(self, training_set: t.Iterator[Review]) -> None:
if self.trained:
raise AlreadyTrainedError()
diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py
index 7536670..9828fd1 100644
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@@ -4,14 +4,19 @@ import pymongo.collection
import contextlib
import bson
import logging
-import itertools
+import tensorflow
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE
log = logging.getLogger(__name__)
-class Review(t.TypedDict):
+class MongoReview(t.TypedDict):
+ """
+ A review as it is stored on MongoDB.
+
+ .. warning:: Do not instantiate: this is only for type hints!
+ """
_id: bson.ObjectId
reviewerID: str
asin: str
@@ -28,13 +33,13 @@ Text = str
Category = float
-class DataTuple:
- def __init__(self, text, category):
+class Review:
+ def __init__(self, text: Text, category: Category):
self.text: Text = text
self.category: Category = category
@classmethod
- def from_review(cls, review):
+ def from_mongoreview(cls, review: MongoReview):
return cls(
text=review["reviewText"],
category=review["overall"],
@@ -44,15 +49,15 @@ class DataTuple:
return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
def __getitem__(self, item):
- if item == 0:
+ if item == 0 or item == "text":
return self.text
- elif item == 1:
+ elif item == 1 or item == "category":
return self.category
else:
raise KeyError(item)
-
-DataSet = t.Iterable[DataTuple]
+ def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
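+        # Both tensors are strings to match the dataset signature declared in
+        # TensorflowSentimentAnalyzer; the float category is stringified first,
+        # as Tensorflow cannot convert a float directly to a string tensor.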
+        return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string), tensorflow.convert_to_tensor(str(self.category), dtype=tensorflow.string)
@contextlib.contextmanager
@@ -65,7 +70,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
host=MONGO_HOST.__wrapped__,
port=MONGO_PORT.__wrapped__,
)
- log.info("Opened connection to MongoDB at %s!", client.address)
+ log.info("Opened connection to MongoDB!")
yield client
@@ -75,7 +80,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
@contextlib.contextmanager
-def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Review]:
+def mongo_reviews_collection_from_config() -> t.ContextManager[pymongo.collection.Collection[MongoReview]]:
"""
Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
"""
@@ -86,82 +91,118 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
yield collection
-def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+class DatasetFunc(t.Protocol):
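+    """
+    Protocol for functions which return a new iterator over the ``Review``s of a dataset every time they are called.
+    """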
+ def __call__(self) -> t.Iterator[Review]:
+ pass
+
+
+def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
"""
Get ``amount`` random reviews from the ``reviews`` collection.
"""
log.debug("Getting a sample of %d reviews...", amount)
- return reviews.aggregate([
+ cursor = collection.aggregate([
{"$limit": WORKING_SET_SIZE.__wrapped__},
{"$sample": {"size": amount}},
])
+ cursor = map(Review.from_mongoreview, cursor)
+ return cursor
-def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
+
+def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
"""
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
"""
log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
- return reviews.aggregate([
+ cursor = collection.aggregate([
{"$limit": WORKING_SET_SIZE.__wrapped__},
{"$match": {"overall": rating}},
{"$sample": {"size": amount}},
])
-
-def polar_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
- """
- Get a list of the same amount of 1-star and 5-star reviews.
- """
- log.info("Building polar dataset with %d reviews...", amount * 2)
-
- # Sample the required reviews
- positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
- negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
-
- # Chain the iterators
- full = itertools.chain(positive, negative)
-
- # Convert reviews to datatuples
- full = map(DataTuple.from_review, full)
-
- return full
+ cursor = map(Review.from_mongoreview, cursor)
+ return cursor
-def varied_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
- """
- Get a list of the same amount of reviews for each rating.
- """
- log.info("Building varied dataset with %d reviews...", amount * 5)
+def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
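+    """
+    Get ``amount`` random 1-star reviews and ``amount`` random 5-star reviews from the ``reviews`` collection.
+    """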
+ log.debug("Getting a sample of %d polar reviews...", amount * 2)
- # Sample the required reviews
- terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
- negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount)
- mixed = sample_reviews_by_rating(collection, rating=3.0, amount=amount)
- positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
- great = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
+ cursor = collection.aggregate([
+ {"$limit": WORKING_SET_SIZE.__wrapped__},
+ {"$match": {"overall": 1.0}},
+ {"$sample": {"size": amount}},
+ {"$unionWith": {
+ "coll": collection.name,
+ "pipeline": [
+ {"$limit": WORKING_SET_SIZE.__wrapped__},
+ {"$match": {"overall": 5.0}},
+ {"$sample": {"size": amount}},
+ ],
+ }}
+ ])
- # Chain the iterators
- full = itertools.chain(terrible, negative, mixed, positive, great)
+ cursor = map(Review.from_mongoreview, cursor)
+ return cursor
- # Convert reviews to datatuples
- full = map(DataTuple.from_review, full)
- return full
+def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
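+    """
+    Get ``amount`` random reviews for each of the five star ratings from the ``reviews`` collection.
+    """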
+ log.debug("Getting a sample of %d varied reviews...", amount * 5)
+
+ # Wow, this is ugly.
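+    # Each nested $unionWith merges in a sample of the next star rating,
+    # so a single aggregation samples all five ratings from the same collection.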
+ cursor = collection.aggregate([
+ {"$limit": WORKING_SET_SIZE.__wrapped__},
+ {"$match": {"overall": 1.0}},
+ {"$sample": {"size": amount}},
+ {"$unionWith": {
+ "coll": collection.name,
+ "pipeline": [
+ {"$limit": WORKING_SET_SIZE.__wrapped__},
+ {"$match": {"overall": 2.0}},
+ {"$sample": {"size": amount}},
+ {"$unionWith": {
+ "coll": collection.name,
+ "pipeline": [
+ {"$limit": WORKING_SET_SIZE.__wrapped__},
+ {"$match": {"overall": 3.0}},
+ {"$sample": {"size": amount}},
+ {"$unionWith": {
+ "coll": collection.name,
+ "pipeline": [
+ {"$limit": WORKING_SET_SIZE.__wrapped__},
+ {"$match": {"overall": 4.0}},
+ {"$sample": {"size": amount}},
+ {"$unionWith": {
+ "coll": collection.name,
+ "pipeline": [
+ {"$limit": WORKING_SET_SIZE.__wrapped__},
+ {"$match": {"overall": 5.0}},
+ {"$sample": {"size": amount}},
+ ],
+ }}
+ ],
+ }}
+ ],
+ }}
+ ],
+ }}
+ ])
+
+ cursor = map(Review.from_mongoreview, cursor)
+ return cursor
__all__ = (
- "Review",
"Text",
"Category",
- "DataTuple",
- "DataSet",
+ "Review",
+ "DatasetFunc",
"mongo_client_from_config",
"mongo_reviews_collection_from_config",
"sample_reviews",
"sample_reviews_by_rating",
- "polar_dataset",
- "varied_dataset",
+ "sample_reviews_polar",
+ "sample_reviews_varied",
)