mirror of https://github.com/Steffo99/unimore-bda-6.git

Getting closer...

Steffo 2023-02-04 06:14:24 +01:00
parent 02f10e6ae4
commit dcfc4fbc3b
Signed by: steffo
GPG key ID: 2A24051445686895
6 changed files with 170 additions and 126 deletions

View file

@@ -7,6 +7,7 @@
         <excludeFolder url="file://$MODULE_DIR$/data/db" />
         <excludeFolder url="file://$MODULE_DIR$/data/raw" />
         <excludeFolder url="file://$MODULE_DIR$/data/nltk" />
+        <excludeFolder url="file://$MODULE_DIR$/.venv" />
     </content>
     <orderEntry type="jdk" jdkName="Poetry (unimore-bda-6)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />

View file

@@ -2,7 +2,7 @@ import logging
 import tensorflow

 from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
+from .database import mongo_reviews_collection_from_config, sample_reviews_polar, sample_reviews_varied
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
 from .analysis.tf_text import TensorflowSentimentAnalyzer
 from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
@@ -17,11 +17,18 @@ def main():
     else:
         log.debug("Tensorflow successfully found GPU acceleration!")

-    for dataset_func in [polar_dataset, varied_dataset]:
-        for SentimentAnalyzer in [
-            NLTKSentimentAnalyzer,
-            # TensorflowSentimentAnalyzer,
-        ]:
+    for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
+        # Tensorflow-based
+        for Tokenizer in [
+            LowercaseTokenizer
+        ]:
+            tokenizer = Tokenizer()
+            model = TensorflowSentimentAnalyzer()
+            with mongo_reviews_collection_from_config() as collection:
+                ...
+
+        # NLTK-based
         for Tokenizer in [
             NLTKWordTokenizer,
             PottsTokenizer,
@@ -29,11 +36,11 @@ def main():
             LowercaseTokenizer,
         ]:
             tokenizer = Tokenizer()
-            model = SentimentAnalyzer(tokenizer=tokenizer)
-            with mongo_reviews_collection_from_config() as reviews:
-                reviews_training = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-                reviews_evaluation = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+            model = NLTKSentimentAnalyzer(tokenizer=tokenizer)
+            with mongo_reviews_collection_from_config() as collection:
+                reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+                reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
                 log.info("Training model %s", model)
                 model.train(reviews_training)
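Note that the reworked base-class API (next file) expects a zero-argument DatasetFunc, while this loop still calls dataset_func(collection=..., amount=...) directly. A minimal sketch of how the two could be glued together with functools.partial; this wiring is an assumption, not code from this commit:

import functools

# Hypothetical glue, not in this commit: bind collection and amount so the
# sampling function satisfies the zero-argument DatasetFunc protocol.
with mongo_reviews_collection_from_config() as collection:
    dataset_func_bound = functools.partial(
        sample_reviews_polar,
        collection=collection,
        amount=DATA_SET_SIZE.__wrapped__,
    )
    reviews = dataset_func_bound()  # -> a fresh t.Iterator[Review] on every call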

View file

@@ -1,47 +1,55 @@
 import abc
 import logging
-import typing as t
+import dataclasses

-from ..database import DataSet, Text, Category
-from ..tokenizer import BaseTokenizer
+from ..database import Text, Category, Review, DatasetFunc

 log = logging.getLogger(__name__)


+@dataclasses.dataclass
+class EvaluationResults:
+    correct: int
+    evaluated: int
+
+    def __repr__(self):
+        return f"<EvaluationResults: {self.correct}/{self.evaluated}, {self.correct / self.evaluated * 100:.2f}>"
+
+    def __str__(self):
+        return f"{self.correct} / {self.evaluated} - {self.correct / self.evaluated * 100:.2f} %"
+
+
 class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
     """
     Abstract base class for sentiment analyzers implemented in this project.
     """

-    def __init__(self, *, tokenizer: BaseTokenizer):
-        self.tokenizer: BaseTokenizer = tokenizer
-
-    def __repr__(self):
-        return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
-
     @abc.abstractmethod
-    def train(self, training_set: DataSet) -> None:
+    def train(self, dataset_func: DatasetFunc) -> None:
         """
         Train the analyzer with the given training dataset.
         """
         raise NotImplementedError()

-    def evaluate(self, test_set: DataSet) -> tuple[int, int]:
+    def evaluate(self, dataset_func: DatasetFunc) -> EvaluationResults:
         """
         Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
         Returns a tuple with the number of correct results and the number of evaluated results.
         """
         evaluated: int = 0
         correct: int = 0
-        for text, expected_category in test_set:
-            resulting_category = self.use(text)
+        for review in dataset_func():
+            resulting_category = self.use(review.text)
             evaluated += 1
-            correct += 1 if resulting_category == expected_category else 0
+            correct += 1 if resulting_category == review.category else 0
             if not evaluated % 100:
                 log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
-        return correct, evaluated
+        return EvaluationResults(correct=correct, evaluated=evaluated)

     @abc.abstractmethod
     def use(self, text: Text) -> Category:
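evaluate now returns the new EvaluationResults dataclass instead of a bare (correct, evaluated) tuple (its docstring still describes the old tuple). A quick sketch of the two string forms defined above, with illustrative numbers:

results = EvaluationResults(correct=842, evaluated=1000)

repr(results)  # '<EvaluationResults: 842/1000, 84.20>'
str(results)   # '842 / 1000 - 84.20 %'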

View file

@@ -6,7 +6,7 @@ import logging
 import typing as t
 import itertools

-from ..database import Text, Category, DataTuple, DataSet
+from ..database import Text, Category, Review
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 from ..log import count_passage
 from ..tokenizer import BaseTokenizer
@@ -23,16 +23,20 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
     """

     def __init__(self, *, tokenizer: BaseTokenizer) -> None:
-        super().__init__(tokenizer=tokenizer)
+        super().__init__()
         self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
         self.trained: bool = False
+        self.tokenizer: BaseTokenizer = tokenizer

-    def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]:
+    def __repr__(self):
+        return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
+
+    def __tokenize_datatuple(self, datatuple: Review) -> tuple[TokenBag, Category]:
         """
         Convert the `Text` of a `DataTuple` to a `TokenBag`.
         """
         count_passage(log, "tokenize_datatuple", 100)
-        return self.tokenizer.tokenize_builtins(datatuple[0]), datatuple[1]
+        return self.tokenizer.tokenize_builtins(datatuple.text), datatuple.category

     def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
         """
@@ -63,7 +67,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
         count_passage(log, "extract_features", 100)
         return self.model.extract_features(data[0]), data[1]

-    def train(self, dataset: DataSet) -> None:
+    def train(self, dataset: t.Iterator[Review]) -> None:
         # Forbid retraining the model
         if self.trained:
             raise AlreadyTrainedError()
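__tokenize_datatuple now reads the text and category attributes of the new Review class rather than unpacking indices 0 and 1. A sketch of the expected round trip; the tokenizer's exact output is an assumption, not taken from this commit:

review = Review(text="Great value for the price", category=5.0)
tokenizer = LowercaseTokenizer()  # assumed behavior: lowercase + whitespace split
tokens, category = tokenizer.tokenize_builtins(review.text), review.category
# tokens   -> ["great", "value", "for", "the", "price"]  (assumed output)
# category -> 5.0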

View file

@@ -2,42 +2,25 @@ import tensorflow
 import itertools
 import typing as t

-from ..database import DataSet, Text, Category
+from ..database import Text, Category, Review
 from ..tokenizer import BaseTokenizer
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError


 class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
-    def __init__(self, *, tokenizer: BaseTokenizer):
-        super().__init__(tokenizer=tokenizer)
+    def __init__(self):
+        super().__init__()
         self.trained = False
         self.text_vectorization_layer = None
         self.neural_network: tensorflow.keras.Sequential | None = None

-    @staticmethod
-    def __infinite_dataset_generator_factory(dataset: DataSet):
-        """
-        A generator of infinite copies of dataset.
-
-        .. todo:: Loads the whole dataset in memory. What a waste! Can we perform multiple MongoDB queries instead?
-        """
-        dataset = map(lambda text, category: (tensorflow.convert_to_tensor(text, dtype=tensorflow.string), tensorflow.convert_to_tensor(category, dtype=tensorflow.string)), dataset)
-
-        def generator():
-            while True:
-                nonlocal dataset
-                dataset, result = itertools.tee(dataset, 2)
-                yield result
-
-        return generator
-
     @classmethod
-    def __bda_dataset_to_tf_dataset(cls, dataset: DataSet) -> tensorflow.data.Dataset:
+    def __bda_dataset_to_tf_dataset(cls, dataset_func: t.Callable[[], t.Iterator[Review]]) -> tensorflow.data.Dataset:
         """
         Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
         """
         return tensorflow.data.Dataset.from_generator(
-            cls.__infinite_dataset_generator_factory(dataset),
+            dataset_func,
             output_signature=(
                 tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
                 tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
@@ -48,7 +31,7 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
     EMBEDDING_DIM = 16
     EPOCHS = 10

-    def train(self, training_set: DataSet) -> None:
+    def train(self, training_set: t.Iterator[Review]) -> None:
         if self.trained:
             raise AlreadyTrainedError()
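With the infinite-copy generator factory deleted, from_generator is handed dataset_func directly, so Tensorflow can re-invoke it (and therefore re-query MongoDB) whenever it needs another pass over the data, which is what the removed .. todo:: asked for. A self-contained sketch of the output_signature contract, with a stand-in generator in place of a real MongoDB query; note that the project's DatasetFunc yields Review objects, so a map(Review.to_tensor_tuple, ...) step would presumably still be needed in between:

import tensorflow

def dataset_func():
    # Stand-in for a MongoDB-backed DatasetFunc: yields (text, category)
    # pairs already in the shape the output_signature below expects.
    yield "Great product, would buy again", "5.0"
    yield "Broke after two days", "1.0"

dataset = tensorflow.data.Dataset.from_generator(
    dataset_func,
    output_signature=(
        tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
        tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
    ),
)

for text, category in dataset:
    print(text.numpy(), category.numpy())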

View file

@@ -4,14 +4,19 @@ import pymongo.collection
 import contextlib
 import bson
 import logging
-import itertools
+import tensorflow

 from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE

 log = logging.getLogger(__name__)


-class Review(t.TypedDict):
+class MongoReview(t.TypedDict):
+    """
+    A review as it is stored on MongoDB.
+
+    .. warning:: Do not instantiate: this is only for type hints!
+    """
     _id: bson.ObjectId
     reviewerID: str
     asin: str
@@ -28,13 +33,13 @@ Text = str
 Category = float


-class DataTuple:
-    def __init__(self, text, category):
+class Review:
+    def __init__(self, text: Text, category: Category):
         self.text: Text = text
         self.category: Category = category

     @classmethod
-    def from_review(cls, review):
+    def from_mongoreview(cls, review: MongoReview):
         return cls(
             text=review["reviewText"],
             category=review["overall"],
@@ -44,15 +49,15 @@ class DataTuple:
         return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"

     def __getitem__(self, item):
-        if item == 0:
+        if item == 0 or item == "text":
             return self.text
-        elif item == 1:
+        elif item == 1 or item == "category":
             return self.category
         else:
             raise KeyError(item)

-
-DataSet = t.Iterable[DataTuple]
+    def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
+        return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string), tensorflow.convert_to_tensor(self.category, dtype=tensorflow.string)


 @contextlib.contextmanager
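Review.__getitem__ keeps the old positional access working while adding named keys, so call sites can migrate gradually; to_tensor_tuple packages both fields for the Tensorflow pipeline. A quick sketch:

review = Review(text="Nice!", category=5.0)

review[0] == review["text"]        # True - old and new access styles coexist
review[1] == review["category"]    # True
review["rating"]                   # raises KeyError

One caveat: to_tensor_tuple asks convert_to_tensor for a tensorflow.string tensor from the float category, which likely needs an explicit str(self.category) to succeed; given the commit message, this wiring still looks in progress.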
@@ -65,7 +70,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
         host=MONGO_HOST.__wrapped__,
         port=MONGO_PORT.__wrapped__,
     )
-    log.info("Opened connection to MongoDB at %s!", client.address)
+    log.info("Opened connection to MongoDB!")
     yield client
@@ -75,7 +80,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
 @contextlib.contextmanager
-def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Review]:
+def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[MongoReview]:
     """
     Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
     """
@@ -86,82 +91,118 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
     yield collection


+class DatasetFunc(t.Protocol):
+    def __call__(self) -> t.Iterator[Review]:
+        pass
+
+
-def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
     """
     Get ``amount`` random reviews from the ``reviews`` collection.
     """
     log.debug("Getting a sample of %d reviews...", amount)
-    return reviews.aggregate([
+    cursor = collection.aggregate([
         {"$limit": WORKING_SET_SIZE.__wrapped__},
         {"$sample": {"size": amount}},
     ])
+    cursor = map(Review.from_mongoreview, cursor)
+    return cursor


-def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
+def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
     """
     Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
     """
     log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
-    return reviews.aggregate([
+    cursor = collection.aggregate([
         {"$limit": WORKING_SET_SIZE.__wrapped__},
         {"$match": {"overall": rating}},
         {"$sample": {"size": amount}},
     ])
+    cursor = map(Review.from_mongoreview, cursor)
+    return cursor


-def polar_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
-    """
-    Get a list of the same amount of 1-star and 5-star reviews.
-    """
-    log.info("Building polar dataset with %d reviews...", amount * 2)
-
-    # Sample the required reviews
-    positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
-    negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
-
-    # Chain the iterators
-    full = itertools.chain(positive, negative)
-
-    # Convert reviews to datatuples
-    full = map(DataTuple.from_review, full)
-
-    return full
+def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+    log.debug("Getting a sample of %d polar reviews...", amount * 2)
+
+    cursor = collection.aggregate([
+        {"$limit": WORKING_SET_SIZE.__wrapped__},
+        {"$match": {"overall": 1.0}},
+        {"$sample": {"size": amount}},
+        {"$unionWith": {
+            "coll": collection.name,
+            "pipeline": [
+                {"$limit": WORKING_SET_SIZE.__wrapped__},
+                {"$match": {"overall": 5.0}},
+                {"$sample": {"size": amount}},
+            ],
+        }}
+    ])
+
+    cursor = map(Review.from_mongoreview, cursor)
+    return cursor


-def varied_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
-    """
-    Get a list of the same amount of reviews for each rating.
-    """
-    log.info("Building varied dataset with %d reviews...", amount * 5)
-
-    # Sample the required reviews
-    terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
-    negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount)
-    mixed = sample_reviews_by_rating(collection, rating=3.0, amount=amount)
-    positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
-    great = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
-
-    # Chain the iterators
-    full = itertools.chain(terrible, negative, mixed, positive, great)
-
-    # Convert reviews to datatuples
-    full = map(DataTuple.from_review, full)
-
-    return full
+def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+    log.debug("Getting a sample of %d varied reviews...", amount * 5)
+
+    # Wow, this is ugly.
+    cursor = collection.aggregate([
+        {"$limit": WORKING_SET_SIZE.__wrapped__},
+        {"$match": {"overall": 1.0}},
+        {"$sample": {"size": amount}},
+        {"$unionWith": {
+            "coll": collection.name,
+            "pipeline": [
+                {"$limit": WORKING_SET_SIZE.__wrapped__},
+                {"$match": {"overall": 2.0}},
+                {"$sample": {"size": amount}},
+                {"$unionWith": {
+                    "coll": collection.name,
+                    "pipeline": [
+                        {"$limit": WORKING_SET_SIZE.__wrapped__},
+                        {"$match": {"overall": 3.0}},
+                        {"$sample": {"size": amount}},
+                        {"$unionWith": {
+                            "coll": collection.name,
+                            "pipeline": [
+                                {"$limit": WORKING_SET_SIZE.__wrapped__},
+                                {"$match": {"overall": 4.0}},
+                                {"$sample": {"size": amount}},
+                                {"$unionWith": {
+                                    "coll": collection.name,
+                                    "pipeline": [
+                                        {"$limit": WORKING_SET_SIZE.__wrapped__},
+                                        {"$match": {"overall": 5.0}},
+                                        {"$sample": {"size": amount}},
+                                    ],
+                                }}
+                            ],
+                        }}
+                    ],
+                }}
+            ],
+        }}
+    ])
+
+    cursor = map(Review.from_mongoreview, cursor)
+    return cursor
 __all__ = (
-    "Review",
     "Text",
     "Category",
-    "DataTuple",
-    "DataSet",
+    "Review",
+    "DatasetFunc",
     "mongo_client_from_config",
     "mongo_reviews_collection_from_config",
     "sample_reviews",
     "sample_reviews_by_rating",
-    "polar_dataset",
-    "varied_dataset",
+    "sample_reviews_polar",
+    "sample_reviews_varied",
 )
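The "Wow, this is ugly." comment refers to the hand-nested $unionWith stages in sample_reviews_varied. Since MongoDB also accepts $unionWith stages chained sequentially at the top level of a pipeline, the same five-rating union can be built with a loop; a sketch of that alternative, not part of this commit:

def sample_reviews_varied_flat(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
    # One sub-pipeline per rating, unioned sequentially instead of
    # nesting each $unionWith inside the previous one.
    def rating_stages(rating: float) -> list[dict]:
        return [
            {"$limit": WORKING_SET_SIZE.__wrapped__},
            {"$match": {"overall": rating}},
            {"$sample": {"size": amount}},
        ]

    pipeline = rating_stages(1.0)
    for rating in (2.0, 3.0, 4.0, 5.0):
        pipeline.append({"$unionWith": {
            "coll": collection.name,
            "pipeline": rating_stages(rating),
        }})

    return map(Review.from_mongoreview, collection.aggregate(pipeline))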