diff --git a/.idea/runConfigurations/unimore_bda_6.xml b/.idea/runConfigurations/unimore_bda_6.xml
index fa5fcae..3718006 100644
--- a/.idea/runConfigurations/unimore_bda_6.xml
+++ b/.idea/runConfigurations/unimore_bda_6.xml
@@ -5,9 +5,9 @@
-
+
diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index b9871eb..ea01232 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -2,10 +2,11 @@ import logging
import tensorflow
from .config import config, DATA_SET_SIZE
-from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache
+from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache, delete_cache
from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
from .analysis.tf_text import TensorflowSentimentAnalyzer
-from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
+from .analysis.base import TrainingFailedError
+from .tokenizer import LowercaseTokenizer
from .log import install_log_handler
log = logging.getLogger(__name__)
@@ -17,6 +18,12 @@ def main():
else:
log.debug("Tensorflow successfully found GPU acceleration!")
+    # Start from a clean slate: remove caches left behind by a previous run.
+    # Each directory is deleted independently, so a missing training cache
+    # cannot prevent a stale evaluation cache from being removed.
+    for stale_cache in ("./data/training", "./data/evaluation"):
+        try:
+            delete_cache(stale_cache)
+        except FileNotFoundError:
+            # delete_cache raises when the directory does not exist; that just
+            # means there is nothing to clean up at this path.
+            log.debug("No stale cache to delete at: %s", stale_cache)
+
for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
for SentimentAnalyzer in [TensorflowSentimentAnalyzer, NLTKSentimentAnalyzer]:
for Tokenizer in [
@@ -25,46 +32,50 @@ def main():
# PottsTokenizerWithNegation,
LowercaseTokenizer,
]:
- tokenizer = Tokenizer()
- model = SentimentAnalyzer(tokenizer=tokenizer)
-
- with mongo_client_from_config() as db:
- log.debug("Finding the reviews MongoDB collection...")
- collection = reviews_collection(db)
-
+ while True:
try:
- training_cache = load_cache("./data/training")
- evaluation_cache = load_cache("./data/evaluation")
- except FileNotFoundError:
- log.debug("Gathering datasets...")
- reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
- reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+ tokenizer = Tokenizer()
+ model = SentimentAnalyzer(tokenizer=tokenizer)
- log.debug("Caching datasets...")
- store_cache(reviews_training, "./data/training")
- store_cache(reviews_evaluation, "./data/evaluation")
- del reviews_training
- del reviews_evaluation
+ with mongo_client_from_config() as db:
+ log.debug("Finding the reviews MongoDB collection...")
+ collection = reviews_collection(db)
- training_cache = load_cache("./data/training")
- evaluation_cache = load_cache("./data/evaluation")
- log.debug("Caches stored and loaded successfully!")
+ try:
+ training_cache = load_cache("./data/training")
+ evaluation_cache = load_cache("./data/evaluation")
+ except FileNotFoundError:
+ log.debug("Gathering datasets...")
+ reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+ reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+
+ log.debug("Caching datasets...")
+ store_cache(reviews_training, "./data/training")
+ store_cache(reviews_evaluation, "./data/evaluation")
+ del reviews_training
+ del reviews_evaluation
+
+ training_cache = load_cache("./data/training")
+ evaluation_cache = load_cache("./data/evaluation")
+ log.debug("Caches stored and loaded successfully!")
+ else:
+ log.debug("Caches loaded successfully!")
+
+ log.info("Training model: %s", model)
+ model.train(training_cache)
+ log.info("Evaluating model: %s", model)
+ evaluation_results = model.evaluate(evaluation_cache)
+ log.info("%s", evaluation_results)
+
+ except TrainingFailedError:
+ log.error("Training failed, restarting with a different dataset.")
+ continue
else:
- log.debug("Caches loaded successfully!")
-
- log.info("Training model: %s", model)
- model.train(training_cache)
- log.info("Evaluating model: %s", model)
- evaluation_results = model.evaluate(evaluation_cache)
- log.info("%s", evaluation_results)
-
- # try:
- # print("Manual testing for %s" % model)
- # print("Input an empty string to continue to the next model.")
- # while inp := input():
- # print(model.use(inp))
- # except KeyboardInterrupt:
- # pass
+ log.info("Training")
+ break
+                    finally:
+                        # Drop both caches after every attempt so the next
+                        # iteration samples a fresh dataset. delete_cache raises
+                        # FileNotFoundError for a missing directory; raising from
+                        # this finally clause would replace any exception already
+                        # propagating (e.g. a failure before the caches were
+                        # stored) and skip the second deletion, so each directory
+                        # is handled on its own.
+                        for used_cache in ("./data/training", "./data/evaluation"):
+                            try:
+                                delete_cache(used_cache)
+                            except FileNotFoundError:
+                                log.debug("No cache to delete at: %s", used_cache)
if __name__ == "__main__":
diff --git a/unimore_bda_6/analysis/base.py b/unimore_bda_6/analysis/base.py
index eea07b4..b9eb589 100644
--- a/unimore_bda_6/analysis/base.py
+++ b/unimore_bda_6/analysis/base.py
@@ -11,12 +11,13 @@ log = logging.getLogger(__name__)
class EvaluationResults:
correct: int
evaluated: int
+ score: float
def __repr__(self):
- return f""
+ return f""
def __str__(self):
- return f"{self.correct} / {self.evaluated} - {self.correct / self.evaluated * 100:.2f} %"
+        # Fixed-point with two decimals: a bare ".2" precision means two
+        # significant digits and renders e.g. 50.0 as "5e+01".
+        return f"{self.evaluated} evaluated, {self.correct} correct, {self.correct / self.evaluated * 100:.2f} % accuracy, {self.score:.2f} score, {self.score / self.evaluated * 100:.2f} scoreaccuracy"
class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
@@ -40,15 +41,18 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
evaluated: int = 0
correct: int = 0
+ score: float = 0.0
for review in dataset_func():
resulting_category = self.use(review.text)
evaluated += 1
correct += 1 if resulting_category == review.category else 0
+ score += 1 - (abs(resulting_category - review.category) / 4)
if not evaluated % 100:
- log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
+                # Periodic progress report built from the running totals.
+                temp_results = EvaluationResults(correct=correct, evaluated=evaluated, score=score)
+                # Pass the object as a logging argument so that it is only
+                # converted to text when DEBUG records are actually emitted.
+                log.debug("%s", temp_results)
- return EvaluationResults(correct=correct, evaluated=evaluated)
+ return EvaluationResults(correct=correct, evaluated=evaluated, score=score)
@abc.abstractmethod
def use(self, text: Text) -> Category:
@@ -70,8 +74,15 @@ class NotTrainedError(Exception):
"""
+class TrainingFailedError(Exception):
+ """
+ The model wasn't able to complete the training and should not be used anymore.
+ """
+
+
__all__ = (
"BaseSentimentAnalyzer",
"AlreadyTrainedError",
"NotTrainedError",
+ "TrainingFailedError",
)
diff --git a/unimore_bda_6/analysis/tf_text.py b/unimore_bda_6/analysis/tf_text.py
index 8beee6d..ba27c1d 100644
--- a/unimore_bda_6/analysis/tf_text.py
+++ b/unimore_bda_6/analysis/tf_text.py
@@ -1,82 +1,119 @@
import tensorflow
+import logging
from ..database import Text, Category, DatasetFunc
-from ..config import DATA_SET_SIZE
-from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
+from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS
+from ..tokenizer import BaseTokenizer
+from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError
+
+log = logging.getLogger(__name__)
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
- def __init__(self, *args, **kwargs):
+ def __init__(self, tokenizer: BaseTokenizer):
super().__init__()
self.trained: bool = False
- self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer()
+ self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer(tokenizer)
self.model: tensorflow.keras.Sequential = self._build_model()
+ self.history: tensorflow.keras.callbacks.History | None = None
- def _build_dataset(self, dataset_func: DatasetFunc) -> tensorflow.data.Dataset:
+ @staticmethod
+ def _build_dataset(dataset_func: DatasetFunc) -> tensorflow.data.Dataset:
def dataset_func_with_tensor_tuple():
for review in dataset_func():
yield review.to_tensor_tuple()
- return tensorflow.data.Dataset.from_generator(
+ log.debug("Creating dataset...")
+ dataset = tensorflow.data.Dataset.from_generator(
dataset_func_with_tensor_tuple,
output_signature=(
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
- tensorflow.TensorSpec(shape=(5,), dtype=tensorflow.float32, name="category"),
+ tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="category"),
)
)
- def _build_model(self) -> tensorflow.keras.Sequential:
- return tensorflow.keras.Sequential([
+ log.debug("Caching dataset...")
+ dataset = dataset.cache()
+
+ log.debug("Configuring dataset prefetch...")
+ dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)
+
+ return dataset
+
+ @staticmethod
+ def _build_model() -> tensorflow.keras.Sequential:
+ log.debug("Creating %s model...", tensorflow.keras.Sequential)
+ model = tensorflow.keras.Sequential([
tensorflow.keras.layers.Embedding(
- input_dim=self.MAX_FEATURES + 1,
- output_dim=self.EMBEDDING_DIM,
+ input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
+ output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
),
- # tensorflow.keras.layers.Dropout(0.2),
+ tensorflow.keras.layers.Dropout(0.2),
tensorflow.keras.layers.GlobalAveragePooling1D(),
- # tensorflow.keras.layers.Dropout(0.2),
+ tensorflow.keras.layers.Dropout(0.2),
tensorflow.keras.layers.Dense(5, activation="softmax"),
])
+ log.debug("Compiling model: %s", model)
+ model.compile(
+ optimizer=tensorflow.keras.optimizers.Adam(global_clipnorm=1.0),
+ loss=tensorflow.keras.losses.CategoricalCrossentropy(),
+ metrics=[
+ tensorflow.keras.metrics.CategoricalAccuracy(),
+ ]
+ )
+ log.debug("Compiled model: %s", model)
+ return model
- def _build_vectorizer(self) -> tensorflow.keras.layers.TextVectorization:
- return tensorflow.keras.layers.TextVectorization(max_tokens=self.MAX_FEATURES)
-
- def __vectorize_data(self, text, category):
- text = tensorflow.expand_dims(text, -1) # TODO: ??????
- return self.text_vectorization_layer(text), category
-
- MAX_FEATURES = 2500
- EMBEDDING_DIM = 24
- """
- Count of possible "semantic meanings" of words, represented as dimensions of a tensor.
- """
-
- EPOCHS = 3
+ @staticmethod
+ def _build_vectorizer(tokenizer: BaseTokenizer) -> tensorflow.keras.layers.TextVectorization:
+ return tensorflow.keras.layers.TextVectorization(
+ standardize=tokenizer.tokenize_tensorflow,
+ max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__
+ )
def train(self, dataset_func: DatasetFunc) -> None:
if self.trained:
+ log.error("Tried to train an already trained model.")
raise AlreadyTrainedError()
+ log.debug("Building dataset...")
training_set = self._build_dataset(dataset_func)
+ log.debug("Built dataset: %s", training_set)
+ log.debug("Preparing training_set for %s...", self.text_vectorization_layer.adapt)
only_text_set = training_set.map(lambda text, category: text)
+ log.debug("Adapting text_vectorization_layer: %s", self.text_vectorization_layer)
self.text_vectorization_layer.adapt(only_text_set)
- training_set = training_set.map(self.__vectorize_data)
+ log.debug("Adapted text_vectorization_layer: %s", self.text_vectorization_layer)
- # self.model.compile(loss=tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer="adam", metrics=["accuracy"])
- self.model.compile(loss=tensorflow.keras.losses.MeanAbsoluteError(), optimizer="adam", metrics=["accuracy"])
+ log.debug("Preparing training_set for %s...", self.model.fit)
+ training_set = training_set.map(lambda text, category: (self.text_vectorization_layer(text), category))
+ log.info("Training: %s", self.model)
+ self.history: tensorflow.keras.callbacks.History | None = self.model.fit(
+ training_set,
+ epochs=TENSORFLOW_EPOCHS.__wrapped__,
+ callbacks=[
+ tensorflow.keras.callbacks.TerminateOnNaN()
+ ])
+ log.info("Trained: %s", self.model)
- self.model.fit(training_set, epochs=self.EPOCHS)
+ if len(self.history.epoch) < TENSORFLOW_EPOCHS.__wrapped__:
+ log.error("Model %s training failed: only %d epochs computed", self.model, len(self.history.epoch))
+ raise TrainingFailedError()
+ else:
+ log.info("Model %s training succeeded!", self.model)
self.trained = True
def use(self, text: Text) -> Category:
if not self.trained:
+ log.error("Tried to use a non-trained model.")
raise NotTrainedError()
- vector = self.text_vectorization_layer(tensorflow.expand_dims(text, -1))
+ vector = self.text_vectorization_layer(text)
- prediction = self.model.predict(vector)
+ prediction = self.model.predict(vector, verbose=False)
max_i = None
max_p = None
@@ -84,5 +121,6 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
if max_p is None or p > max_p:
max_i = i
max_p = p
+ result = float(max_i) + 1.0
- return float(max_i) + 1.0
+ return result
diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py
index d2aca9e..8dcc079 100644
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@@ -49,10 +49,55 @@ def DATA_SET_SIZE(val: str | None) -> int:
"""
The number of reviews from each category to fetch for the datasets.
- Defaults to `1000`.
+ Defaults to `1750`.
"""
if val is None:
- return 1000
+ return 1750
+ try:
+ return int(val)
+ except ValueError:
+ raise cfig.InvalidValueError("Not an int.")
+
+
+@config.optional()
+def TENSORFLOW_MAX_FEATURES(val: str | None) -> int:
+ """
+ The maximum number of features to use in Tensorflow models.
+
+ Defaults to `30000`.
+ """
+ if val is None:
+ return 30000
+ try:
+ return int(val)
+ except ValueError:
+ raise cfig.InvalidValueError("Not an int.")
+
+
+@config.optional()
+def TENSORFLOW_EMBEDDING_SIZE(val: str | None) -> int:
+ """
+ The size of the embeddings tensor to use in Tensorflow models.
+
+ Defaults to `12`.
+ """
+ if val is None:
+ return 12
+ try:
+ return int(val)
+ except ValueError:
+ raise cfig.InvalidValueError("Not an int.")
+
+
+@config.optional()
+def TENSORFLOW_EPOCHS(val: str | None) -> int:
+ """
+ The number of epochs to train Tensorflow models for.
+
+ Defaults to `15`.
+ """
+ if val is None:
+ return 15
try:
return int(val)
except ValueError:
@@ -65,6 +110,9 @@ __all__ = (
"MONGO_PORT",
"WORKING_SET_SIZE",
"DATA_SET_SIZE",
+ "TENSORFLOW_MAX_FEATURES",
+ "TENSORFLOW_EMBEDDING_SIZE",
+ "TENSORFLOW_EPOCHS",
)
diff --git a/unimore_bda_6/database/cache.py b/unimore_bda_6/database/cache.py
index ae00f34..29f530f 100644
--- a/unimore_bda_6/database/cache.py
+++ b/unimore_bda_6/database/cache.py
@@ -36,7 +36,7 @@ def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
def load_cache(path: str | pathlib.Path) -> DatasetFunc:
"""
- Load the contents of a directory
+ Load the contents of a directory into a `Review` iterator.
"""
path = pathlib.Path(path)
@@ -47,8 +47,10 @@ def load_cache(path: str | pathlib.Path) -> DatasetFunc:
document_paths = path.iterdir()
for document_path in document_paths:
document_path = pathlib.Path(document_path)
+
if not str(document_path).endswith(".pickle"):
log.debug("Ignoring non-pickle file: %s", document_path)
+ continue
log.debug("Loading pickle file: %s", document_path)
with open(document_path, "rb") as file:
@@ -58,8 +60,22 @@ def load_cache(path: str | pathlib.Path) -> DatasetFunc:
return data_cache_loader
+def delete_cache(path: str | pathlib.Path) -> None:
+    """
+    Recursively remove the cache directory located at ``path``.
+
+    :param path: Location of the cache directory, as a string or a `pathlib.Path`.
+    :raises FileNotFoundError: If nothing exists at ``path``.
+    """
+    cache_dir = pathlib.Path(path)
+
+    if cache_dir.exists():
+        # Logged at warning level because the removal is recursive and irreversible.
+        log.warning("Deleting cache directory: %s", cache_dir)
+        shutil.rmtree(cache_dir)
+        return
+
+    raise FileNotFoundError("The specified path does not exist.")
+
+
__all__ = (
"DatasetFunc",
"store_cache",
"load_cache",
+ "delete_cache",
)
diff --git a/unimore_bda_6/database/collections.py b/unimore_bda_6/database/collections.py
index 7dd2469..76f5307 100644
--- a/unimore_bda_6/database/collections.py
+++ b/unimore_bda_6/database/collections.py
@@ -1,4 +1,3 @@
-import contextlib
import pymongo.collection
import typing as t
import bson
@@ -30,8 +29,8 @@ def reviews_collection(db: pymongo.MongoClient) -> pymongo.collection.Collection
Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
"""
log.debug("Accessing the reviews collection...")
- collection = db.reviews.reviews
- log.debug("Collection accessed successfully: %s", collection)
+ collection: pymongo.collection.Collection[MongoReview] = db.reviews.reviews
+ log.debug("Collection accessed successfully: %s", collection.name)
return collection
diff --git a/unimore_bda_6/database/datatypes.py b/unimore_bda_6/database/datatypes.py
index 5f1df35..1af2b7d 100644
--- a/unimore_bda_6/database/datatypes.py
+++ b/unimore_bda_6/database/datatypes.py
@@ -1,5 +1,8 @@
import tensorflow
from .collections import MongoReview
+import logging
+
+log = logging.getLogger(__name__)
Text = str
@@ -33,19 +36,21 @@ class Review:
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
def to_tensor_category(self) -> tensorflow.Tensor:
- return tensorflow.convert_to_tensor([
+ return tensorflow.convert_to_tensor([[
1.0 if self.category == 1.0 else 0.0,
1.0 if self.category == 2.0 else 0.0,
1.0 if self.category == 3.0 else 0.0,
1.0 if self.category == 4.0 else 0.0,
1.0 if self.category == 5.0 else 0.0,
- ], dtype=tensorflow.float32)
+ ]], dtype=tensorflow.float32)
def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
- return (
+ t = (
self.to_tensor_text(),
self.to_tensor_category(),
)
+ log.debug("Converted %s", t)
+ return t
__all__ = (
diff --git a/unimore_bda_6/database/queries.py b/unimore_bda_6/database/queries.py
index 8a88b39..c2cefc4 100644
--- a/unimore_bda_6/database/queries.py
+++ b/unimore_bda_6/database/queries.py
@@ -54,6 +54,12 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
{"$match": {"overall": 5.0}},
{"$sample": {"size": amount}},
],
+ }},
+ {"$addFields": {
+ "sortKey": {"$rand": {}},
+ }},
+ {"$sort": {
+ "sortKey": 1,
}}
])
@@ -101,6 +107,12 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
],
}}
],
+ }},
+ {"$addFields": {
+ "sortKey": {"$rand": {}},
+ }},
+ {"$sort": {
+ "sortKey": 1,
}}
])
diff --git a/unimore_bda_6/log.py b/unimore_bda_6/log.py
index c7272fe..f6f0fcd 100644
--- a/unimore_bda_6/log.py
+++ b/unimore_bda_6/log.py
@@ -15,7 +15,7 @@ def install_log_handler(loggers: list[logging.Logger] = None):
for logger in loggers:
coloredlogs.install(
logger=logger,
- level="INFO",
+ level="DEBUG",
fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
style="{",
level_styles=dict(
@@ -34,6 +34,9 @@ def install_log_handler(loggers: list[logging.Logger] = None):
)
this_log.debug("Installed custom log handler on: %s", logger)
+ logging.getLogger("unimore_bda_6.database.cache").setLevel("INFO")
+ logging.getLogger("unimore_bda_6.database.datatypes").setLevel("INFO")
+
_passage_counts = collections.defaultdict(lambda: 0)