stuff's working

2024-11-21 23:44:19 +00:00 · 2023-02-08 10:54:14 +01:00 · 2023-02-08 10:54:14 +01:00 · 4d6c8f0fee
commit 4d6c8f0fee
parent c31743f066
10 changed files with 230 additions and 87 deletions
--- a/.idea/runConfigurations/unimore_bda_6.xml
+++ b/.idea/runConfigurations/unimore_bda_6.xml
@ -5,9 +5,9 @@
    <option name="PARENT_ENVS" value="true" />
    <envs>
      <env name="CONFIRM_OVERWRITE" value="False" />
-      <env name="DATA_SET_SIZE" value="2500" />
      <env name="NLTK_DATA" value="./data/nltk" />
      <env name="PYTHONUNBUFFERED" value="1" />
+      <env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
      <env name="WORKING_SET_SIZE" value="1000000" />
      <env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
    </envs>
--- a/unimore_bda_6/main.py
+++ b/unimore_bda_6/main.py
@ -2,10 +2,11 @@ import logging
 import tensorflow

 from .config import config, DATA_SET_SIZE
-from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache
+from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache, delete_cache
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
 from .analysis.tf_text import TensorflowSentimentAnalyzer
-from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
+from .analysis.base import TrainingFailedError
+from .tokenizer import LowercaseTokenizer
 from .log import install_log_handler

 log = logging.getLogger(__name__)
@ -17,6 +18,12 @@ def main():
    else:
        log.debug("Tensorflow successfully found GPU acceleration!")

+    try:
+        delete_cache("./data/training")
+        delete_cache("./data/evaluation")
+    except FileNotFoundError:
+        pass
+
    for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
        for SentimentAnalyzer in [TensorflowSentimentAnalyzer, NLTKSentimentAnalyzer]:
            for Tokenizer in [
@ -25,46 +32,50 @@ def main():
                # PottsTokenizerWithNegation,
                LowercaseTokenizer,
            ]:
-                tokenizer = Tokenizer()
-                model = SentimentAnalyzer(tokenizer=tokenizer)
-
-                with mongo_client_from_config() as db:
-                    log.debug("Finding the reviews MongoDB collection...")
-                    collection = reviews_collection(db)
-
+                while True:
                    try:
-                        training_cache = load_cache("./data/training")
-                        evaluation_cache = load_cache("./data/evaluation")
-                    except FileNotFoundError:
-                        log.debug("Gathering datasets...")
-                        reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
-                        reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+                        tokenizer = Tokenizer()
+                        model = SentimentAnalyzer(tokenizer=tokenizer)

-                        log.debug("Caching datasets...")
-                        store_cache(reviews_training, "./data/training")
-                        store_cache(reviews_evaluation, "./data/evaluation")
-                        del reviews_training
-                        del reviews_evaluation
+                        with mongo_client_from_config() as db:
+                            log.debug("Finding the reviews MongoDB collection...")
+                            collection = reviews_collection(db)

-                        training_cache = load_cache("./data/training")
-                        evaluation_cache = load_cache("./data/evaluation")
-                        log.debug("Caches stored and loaded successfully!")
+                            try:
+                                training_cache = load_cache("./data/training")
+                                evaluation_cache = load_cache("./data/evaluation")
+                            except FileNotFoundError:
+                                log.debug("Gathering datasets...")
+                                reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+                                reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+
+                                log.debug("Caching datasets...")
+                                store_cache(reviews_training, "./data/training")
+                                store_cache(reviews_evaluation, "./data/evaluation")
+                                del reviews_training
+                                del reviews_evaluation
+
+                                training_cache = load_cache("./data/training")
+                                evaluation_cache = load_cache("./data/evaluation")
+                                log.debug("Caches stored and loaded successfully!")
+                            else:
+                                log.debug("Caches loaded successfully!")
+
+                            log.info("Training model: %s", model)
+                            model.train(training_cache)
+                            log.info("Evaluating model: %s", model)
+                            evaluation_results = model.evaluate(evaluation_cache)
+                            log.info("%s", evaluation_results)
+
+                    except TrainingFailedError:
+                        log.error("Training failed, restarting with a different dataset.")
+                        continue
                    else:
-                        log.debug("Caches loaded successfully!")
-
-                    log.info("Training model: %s", model)
-                    model.train(training_cache)
-                    log.info("Evaluating model: %s", model)
-                    evaluation_results = model.evaluate(evaluation_cache)
-                    log.info("%s", evaluation_results)
-
-                # try:
-                #     print("Manual testing for %s" % model)
-                #     print("Input an empty string to continue to the next model.")
-                #     while inp := input():
-                #         print(model.use(inp))
-                # except KeyboardInterrupt:
-                #     pass
+                        log.info("Training")
+                        break
+                    finally:
+                        delete_cache("./data/training")
+                        delete_cache("./data/evaluation")


 if __name__ == "__main__":
--- a/unimore_bda_6/analysis/base.py
+++ b/unimore_bda_6/analysis/base.py
@ -11,12 +11,13 @@ log = logging.getLogger(__name__)
 class EvaluationResults:
    correct: int
    evaluated: int
+    score: float

    def __repr__(self):
-        return f"<EvaluationResults: {self.correct}/{self.evaluated}, {self.correct / self.evaluated * 100:.2f}>"
+        return f"<EvaluationResults: score of {self.score} out of {self.evaluated} evaluated tuples>"

    def __str__(self):
-        return f"{self.correct} / {self.evaluated} - {self.correct / self.evaluated * 100:.2f} %"
+        return f"{self.evaluated} evaluated, {self.correct} correct, {self.correct / self.evaluated * 100:.2} % accuracy, {self.score:.2} score, {self.score / self.evaluated * 100:.2} scoreaccuracy"


 class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
@ -40,15 +41,18 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):

        evaluated: int = 0
        correct: int = 0
+        score: float = 0.0

        for review in dataset_func():
            resulting_category = self.use(review.text)
            evaluated += 1
            correct += 1 if resulting_category == review.category else 0
+            score += 1 - (abs(resulting_category - review.category) / 4)
            if not evaluated % 100:
-                log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
+                temp_results = EvaluationResults(correct=correct, evaluated=evaluated, score=score)
+                log.debug(f"{temp_results!s}")

-        return EvaluationResults(correct=correct, evaluated=evaluated)
+        return EvaluationResults(correct=correct, evaluated=evaluated, score=score)

    @abc.abstractmethod
    def use(self, text: Text) -> Category:
@ -70,8 +74,15 @@ class NotTrainedError(Exception):
    """


+class TrainingFailedError(Exception):
+    """
+    The model wasn't able to complete the training and should not be used anymore.
+    """
+
+
 __all__ = (
    "BaseSentimentAnalyzer",
    "AlreadyTrainedError",
    "NotTrainedError",
+    "TrainingFailedError",
 )
--- a/unimore_bda_6/analysis/tf_text.py
+++ b/unimore_bda_6/analysis/tf_text.py
@ -1,82 +1,119 @@
 import tensorflow
+import logging

 from ..database import Text, Category, DatasetFunc
-from ..config import DATA_SET_SIZE
-from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
+from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS
+from ..tokenizer import BaseTokenizer
+from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError
+
+log = logging.getLogger(__name__)


 class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
-    def __init__(self, *args, **kwargs):
+    def __init__(self, tokenizer: BaseTokenizer):
        super().__init__()
        self.trained: bool = False

-        self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer()
+        self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer(tokenizer)
        self.model: tensorflow.keras.Sequential = self._build_model()
+        self.history: tensorflow.keras.callbacks.History | None = None

-    def _build_dataset(self, dataset_func: DatasetFunc) -> tensorflow.data.Dataset:
+    @staticmethod
+    def _build_dataset(dataset_func: DatasetFunc) -> tensorflow.data.Dataset:
        def dataset_func_with_tensor_tuple():
            for review in dataset_func():
                yield review.to_tensor_tuple()

-        return tensorflow.data.Dataset.from_generator(
+        log.debug("Creating dataset...")
+        dataset = tensorflow.data.Dataset.from_generator(
            dataset_func_with_tensor_tuple,
            output_signature=(
                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
-                tensorflow.TensorSpec(shape=(5,), dtype=tensorflow.float32, name="category"),
+                tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="category"),
            )
        )

-    def _build_model(self) -> tensorflow.keras.Sequential:
-        return tensorflow.keras.Sequential([
+        log.debug("Caching dataset...")
+        dataset = dataset.cache()
+
+        log.debug("Configuring dataset prefetch...")
+        dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)
+
+        return dataset
+
+    @staticmethod
+    def _build_model() -> tensorflow.keras.Sequential:
+        log.debug("Creating %s model...", tensorflow.keras.Sequential)
+        model = tensorflow.keras.Sequential([
            tensorflow.keras.layers.Embedding(
-                input_dim=self.MAX_FEATURES + 1,
-                output_dim=self.EMBEDDING_DIM,
+                input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
+                output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
            ),
-            # tensorflow.keras.layers.Dropout(0.2),
+            tensorflow.keras.layers.Dropout(0.2),
            tensorflow.keras.layers.GlobalAveragePooling1D(),
-            # tensorflow.keras.layers.Dropout(0.2),
+            tensorflow.keras.layers.Dropout(0.2),
            tensorflow.keras.layers.Dense(5, activation="softmax"),
        ])
+        log.debug("Compiling model: %s", model)
+        model.compile(
+            optimizer=tensorflow.keras.optimizers.Adam(global_clipnorm=1.0),
+            loss=tensorflow.keras.losses.CategoricalCrossentropy(),
+            metrics=[
+                tensorflow.keras.metrics.CategoricalAccuracy(),
+            ]
+        )
+        log.debug("Compiled model: %s", model)
+        return model

-    def _build_vectorizer(self) -> tensorflow.keras.layers.TextVectorization:
-        return tensorflow.keras.layers.TextVectorization(max_tokens=self.MAX_FEATURES)
-
-    def __vectorize_data(self, text, category):
-        text = tensorflow.expand_dims(text, -1)  # TODO: ??????
-        return self.text_vectorization_layer(text), category
-
-    MAX_FEATURES = 2500
-    EMBEDDING_DIM = 24
-    """
-    Count of possible "semantic meanings" of words, represented as dimensions of a tensor.
-    """
-
-    EPOCHS = 3
+    @staticmethod
+    def _build_vectorizer(tokenizer: BaseTokenizer) -> tensorflow.keras.layers.TextVectorization:
+        return tensorflow.keras.layers.TextVectorization(
+            standardize=tokenizer.tokenize_tensorflow,
+            max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__
+        )

    def train(self, dataset_func: DatasetFunc) -> None:
        if self.trained:
+            log.error("Tried to train an already trained model.")
            raise AlreadyTrainedError()

+        log.debug("Building dataset...")
        training_set = self._build_dataset(dataset_func)
+        log.debug("Built dataset: %s", training_set)

+        log.debug("Preparing training_set for %s...", self.text_vectorization_layer.adapt)
        only_text_set = training_set.map(lambda text, category: text)
+        log.debug("Adapting text_vectorization_layer: %s", self.text_vectorization_layer)
        self.text_vectorization_layer.adapt(only_text_set)
-        training_set = training_set.map(self.__vectorize_data)
+        log.debug("Adapted text_vectorization_layer: %s", self.text_vectorization_layer)

-        # self.model.compile(loss=tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer="adam", metrics=["accuracy"])
-        self.model.compile(loss=tensorflow.keras.losses.MeanAbsoluteError(), optimizer="adam", metrics=["accuracy"])
+        log.debug("Preparing training_set for %s...", self.model.fit)
+        training_set = training_set.map(lambda text, category: (self.text_vectorization_layer(text), category))
+        log.info("Training: %s", self.model)
+        self.history: tensorflow.keras.callbacks.History | None  = self.model.fit(
+            training_set,
+            epochs=TENSORFLOW_EPOCHS.__wrapped__,
+            callbacks=[
+                tensorflow.keras.callbacks.TerminateOnNaN()
+            ])
+        log.info("Trained: %s", self.model)

-        self.model.fit(training_set, epochs=self.EPOCHS)
+        if len(self.history.epoch) < TENSORFLOW_EPOCHS.__wrapped__:
+            log.error("Model %s training failed: only %d epochs computed", self.model, len(self.history.epoch))
+            raise TrainingFailedError()
+        else:
+            log.info("Model %s training succeeded!", self.model)

        self.trained = True

    def use(self, text: Text) -> Category:
        if not self.trained:
+            log.error("Tried to use a non-trained model.")
            raise NotTrainedError()

-        vector = self.text_vectorization_layer(tensorflow.expand_dims(text, -1))
+        vector = self.text_vectorization_layer(text)

-        prediction = self.model.predict(vector)
+        prediction = self.model.predict(vector, verbose=False)

        max_i = None
        max_p = None
@ -84,5 +121,6 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
            if max_p is None or p > max_p:
                max_i = i
                max_p = p
+        result = float(max_i) + 1.0

-        return float(max_i) + 1.0
+        return result
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@ -49,10 +49,55 @@ def DATA_SET_SIZE(val: str | None) -> int:
    """
    The number of reviews from each category to fetch for the datasets.

-    Defaults to `1000`.
+    Defaults to `1750`.
    """
    if val is None:
-        return 1000
+        return 1750
+    try:
+        return int(val)
+    except ValueError:
+        raise cfig.InvalidValueError("Not an int.")
+
+
+@config.optional()
+def TENSORFLOW_MAX_FEATURES(val: str | None) -> int:
+    """
+    The maximum vocabulary size (number of distinct tokens) to use in Tensorflow models.
+
+    Defaults to `30000`.
+    """
+    if val is None:
+        return 30000
+    try:
+        return int(val)
+    except ValueError:
+        raise cfig.InvalidValueError("Not an int.")
+
+
+@config.optional()
+def TENSORFLOW_EMBEDDING_SIZE(val: str | None) -> int:
+    """
+    The number of dimensions of each embedding vector (the embedding layer's output size) to use in Tensorflow models.
+
+    Defaults to `12`.
+    """
+    if val is None:
+        return 12
+    try:
+        return int(val)
+    except ValueError:
+        raise cfig.InvalidValueError("Not an int.")
+
+
+@config.optional()
+def TENSORFLOW_EPOCHS(val: str | None) -> int:
+    """
+    The number of epochs to train Tensorflow models for.
+
+    Defaults to `15`.
+    """
+    if val is None:
+        return 15
    try:
        return int(val)
    except ValueError:
@ -65,6 +110,9 @@ __all__ = (
    "MONGO_PORT",
    "WORKING_SET_SIZE",
    "DATA_SET_SIZE",
+    "TENSORFLOW_MAX_FEATURES",
+    "TENSORFLOW_EMBEDDING_SIZE",
+    "TENSORFLOW_EPOCHS",
 )


--- a/unimore_bda_6/database/cache.py
+++ b/unimore_bda_6/database/cache.py
@ -36,7 +36,7 @@ def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:

 def load_cache(path: str | pathlib.Path) -> DatasetFunc:
    """
-    Load the contents of a directory
+    Build a `DatasetFunc` that loads the `Review` objects pickled in the given cache directory.
    """
    path = pathlib.Path(path)

@ -47,8 +47,10 @@ def load_cache(path: str | pathlib.Path) -> DatasetFunc:
        document_paths = path.iterdir()
        for document_path in document_paths:
            document_path = pathlib.Path(document_path)
+
            if not str(document_path).endswith(".pickle"):
                log.debug("Ignoring non-pickle file: %s", document_path)
+                continue

            log.debug("Loading pickle file: %s", document_path)
            with open(document_path, "rb") as file:
@ -58,8 +60,22 @@ def load_cache(path: str | pathlib.Path) -> DatasetFunc:
    return data_cache_loader


+def delete_cache(path: str | pathlib.Path) -> None:
+    """
+    Delete the given cache directory.
+    """
+    path = pathlib.Path(path)
+
+    if not path.exists():
+        raise FileNotFoundError("The specified path does not exist.")
+
+    log.warning("Deleting cache directory: %s", path)
+    shutil.rmtree(path)
+
+
 __all__ = (
    "DatasetFunc",
    "store_cache",
    "load_cache",
+    "delete_cache",
 )
--- a/unimore_bda_6/database/collections.py
+++ b/unimore_bda_6/database/collections.py
@ -1,4 +1,3 @@
-import contextlib
 import pymongo.collection
 import typing as t
 import bson
@ -30,8 +29,8 @@ def reviews_collection(db: pymongo.MongoClient) -> pymongo.collection.Collection
    Access the ``reviews`` collection in the ``reviews`` database through the given MongoDB client, and return it.
    """
    log.debug("Accessing the reviews collection...")
-    collection = db.reviews.reviews
-    log.debug("Collection accessed successfully: %s", collection)
+    collection: pymongo.collection.Collection[MongoReview] = db.reviews.reviews
+    log.debug("Collection accessed successfully: %s", collection.name)
    return collection


--- a/unimore_bda_6/database/datatypes.py
+++ b/unimore_bda_6/database/datatypes.py
@ -1,5 +1,8 @@
 import tensorflow
 from .collections import MongoReview
+import logging
+
+log = logging.getLogger(__name__)


 Text = str
@ -33,19 +36,21 @@ class Review:
        return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)

    def to_tensor_category(self) -> tensorflow.Tensor:
-        return tensorflow.convert_to_tensor([
+        return tensorflow.convert_to_tensor([[
            1.0 if self.category == 1.0 else 0.0,
            1.0 if self.category == 2.0 else 0.0,
            1.0 if self.category == 3.0 else 0.0,
            1.0 if self.category == 4.0 else 0.0,
            1.0 if self.category == 5.0 else 0.0,
-        ], dtype=tensorflow.float32)
+        ]], dtype=tensorflow.float32)

    def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
-        return (
+        t = (
            self.to_tensor_text(),
            self.to_tensor_category(),
        )
+        log.debug("Converted %s", t)
+        return t


 __all__ = (
--- a/unimore_bda_6/database/queries.py
+++ b/unimore_bda_6/database/queries.py
@ -54,6 +54,12 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
                {"$match": {"overall": 5.0}},
                {"$sample": {"size": amount}},
            ],
+        }},
+        {"$addFields": {
+            "sortKey": {"$rand": {}},
+        }},
+        {"$sort": {
+            "sortKey": 1,
        }}
    ])

@ -101,6 +107,12 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
                    ],
                }}
            ],
+        }},
+        {"$addFields": {
+            "sortKey": {"$rand": {}},
+        }},
+        {"$sort": {
+            "sortKey": 1,
        }}
    ])

--- a/unimore_bda_6/log.py
+++ b/unimore_bda_6/log.py
@ -15,7 +15,7 @@ def install_log_handler(loggers: list[logging.Logger] = None):
    for logger in loggers:
        coloredlogs.install(
            logger=logger,
-            level="INFO",
+            level="DEBUG",
            fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
            style="{",
            level_styles=dict(
@ -34,6 +34,9 @@ def install_log_handler(loggers: list[logging.Logger] = None):
        )
        this_log.debug("Installed custom log handler on: %s", logger)

+    logging.getLogger("unimore_bda_6.database.cache").setLevel("INFO")
+    logging.getLogger("unimore_bda_6.database.datatypes").setLevel("INFO")
+

 _passage_counts = collections.defaultdict(lambda: 0)