Made good progress

How does text vectorization in tensorflow work?
2024-11-21 23:44:19 +00:00 · 2023-02-05 17:40:22 +01:00 · 2023-02-05 17:40:22 +01:00 · 3abba24ca2
commit 3abba24ca2
parent dcfc4fbc3b
13 changed files with 286 additions and 158 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,8 @@
 data/raw/
 data/db/
 data/nltk/
+data/training/
+data/evaluation/

 ##################
 # Python ignores #
--- a/.idea/runConfigurations/unimore_bda_6.xml
+++ b/.idea/runConfigurations/unimore_bda_6.xml
@@ -4,6 +4,7 @@
    <option name="INTERPRETER_OPTIONS" value="" />
    <option name="PARENT_ENVS" value="true" />
    <envs>
+      <env name="CONFIRM_OVERWRITE" value="False" />
      <env name="DATA_SET_SIZE" value="750" />
      <env name="NLTK_DATA" value="./data/nltk" />
      <env name="PYTHONUNBUFFERED" value="1" />
--- a/unimore-bda-6.iml
+++ b/unimore-bda-6.iml
@@ -8,6 +8,8 @@
      <excludeFolder url="file://$MODULE_DIR$/data/raw" />
      <excludeFolder url="file://$MODULE_DIR$/data/nltk" />
      <excludeFolder url="file://$MODULE_DIR$/.venv" />
+      <excludeFolder url="file://$MODULE_DIR$/data/evaluation" />
+      <excludeFolder url="file://$MODULE_DIR$/data/training" />
    </content>
    <orderEntry type="jdk" jdkName="Poetry (unimore-bda-6)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
--- a/unimore_bda_6/main.py
+++ b/unimore_bda_6/main.py
@@ -2,7 +2,7 @@ import logging
 import tensorflow

 from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, sample_reviews_polar, sample_reviews_varied
+from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
 from .analysis.tf_text import TensorflowSentimentAnalyzer
 from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
@@ -18,43 +18,53 @@ def main():
        log.debug("Tensorflow successfully found GPU acceleration!")

    for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
-        # Tensorflow-based
-        for Tokenizer in [
-            LowercaseTokenizer
-        ]:
-            tokenizer = Tokenizer()
-            model = TensorflowSentimentAnalyzer()
+        for SentimentAnalyzer in [TensorflowSentimentAnalyzer, NLTKSentimentAnalyzer]:
+            for Tokenizer in [
+                # NLTKWordTokenizer,
+                # PottsTokenizer,
+                # PottsTokenizerWithNegation,
+                LowercaseTokenizer,
+            ]:
+                tokenizer = Tokenizer()
+                model = SentimentAnalyzer(tokenizer=tokenizer)

-            with mongo_reviews_collection_from_config() as collection:
-                ...
+                with mongo_client_from_config() as db:
+                    log.debug("Finding the reviews MongoDB collection...")
+                    collection = reviews_collection(db)

-        # NLTK-based
-        for Tokenizer in [
-            NLTKWordTokenizer,
-            PottsTokenizer,
-            PottsTokenizerWithNegation,
-            LowercaseTokenizer,
-        ]:
-            tokenizer = Tokenizer()
-            model = NLTKSentimentAnalyzer(tokenizer=tokenizer)
+                    try:
+                        training_cache = load_cache("./data/training")
+                        evaluation_cache = load_cache("./data/evaluation")
+                    except FileNotFoundError:
+                        log.debug("Gathering datasets...")
+                        reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+                        reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)

-            with mongo_reviews_collection_from_config() as collection:
-                reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
-                reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+                        log.debug("Caching datasets...")
+                        store_cache(reviews_training, "./data/training")
+                        store_cache(reviews_evaluation, "./data/evaluation")
+                        del reviews_training
+                        del reviews_evaluation

-                log.info("Training model %s", model)
-                model.train(reviews_training)
-                log.info("Evaluating model %s", model)
-                correct, evaluated = model.evaluate(reviews_evaluation)
-                log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
+                        training_cache = load_cache("./data/training")
+                        evaluation_cache = load_cache("./data/evaluation")
+                        log.debug("Caches stored and loaded successfully!")
+                    else:
+                        log.debug("Caches loaded successfully!")

-            # try:
-            #     print("Manual testing for %s" % model)
-            #     print("Input an empty string to continue to the next model.")
-            #     while inp := input():
-            #         print(model.use(inp))
-            # except KeyboardInterrupt:
-            #     pass
+                    log.info("Training model: %s", model)
+                    model.train(training_cache)
+                    log.info("Evaluating model: %s", model)
+                    evaluation_results = model.evaluate(evaluation_cache)
+                    log.info("%s", evaluation_results)
+
+                # try:
+                #     print("Manual testing for %s" % model)
+                #     print("Input an empty string to continue to the next model.")
+                #     while inp := input():
+                #         print(model.use(inp))
+                # except KeyboardInterrupt:
+                #     pass


 if __name__ == "__main__":
--- a/unimore_bda_6/analysis/nltk_sentiment.py
+++ b/unimore_bda_6/analysis/nltk_sentiment.py
@@ -6,7 +6,7 @@ import logging
 import typing as t
 import itertools

-from ..database import Text, Category, Review
+from ..database import Text, Category, Review, DatasetFunc
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 from ..log import count_passage
 from ..tokenizer import BaseTokenizer
@@ -31,7 +31,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
    def __repr__(self):
        return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"

-    def __tokenize_datatuple(self, datatuple: Review) -> tuple[TokenBag, Category]:
+    def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
        """
        Convert the `Text` of a `DataTuple` to a `TokenBag`.
        """
@@ -67,13 +67,16 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
        count_passage(log, "extract_features", 100)
        return self.model.extract_features(data[0]), data[1]

-    def train(self, dataset: t.Iterator[Review]) -> None:
+    def train(self, dataset_func: DatasetFunc) -> None:
        # Forbid retraining the model
        if self.trained:
            raise AlreadyTrainedError()

+        # Get a generator
+        dataset: t.Generator[Review] = dataset_func()
+
        # Tokenize the dataset
-        dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_datatuple, dataset)
+        dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, dataset)

        # Cleanly duplicate the dataset iterator
        # Reduce average memory footprint, but not maximum
--- a/unimore_bda_6/analysis/tf_text.py
+++ b/unimore_bda_6/analysis/tf_text.py
@@ -2,48 +2,52 @@ import tensorflow
 import itertools
 import typing as t

-from ..database import Text, Category, Review
+from ..database import Text, Category, Review, DatasetFunc
 from ..tokenizer import BaseTokenizer
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError


 class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
-    def __init__(self):
+    def __init__(self, *, tokenizer: BaseTokenizer):
        super().__init__()
        self.trained = False
-        self.text_vectorization_layer = None
        self.neural_network: tensorflow.keras.Sequential | None = None
-
-    @classmethod
-    def __bda_dataset_to_tf_dataset(cls, dataset_func: t.Callable[[], t.Iterator[Review]]) -> tensorflow.data.Dataset:
-        """
-        Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
-        """
-        return tensorflow.data.Dataset.from_generator(
-            dataset_func,
-            output_signature=(
-                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
-                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
-            )
-        )
+        self.tokenizer: BaseTokenizer = tokenizer  # TODO

    MAX_FEATURES = 20000
    EMBEDDING_DIM = 16
    EPOCHS = 10

-    def train(self, training_set: t.Iterator[Review]) -> None:
+    def train(self, dataset_func: DatasetFunc) -> None:
        if self.trained:
            raise AlreadyTrainedError()

-        training_set = self.__bda_dataset_to_tf_dataset(training_set)
+        def dataset_func_with_tensor_text():
+            for review in dataset_func():
+                yield review.to_tensor_text()

-        self.text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
+        text_set = tensorflow.data.Dataset.from_generator(
+            dataset_func_with_tensor_text,
+            output_signature=tensorflow.TensorSpec(shape=(), dtype=tensorflow.string)
+        )
+
+        text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
            max_tokens=self.MAX_FEATURES,
            standardize=self.tokenizer.tokenize_tensorflow,
        )
-        self.text_vectorization_layer.adapt(map(lambda t: t[0], training_set))
+        text_vectorization_layer.adapt(text_set)

-        training_set = training_set.map(self.text_vectorization_layer)
+        def dataset_func_with_tensor_tuple():
+            for review in dataset_func():
+                yield review.to_tensor_tuple()
+
+        training_set = tensorflow.data.Dataset.from_generator(
+            dataset_func_with_tensor_tuple,
+            output_signature=(
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.float32, name="category"),
+            )
+        )

        # I have no idea of what I'm doing here
        self.neural_network = tensorflow.keras.Sequential([
@@ -59,6 +63,8 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
            metrics=tensorflow.metrics.BinaryAccuracy(threshold=0.0)
        )

+        training_set = training_set.map(text_vectorization_layer)
+
        self.neural_network.fit(
            training_set,
            epochs=self.EPOCHS,
--- a/unimore_bda_6/database/__init__.py
+++ b/unimore_bda_6/database/__init__.py
@@ -0,0 +1,5 @@
+from .cache import *
+from .collections import *
+from .connection import *
+from .datatypes import *
+from .queries import *
--- a/unimore_bda_6/database/cache.py
+++ b/unimore_bda_6/database/cache.py
@@ -0,0 +1,66 @@
+import typing as t
+import logging
+import shutil
+import pathlib
+import pickle
+
+from .datatypes import Review
+
+log = logging.getLogger(__name__)
+
+
+DatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]]
+
+
+def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
+    """
+    Store the contents of the given `Review` iterator to different files in a directory at the given path.
+    """
+    path = pathlib.Path(path)
+
+    if path.exists():
+        raise FileExistsError("Specified cache path already exists.")
+
+    # Create the temporary directory
+    log.debug("Creating cache directory: %s", path)
+    path.mkdir(parents=True)
+
+    # Write the documents to path/{index}.pickle
+    for index, document in enumerate(reviews):
+        document_path = path.joinpath(f"{index}.pickle")
+
+        log.debug("Storing pickle file: %s", document_path)
+        with open(document_path, "wb") as file:
+            pickle.dump(document, file)
+
+
+def load_cache(path: str | pathlib.Path) -> DatasetFunc:
+    """
+    Load the contents of a directory
+    """
+    path = pathlib.Path(path)
+
+    if not path.exists():
+        log.error("Specified cache directory does not exist: %s", path)
+        raise FileNotFoundError("The specified path does not exist.")
+
+    def data_cache_loader():
+        document_paths = path.iterdir()
+        for document_path in document_paths:
+            document_path = pathlib.Path(document_path)
+            if not str(document_path).endswith(".pickle"):
+                log.debug("Ignoring non-pickle file: %s", document_path)
+
+            log.debug("Loading pickle file: %s", document_path)
+            with open(document_path, "rb") as file:
+                result: Review = pickle.load(file)
+                yield result
+
+    return data_cache_loader
+
+
+__all__ = (
+    "DatasetFunc",
+    "store_cache",
+    "load_cache",
+)
--- a/unimore_bda_6/database/collections.py
+++ b/unimore_bda_6/database/collections.py
@@ -0,0 +1,41 @@
+import contextlib
+import pymongo.collection
+import typing as t
+import bson
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class MongoReview(t.TypedDict):
+    """
+    A review as it is stored on MongoDB.
+
+    .. warning:: Do not instantiate: this is only for type hints!
+    """
+    _id: bson.ObjectId
+    reviewerID: str
+    asin: str
+    reviewerName: str
+    helpful: tuple[int, int]
+    reviewText: str
+    overall: float
+    summary: str
+    unixReviewTime: int
+    reviewTime: str
+
+
+def reviews_collection(db: pymongo.MongoClient) -> pymongo.collection.Collection[MongoReview]:
+    """
+    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
+    """
+    log.debug("Accessing the reviews collection...")
+    collection = db.reviews.reviews
+    log.debug("Collection accessed successfully: %s", collection)
+    return collection
+
+
+__all__ = (
+    "MongoReview",
+    "reviews_collection",
+)
--- a/unimore_bda_6/database/connection.py
+++ b/unimore_bda_6/database/connection.py
@@ -0,0 +1,32 @@
+import pymongo
+import contextlib
+import typing as t
+import logging
+
+from ..config import MONGO_HOST, MONGO_PORT
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
+    """
+    Create a new MongoDB client and yield it.
+    """
+    log.debug("Opening connection to MongoDB...")
+    client: pymongo.MongoClient = pymongo.MongoClient(
+        host=MONGO_HOST.__wrapped__,
+        port=MONGO_PORT.__wrapped__,
+    )
+    log.info("Opened connection to MongoDB!")
+
+    yield client
+
+    log.info("Closing connection to MongoDB...")
+    client.close()
+    log.debug("Closed connection to MongoDB!")
+
+
+__all__ = (
+    "mongo_client_from_config",
+)
--- a/unimore_bda_6/database/datatypes.py
+++ b/unimore_bda_6/database/datatypes.py
@@ -0,0 +1,49 @@
+import tensorflow
+from .collections import MongoReview
+
+
+Text = str
+Category = float
+
+
+class Review:
+    def __init__(self, text: Text, category: Category):
+        self.text: str = text
+        self.category: float = category
+
+    @classmethod
+    def from_mongoreview(cls, review: MongoReview):
+        return cls(
+            text=review["reviewText"],
+            category=review["overall"],
+        )
+
+    def __repr__(self):
+        return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
+
+    def __getitem__(self, item):
+        if item == 0 or item == "text":
+            return self.text
+        elif item == 1 or item == "category":
+            return self.category
+        else:
+            raise KeyError(item)
+
+    def to_tensor_text(self) -> tensorflow.Tensor:
+        return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
+
+    def to_tensor_category(self) -> tensorflow.Tensor:
+        return tensorflow.convert_to_tensor(self.category / 5.0, dtype=tensorflow.float32)
+
+    def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
+        return (
+            self.to_tensor_text(),
+            self.to_tensor_category(),
+        )
+
+
+__all__ = (
+    "Text",
+    "Category",
+    "Review",
+)
--- a/unimore_bda_6/database/queries.py
+++ b/unimore_bda_6/database/queries.py
@@ -1,101 +1,14 @@
-import typing as t
-import pymongo
-import pymongo.collection
-import contextlib
-import bson
 import logging
-import tensorflow
+import pymongo
+import typing as t

-from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE
+from ..config import WORKING_SET_SIZE
+from .collections import MongoReview
+from .datatypes import Review

 log = logging.getLogger(__name__)


-class MongoReview(t.TypedDict):
-    """
-    A review as it is stored on MongoDB.
-
-    .. warning:: Do not instantiate: this is only for type hints!
-    """
-    _id: bson.ObjectId
-    reviewerID: str
-    asin: str
-    reviewerName: str
-    helpful: tuple[int, int]
-    reviewText: str
-    overall: float
-    summary: str
-    unixReviewTime: int
-    reviewTime: str
-
-
-Text = str
-Category = float
-
-
-class Review:
-    def __init__(self, text: Text, category: Category):
-        self.text: Text = text
-        self.category: Category = category
-
-    @classmethod
-    def from_mongoreview(cls, review: MongoReview):
-        return cls(
-            text=review["reviewText"],
-            category=review["overall"],
-        )
-
-    def __repr__(self):
-        return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
-
-    def __getitem__(self, item):
-        if item == 0 or item == "text":
-            return self.text
-        elif item == 1 or item == "category":
-            return self.category
-        else:
-            raise KeyError(item)
-
-    def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
-        return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string), tensorflow.convert_to_tensor(self.category, dtype=tensorflow.string)
-
-
-@contextlib.contextmanager
-def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
-    """
-    Create a new MongoDB client and yield it.
-    """
-    log.debug("Opening connection to MongoDB...")
-    client: pymongo.MongoClient = pymongo.MongoClient(
-        host=MONGO_HOST.__wrapped__,
-        port=MONGO_PORT.__wrapped__,
-    )
-    log.info("Opened connection to MongoDB!")
-
-    yield client
-
-    log.info("Closing connection to MongoDB...")
-    client.close()
-    log.debug("Closed connection to MongoDB!")
-
-
-@contextlib.contextmanager
-def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[MongoReview]:
-    """
-    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
-    """
-    with mongo_client_from_config() as db:
-        log.debug("Accessing the reviews collection...")
-        collection = db.reviews.reviews
-        log.debug("Collection accessed successfully: %s", collection)
-        yield collection
-
-
-class DatasetFunc(t.Protocol):
-    def __call__(self) -> t.Iterator[Review]:
-        pass
-
-
 def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
    """
    Get ``amount`` random reviews from the ``reviews`` collection.
@@ -108,6 +21,7 @@ def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.
    ])

    cursor = map(Review.from_mongoreview, cursor)
+
    return cursor


@@ -123,7 +37,6 @@ def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating:
        {"$sample": {"size": amount}},
    ])

-    cursor = map(Review.from_mongoreview, cursor)
    return cursor


@@ -145,6 +58,7 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
    ])

    cursor = map(Review.from_mongoreview, cursor)
+
    return cursor


@@ -191,16 +105,11 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
    ])

    cursor = map(Review.from_mongoreview, cursor)
+
    return cursor


 __all__ = (
-    "Text",
-    "Category",
-    "Review",
-    "DatasetFunc",
-    "mongo_client_from_config",
-    "mongo_reviews_collection_from_config",
    "sample_reviews",
    "sample_reviews_by_rating",
    "sample_reviews_polar",
--- a/unimore_bda_6/tokenizer/lower.py
+++ b/unimore_bda_6/tokenizer/lower.py
@@ -8,4 +8,6 @@ class LowercaseTokenizer(BaseTokenizer):
        return text.lower().split()

    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
-        return tensorflow.strings.lower(text)
+        text = tensorflow.strings.lower(text)
+        text = tensorflow.expand_dims(text, -1, name="tokens")
+        return text