diff --git a/.gitignore b/.gitignore
index 8467da3..25241c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,8 @@
data/raw/
data/db/
data/nltk/
+data/training/
+data/evaluation/
##################
# Python ignores #
diff --git a/.idea/runConfigurations/unimore_bda_6.xml b/.idea/runConfigurations/unimore_bda_6.xml
index f113c82..df78a5b 100644
--- a/.idea/runConfigurations/unimore_bda_6.xml
+++ b/.idea/runConfigurations/unimore_bda_6.xml
@@ -4,6 +4,7 @@
+
diff --git a/unimore-bda-6.iml b/unimore-bda-6.iml
index 80da260..b8d6c99 100644
--- a/unimore-bda-6.iml
+++ b/unimore-bda-6.iml
@@ -8,6 +8,8 @@
+
+
diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index 9bbd0ba..b9871eb 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -2,7 +2,7 @@ import logging
import tensorflow
from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, sample_reviews_polar, sample_reviews_varied
+from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache
from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
from .analysis.tf_text import TensorflowSentimentAnalyzer
from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
@@ -18,43 +18,53 @@ def main():
log.debug("Tensorflow successfully found GPU acceleration!")
for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
- # Tensorflow-based
- for Tokenizer in [
- LowercaseTokenizer
- ]:
- tokenizer = Tokenizer()
- model = TensorflowSentimentAnalyzer()
+ for SentimentAnalyzer in [TensorflowSentimentAnalyzer, NLTKSentimentAnalyzer]:
+ for Tokenizer in [
+ # NLTKWordTokenizer,
+ # PottsTokenizer,
+ # PottsTokenizerWithNegation,
+ LowercaseTokenizer,
+ ]:
+ tokenizer = Tokenizer()
+ model = SentimentAnalyzer(tokenizer=tokenizer)
- with mongo_reviews_collection_from_config() as collection:
- ...
+                with mongo_client_from_config() as db:
+                    log.debug("Finding the reviews MongoDB collection...")
+                    collection = reviews_collection(db)
- # NLTK-based
- for Tokenizer in [
- NLTKWordTokenizer,
- PottsTokenizer,
- PottsTokenizerWithNegation,
- LowercaseTokenizer,
- ]:
- tokenizer = Tokenizer()
- model = NLTKSentimentAnalyzer(tokenizer=tokenizer)
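+                    # Reuse previously cached datasets when available; otherwise,
+                    # sample new ones from MongoDB and cache them for later runs.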
+                    try:
+                        training_cache = load_cache("./data/training")
+                        evaluation_cache = load_cache("./data/evaluation")
+                    except FileNotFoundError:
+                        log.debug("Gathering datasets...")
+                        reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+                        reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
- with mongo_reviews_collection_from_config() as collection:
- reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
- reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+ log.debug("Caching datasets...")
+ store_cache(reviews_training, "./data/training")
+ store_cache(reviews_evaluation, "./data/evaluation")
+ del reviews_training
+ del reviews_evaluation
- log.info("Training model %s", model)
- model.train(reviews_training)
- log.info("Evaluating model %s", model)
- correct, evaluated = model.evaluate(reviews_evaluation)
- log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
+                        training_cache = load_cache("./data/training")
+                        evaluation_cache = load_cache("./data/evaluation")
+                        log.debug("Caches stored and loaded successfully!")
+                    else:
+                        log.debug("Caches loaded successfully!")
- # try:
- # print("Manual testing for %s" % model)
- # print("Input an empty string to continue to the next model.")
- # while inp := input():
- # print(model.use(inp))
- # except KeyboardInterrupt:
- # pass
+ log.info("Training model: %s", model)
+ model.train(training_cache)
+ log.info("Evaluating model: %s", model)
+ evaluation_results = model.evaluate(evaluation_cache)
+ log.info("%s", evaluation_results)
+
+                    # try:
+                    #     print("Manual testing for %s" % model)
+                    #     print("Input an empty string to continue to the next model.")
+                    #     while inp := input():
+                    #         print(model.use(inp))
+                    # except KeyboardInterrupt:
+                    #     pass
if __name__ == "__main__":
diff --git a/unimore_bda_6/analysis/nltk_sentiment.py b/unimore_bda_6/analysis/nltk_sentiment.py
index a1fd2ea..1feaa95 100644
--- a/unimore_bda_6/analysis/nltk_sentiment.py
+++ b/unimore_bda_6/analysis/nltk_sentiment.py
@@ -6,7 +6,7 @@ import logging
import typing as t
import itertools
-from ..database import Text, Category, Review
+from ..database import Text, Category, Review, DatasetFunc
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
from ..log import count_passage
from ..tokenizer import BaseTokenizer
@@ -31,7 +31,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
def __repr__(self):
return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
- def __tokenize_datatuple(self, datatuple: Review) -> tuple[TokenBag, Category]:
+ def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
"""
-        Convert the `Text` of a `DataTuple` to a `TokenBag`.
+        Convert the `Text` of a `Review` to a `TokenBag`.
"""
@@ -67,13 +67,16 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
count_passage(log, "extract_features", 100)
return self.model.extract_features(data[0]), data[1]
- def train(self, dataset: t.Iterator[Review]) -> None:
+ def train(self, dataset_func: DatasetFunc) -> None:
# Forbid retraining the model
if self.trained:
raise AlreadyTrainedError()
+        # Obtain a fresh generator from the dataset function
+        dataset: t.Generator[Review, None, None] = dataset_func()
+
# Tokenize the dataset
- dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_datatuple, dataset)
+ dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, dataset)
# Cleanly duplicate the dataset iterator
# Reduce average memory footprint, but not maximum
diff --git a/unimore_bda_6/analysis/tf_text.py b/unimore_bda_6/analysis/tf_text.py
index fc4780a..4c443c6 100644
--- a/unimore_bda_6/analysis/tf_text.py
+++ b/unimore_bda_6/analysis/tf_text.py
@@ -2,48 +2,52 @@ import tensorflow
import itertools
import typing as t
-from ..database import Text, Category, Review
+from ..database import Text, Category, Review, DatasetFunc
from ..tokenizer import BaseTokenizer
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
- def __init__(self):
+ def __init__(self, *, tokenizer: BaseTokenizer):
super().__init__()
self.trained = False
- self.text_vectorization_layer = None
self.neural_network: tensorflow.keras.Sequential | None = None
-
- @classmethod
- def __bda_dataset_to_tf_dataset(cls, dataset_func: t.Callable[[], t.Iterator[Review]]) -> tensorflow.data.Dataset:
- """
- Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
- """
- return tensorflow.data.Dataset.from_generator(
- dataset_func,
- output_signature=(
- tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
- tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
- )
- )
+ self.tokenizer: BaseTokenizer = tokenizer # TODO
MAX_FEATURES = 20000
EMBEDDING_DIM = 16
EPOCHS = 10
- def train(self, training_set: t.Iterator[Review]) -> None:
+ def train(self, dataset_func: DatasetFunc) -> None:
if self.trained:
raise AlreadyTrainedError()
- training_set = self.__bda_dataset_to_tf_dataset(training_set)
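+        # First pass over the dataset: yield only the text tensors, so that the
+        # TextVectorization layer can adapt its vocabulary to them.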
+ def dataset_func_with_tensor_text():
+ for review in dataset_func():
+ yield review.to_tensor_text()
- self.text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
+ text_set = tensorflow.data.Dataset.from_generator(
+ dataset_func_with_tensor_text,
+ output_signature=tensorflow.TensorSpec(shape=(), dtype=tensorflow.string)
+ )
+
+ text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
max_tokens=self.MAX_FEATURES,
standardize=self.tokenizer.tokenize_tensorflow,
)
- self.text_vectorization_layer.adapt(map(lambda t: t[0], training_set))
+ text_vectorization_layer.adapt(text_set)
- training_set = training_set.map(self.text_vectorization_layer)
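+        # Second pass over the dataset: yield (text, category) tensor tuples to train on.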
+        def dataset_func_with_tensor_tuple():
+            for review in dataset_func():
+                yield review.to_tensor_tuple()
+
+ training_set = tensorflow.data.Dataset.from_generator(
+ dataset_func_with_tensor_tuple,
+ output_signature=(
+ tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
+ tensorflow.TensorSpec(shape=(), dtype=tensorflow.float32, name="category"),
+ )
+ )
# I have no idea of what I'm doing here
self.neural_network = tensorflow.keras.Sequential([
@@ -59,6 +63,8 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
metrics=tensorflow.metrics.BinaryAccuracy(threshold=0.0)
)
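+        # Vectorize the text component of each (text, category) pair before training.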
+        training_set = training_set.map(lambda text, category: (text_vectorization_layer(text), category))
+
self.neural_network.fit(
training_set,
epochs=self.EPOCHS,
diff --git a/unimore_bda_6/database/__init__.py b/unimore_bda_6/database/__init__.py
new file mode 100644
index 0000000..22ee5bf
--- /dev/null
+++ b/unimore_bda_6/database/__init__.py
@@ -0,0 +1,5 @@
+from .cache import *
+from .collections import *
+from .connection import *
+from .datatypes import *
+from .queries import *
diff --git a/unimore_bda_6/database/cache.py b/unimore_bda_6/database/cache.py
new file mode 100644
index 0000000..9db9829
--- /dev/null
+++ b/unimore_bda_6/database/cache.py
@@ -0,0 +1,66 @@
+import typing as t
+import logging
+import shutil
+import pathlib
+import pickle
+
+from .datatypes import Review
+
+log = logging.getLogger(__name__)
+
+
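+# A dataset is modeled as a callable returning a *fresh* generator of Reviews,
+# so that consumers can iterate over the same data more than once.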
+DatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]]
+
+
+def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
+ """
+    Store the reviews yielded by the given iterator as individual pickle files inside a new directory at the given path.
+ """
+ path = pathlib.Path(path)
+
+ if path.exists():
+ raise FileExistsError("Specified cache path already exists.")
+
+    # Create the cache directory
+ log.debug("Creating cache directory: %s", path)
+ path.mkdir(parents=True)
+
+ # Write the documents to path/{index}.pickle
+ for index, document in enumerate(reviews):
+ document_path = path.joinpath(f"{index}.pickle")
+
+ log.debug("Storing pickle file: %s", document_path)
+ with open(document_path, "wb") as file:
+ pickle.dump(document, file)
+
+
+def load_cache(path: str | pathlib.Path) -> DatasetFunc:
+ """
+    Load the reviews stored in a cache directory created by `store_cache`, returning a `DatasetFunc` that yields them.
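+
+    The returned `DatasetFunc` can be called multiple times; each call yields
+    the cached reviews again from the beginning.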
+ """
+ path = pathlib.Path(path)
+
+ if not path.exists():
+ log.error("Specified cache directory does not exist: %s", path)
+ raise FileNotFoundError("The specified path does not exist.")
+
+ def data_cache_loader():
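+        # Note: iterdir() yields entries in arbitrary order, so the order in
+        # which the reviews were stored is not necessarily preserved.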
+ document_paths = path.iterdir()
+ for document_path in document_paths:
+ document_path = pathlib.Path(document_path)
+ if not str(document_path).endswith(".pickle"):
+ log.debug("Ignoring non-pickle file: %s", document_path)
+
+ log.debug("Loading pickle file: %s", document_path)
+ with open(document_path, "rb") as file:
+ result: Review = pickle.load(file)
+ yield result
+
+ return data_cache_loader
+
+
+__all__ = (
+ "DatasetFunc",
+ "store_cache",
+ "load_cache",
+)
diff --git a/unimore_bda_6/database/collections.py b/unimore_bda_6/database/collections.py
new file mode 100644
index 0000000..7dd2469
--- /dev/null
+++ b/unimore_bda_6/database/collections.py
@@ -0,0 +1,41 @@
+import pymongo.collection
+import typing as t
+import bson
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class MongoReview(t.TypedDict):
+ """
+ A review as it is stored on MongoDB.
+
+ .. warning:: Do not instantiate: this is only for type hints!
+ """
+ _id: bson.ObjectId
+ reviewerID: str
+ asin: str
+ reviewerName: str
+ helpful: tuple[int, int]
+ reviewText: str
+ overall: float
+ summary: str
+ unixReviewTime: int
+ reviewTime: str
+
+
+def reviews_collection(db: pymongo.MongoClient) -> pymongo.collection.Collection[MongoReview]:
+ """
+    Access the ``reviews`` collection in the ``reviews`` database of the given client and return it.
+ """
+ log.debug("Accessing the reviews collection...")
+ collection = db.reviews.reviews
+ log.debug("Collection accessed successfully: %s", collection)
+ return collection
+
+
+__all__ = (
+ "MongoReview",
+ "reviews_collection",
+)
diff --git a/unimore_bda_6/database/connection.py b/unimore_bda_6/database/connection.py
new file mode 100644
index 0000000..2f8e7e3
--- /dev/null
+++ b/unimore_bda_6/database/connection.py
@@ -0,0 +1,32 @@
+import pymongo
+import contextlib
+import typing as t
+import logging
+
+from ..config import MONGO_HOST, MONGO_PORT
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def mongo_client_from_config() -> t.Iterator[pymongo.MongoClient]:
+ """
+ Create a new MongoDB client and yield it.
+ """
+ log.debug("Opening connection to MongoDB...")
+ client: pymongo.MongoClient = pymongo.MongoClient(
+ host=MONGO_HOST.__wrapped__,
+ port=MONGO_PORT.__wrapped__,
+ )
+ log.info("Opened connection to MongoDB!")
+
+    try:
+        yield client
+    finally:
+        log.info("Closing connection to MongoDB...")
+        client.close()
+        log.debug("Closed connection to MongoDB!")
+
+
+__all__ = (
+ "mongo_client_from_config",
+)
diff --git a/unimore_bda_6/database/datatypes.py b/unimore_bda_6/database/datatypes.py
new file mode 100644
index 0000000..32c65f1
--- /dev/null
+++ b/unimore_bda_6/database/datatypes.py
@@ -0,0 +1,49 @@
+import tensorflow
+from .collections import MongoReview
+
+
+Text = str
+Category = float
+
+
+class Review:
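+    """
+    A review, represented as its text and its category (the 1-to-5 star rating).
+    """
+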
+ def __init__(self, text: Text, category: Category):
+ self.text: str = text
+ self.category: float = category
+
+ @classmethod
+ def from_mongoreview(cls, review: MongoReview):
+ return cls(
+ text=review["reviewText"],
+ category=review["overall"],
+ )
+
+ def __repr__(self):
+ return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
+
+ def __getitem__(self, item):
+ if item == 0 or item == "text":
+ return self.text
+ elif item == 1 or item == "category":
+ return self.category
+ else:
+ raise KeyError(item)
+
+ def to_tensor_text(self) -> tensorflow.Tensor:
+ return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
+
+ def to_tensor_category(self) -> tensorflow.Tensor:
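+        # Scale the 1-to-5 star rating down into the [0, 1] range.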
+ return tensorflow.convert_to_tensor(self.category / 5.0, dtype=tensorflow.float32)
+
+ def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
+ return (
+ self.to_tensor_text(),
+ self.to_tensor_category(),
+ )
+
+
+__all__ = (
+ "Text",
+ "Category",
+ "Review",
+)
diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database/queries.py
similarity index 57%
rename from unimore_bda_6/database.py
rename to unimore_bda_6/database/queries.py
index 9828fd1..8a88b39 100644
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database/queries.py
@@ -1,101 +1,14 @@
-import typing as t
-import pymongo
-import pymongo.collection
-import contextlib
-import bson
import logging
-import tensorflow
+import pymongo
+import typing as t
-from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE
+from ..config import WORKING_SET_SIZE
+from .collections import MongoReview
+from .datatypes import Review
log = logging.getLogger(__name__)
-class MongoReview(t.TypedDict):
- """
- A review as it is stored on MongoDB.
-
- .. warning:: Do not instantiate: this is only for type hints!
- """
- _id: bson.ObjectId
- reviewerID: str
- asin: str
- reviewerName: str
- helpful: tuple[int, int]
- reviewText: str
- overall: float
- summary: str
- unixReviewTime: int
- reviewTime: str
-
-
-Text = str
-Category = float
-
-
-class Review:
- def __init__(self, text: Text, category: Category):
- self.text: Text = text
- self.category: Category = category
-
- @classmethod
- def from_mongoreview(cls, review: MongoReview):
- return cls(
- text=review["reviewText"],
- category=review["overall"],
- )
-
- def __repr__(self):
- return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
-
- def __getitem__(self, item):
- if item == 0 or item == "text":
- return self.text
- elif item == 1 or item == "category":
- return self.category
- else:
- raise KeyError(item)
-
- def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
- return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string), tensorflow.convert_to_tensor(self.category, dtype=tensorflow.string)
-
-
-@contextlib.contextmanager
-def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
- """
- Create a new MongoDB client and yield it.
- """
- log.debug("Opening connection to MongoDB...")
- client: pymongo.MongoClient = pymongo.MongoClient(
- host=MONGO_HOST.__wrapped__,
- port=MONGO_PORT.__wrapped__,
- )
- log.info("Opened connection to MongoDB!")
-
- yield client
-
- log.info("Closing connection to MongoDB...")
- client.close()
- log.debug("Closed connection to MongoDB!")
-
-
-@contextlib.contextmanager
-def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[MongoReview]:
- """
- Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
- """
- with mongo_client_from_config() as db:
- log.debug("Accessing the reviews collection...")
- collection = db.reviews.reviews
- log.debug("Collection accessed successfully: %s", collection)
- yield collection
-
-
-class DatasetFunc(t.Protocol):
- def __call__(self) -> t.Iterator[Review]:
- pass
-
-
def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
"""
Get ``amount`` random reviews from the ``reviews`` collection.
@@ -108,6 +21,7 @@ def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.
])
cursor = map(Review.from_mongoreview, cursor)
+
return cursor
@@ -123,7 +37,6 @@ def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating:
{"$sample": {"size": amount}},
])
- cursor = map(Review.from_mongoreview, cursor)
return cursor
@@ -145,6 +58,7 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
])
cursor = map(Review.from_mongoreview, cursor)
+
return cursor
@@ -191,16 +105,11 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
])
cursor = map(Review.from_mongoreview, cursor)
+
return cursor
__all__ = (
- "Text",
- "Category",
- "Review",
- "DatasetFunc",
- "mongo_client_from_config",
- "mongo_reviews_collection_from_config",
"sample_reviews",
"sample_reviews_by_rating",
"sample_reviews_polar",
diff --git a/unimore_bda_6/tokenizer/lower.py b/unimore_bda_6/tokenizer/lower.py
index d321f61..99d9d6e 100644
--- a/unimore_bda_6/tokenizer/lower.py
+++ b/unimore_bda_6/tokenizer/lower.py
@@ -8,4 +8,6 @@ class LowercaseTokenizer(BaseTokenizer):
return text.lower().split()
def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
- return tensorflow.strings.lower(text)
+ text = tensorflow.strings.lower(text)
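+        # Add a trailing "tokens" axis; presumably the shape the downstream
+        # TextVectorization layer expects from this callable.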
+ text = tensorflow.expand_dims(text, -1, name="tokens")
+ return text