mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-25 01:04:19 +00:00

Made good progress

How does text vectorization in tensorflow work?
This commit is contained in:
Steffo 2023-02-05 17:40:22 +01:00
parent dcfc4fbc3b
commit 3abba24ca2
Signed by: steffo
GPG key ID: 2A24051445686895
13 changed files with 286 additions and 158 deletions
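The commit message asks how text vectorization works in Tensorflow. In short: `tensorflow.keras.layers.TextVectorization` first learns a vocabulary from raw strings during `adapt()`, and afterwards maps each input string to a sequence of integer token ids (padded per batch by default). A minimal generic sketch of that behaviour, not taken from this commit:

```python
import tensorflow as tf

# Tiny corpus of raw strings (made-up data).
texts = tf.data.Dataset.from_tensor_slices([
    "the product is great",
    "the product is terrible",
])

# The layer builds its vocabulary during adapt()...
vectorize = tf.keras.layers.TextVectorization(max_tokens=100)
vectorize.adapt(texts.batch(2))

# ...and afterwards maps a batch of strings to integer token ids.
print(vectorize(tf.constant(["the product is great"])))
```

In the diff below, the same layer is adapted on a `tf.data.Dataset` built from the MongoDB reviews, with the project's tokenizer plugged in as the `standardize` step.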

.gitignore vendored
View file

@@ -10,6 +10,8 @@
 data/raw/
 data/db/
 data/nltk/
+data/training/
+data/evaluation/
 
 ##################
 # Python ignores #

View file

@@ -4,6 +4,7 @@
     <option name="INTERPRETER_OPTIONS" value="" />
     <option name="PARENT_ENVS" value="true" />
     <envs>
+      <env name="CONFIRM_OVERWRITE" value="False" />
       <env name="DATA_SET_SIZE" value="750" />
       <env name="NLTK_DATA" value="./data/nltk" />
       <env name="PYTHONUNBUFFERED" value="1" />

View file

@@ -8,6 +8,8 @@
       <excludeFolder url="file://$MODULE_DIR$/data/raw" />
       <excludeFolder url="file://$MODULE_DIR$/data/nltk" />
       <excludeFolder url="file://$MODULE_DIR$/.venv" />
+      <excludeFolder url="file://$MODULE_DIR$/data/evaluation" />
+      <excludeFolder url="file://$MODULE_DIR$/data/training" />
     </content>
     <orderEntry type="jdk" jdkName="Poetry (unimore-bda-6)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />

View file

@@ -2,7 +2,7 @@ import logging
 import tensorflow
 
 from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, sample_reviews_polar, sample_reviews_varied
+from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
 from .analysis.tf_text import TensorflowSentimentAnalyzer
 from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
@@ -18,43 +18,53 @@ def main():
         log.debug("Tensorflow successfully found GPU acceleration!")
 
     for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
-        # Tensorflow-based
-        for Tokenizer in [
-            LowercaseTokenizer
-        ]:
-            tokenizer = Tokenizer()
-            model = TensorflowSentimentAnalyzer()
-
-            with mongo_reviews_collection_from_config() as collection:
-                ...
-
-        # NLTK-based
-        for Tokenizer in [
-            NLTKWordTokenizer,
-            PottsTokenizer,
-            PottsTokenizerWithNegation,
-            LowercaseTokenizer,
-        ]:
-            tokenizer = Tokenizer()
-            model = NLTKSentimentAnalyzer(tokenizer=tokenizer)
-
-            with mongo_reviews_collection_from_config() as collection:
-                reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
-                reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
-
-            log.info("Training model %s", model)
-            model.train(reviews_training)
-            log.info("Evaluating model %s", model)
-            correct, evaluated = model.evaluate(reviews_evaluation)
-            log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
-
-            # try:
-            #     print("Manual testing for %s" % model)
-            #     print("Input an empty string to continue to the next model.")
-            #     while inp := input():
-            #         print(model.use(inp))
-            # except KeyboardInterrupt:
-            #     pass
+        for SentimentAnalyzer in [TensorflowSentimentAnalyzer, NLTKSentimentAnalyzer]:
+            for Tokenizer in [
+                # NLTKWordTokenizer,
+                # PottsTokenizer,
+                # PottsTokenizerWithNegation,
+                LowercaseTokenizer,
+            ]:
+                tokenizer = Tokenizer()
+                model = SentimentAnalyzer(tokenizer=tokenizer)
+
+                with mongo_client_from_config() as db:
+                    log.debug("Finding the reviews MongoDB collection...")
+                    collection = reviews_collection(db)
+
+                    try:
+                        training_cache = load_cache("./data/training")
+                        evaluation_cache = load_cache("./data/evaluation")
+                    except FileNotFoundError:
+                        log.debug("Gathering datasets...")
+                        reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+                        reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+
+                        log.debug("Caching datasets...")
+                        store_cache(reviews_training, "./data/training")
+                        store_cache(reviews_evaluation, "./data/evaluation")
+                        del reviews_training
+                        del reviews_evaluation
+
+                        training_cache = load_cache("./data/training")
+                        evaluation_cache = load_cache("./data/evaluation")
+                        log.debug("Caches stored and loaded successfully!")
+                    else:
+                        log.debug("Caches loaded successfully!")
+
+                log.info("Training model: %s", model)
+                model.train(training_cache)
+                log.info("Evaluating model: %s", model)
+                evaluation_results = model.evaluate(evaluation_cache)
+                log.info("%s", evaluation_results)
+
+                # try:
+                #     print("Manual testing for %s" % model)
+                #     print("Input an empty string to continue to the next model.")
+                #     while inp := input():
+                #         print(model.use(inp))
+                # except KeyboardInterrupt:
+                #     pass
 
 
 if __name__ == "__main__":

View file

@@ -6,7 +6,7 @@ import logging
 import typing as t
 import itertools
 
-from ..database import Text, Category, Review
+from ..database import Text, Category, Review, DatasetFunc
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 from ..log import count_passage
 from ..tokenizer import BaseTokenizer
@@ -31,7 +31,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
     def __repr__(self):
         return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
 
-    def __tokenize_datatuple(self, datatuple: Review) -> tuple[TokenBag, Category]:
+    def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
         """
         Convert the `Text` of a `DataTuple` to a `TokenBag`.
         """
@@ -67,13 +67,16 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
         count_passage(log, "extract_features", 100)
         return self.model.extract_features(data[0]), data[1]
 
-    def train(self, dataset: t.Iterator[Review]) -> None:
+    def train(self, dataset_func: DatasetFunc) -> None:
         # Forbid retraining the model
         if self.trained:
             raise AlreadyTrainedError()
 
+        # Get a generator
+        dataset: t.Generator[Review] = dataset_func()
+
         # Tokenize the dataset
-        dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_datatuple, dataset)
+        dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, dataset)
 
         # Cleanly duplicate the dataset iterator
         # Reduce average memory footprint, but not maximum

View file

@@ -2,48 +2,52 @@ import tensorflow
 import itertools
 import typing as t
 
-from ..database import Text, Category, Review
+from ..database import Text, Category, Review, DatasetFunc
 from ..tokenizer import BaseTokenizer
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 
 
 class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
-    def __init__(self):
+    def __init__(self, *, tokenizer: BaseTokenizer):
         super().__init__()
         self.trained = False
-        self.text_vectorization_layer = None
         self.neural_network: tensorflow.keras.Sequential | None = None
+        self.tokenizer: BaseTokenizer = tokenizer  # TODO
 
-    @classmethod
-    def __bda_dataset_to_tf_dataset(cls, dataset_func: t.Callable[[], t.Iterator[Review]]) -> tensorflow.data.Dataset:
-        """
-        Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
-        """
-        return tensorflow.data.Dataset.from_generator(
-            dataset_func,
-            output_signature=(
-                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
-                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
-            )
-        )
-
     MAX_FEATURES = 20000
     EMBEDDING_DIM = 16
     EPOCHS = 10
 
-    def train(self, training_set: t.Iterator[Review]) -> None:
+    def train(self, dataset_func: DatasetFunc) -> None:
         if self.trained:
             raise AlreadyTrainedError()
 
-        training_set = self.__bda_dataset_to_tf_dataset(training_set)
-        self.text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
+        def dataset_func_with_tensor_text():
+            for review in dataset_func():
+                yield review.to_tensor_text()
+
+        text_set = tensorflow.data.Dataset.from_generator(
+            dataset_func_with_tensor_text,
+            output_signature=tensorflow.TensorSpec(shape=(), dtype=tensorflow.string)
+        )
+
+        text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
             max_tokens=self.MAX_FEATURES,
             standardize=self.tokenizer.tokenize_tensorflow,
         )
-        self.text_vectorization_layer.adapt(map(lambda t: t[0], training_set))
-        training_set = training_set.map(self.text_vectorization_layer)
+        text_vectorization_layer.adapt(text_set)
+
+        def dataset_func_with_tensor_tuple():
+            for review in dataset_func():
+                yield review.to_tensor_tuple()
+
+        training_set = tensorflow.data.Dataset.from_generator(
+            dataset_func_with_tensor_tuple,
+            output_signature=(
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.float32, name="category"),
+            )
+        )
 
         # I have no idea of what I'm doing here
         self.neural_network = tensorflow.keras.Sequential([
@@ -59,6 +63,8 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
             metrics=tensorflow.metrics.BinaryAccuracy(threshold=0.0)
         )
 
+        training_set = training_set.map(text_vectorization_layer)
+
         self.neural_network.fit(
             training_set,
             epochs=self.EPOCHS,
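For reference, the pattern the new `train()` follows can be reproduced standalone: build a `tf.data.Dataset` from a Python generator with `from_generator` and an explicit `output_signature`, `adapt()` the `TextVectorization` layer on the text alone, then map the layer over the `(text, category)` pairs. A self-contained sketch with made-up data (not the project's code):

```python
import tensorflow as tf

# A Python generator standing in for the MongoDB-backed dataset function.
def review_generator():
    yield "great product, works perfectly", 1.0
    yield "awful product, broke instantly", 0.0

dataset = tf.data.Dataset.from_generator(
    review_generator,
    output_signature=(
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(), dtype=tf.float32),
    ),
)

vectorize = tf.keras.layers.TextVectorization(max_tokens=1000)
# adapt() only needs the text column; batching gives it (batch,) string tensors.
vectorize.adapt(dataset.map(lambda text, label: text).batch(2))

# Vectorize the text column; labels pass through unchanged.
vectorized = dataset.batch(2).map(lambda text, label: (vectorize(text), label))
for features, labels in vectorized.take(1):
    print(features.shape, labels.shape)
```

Adapting on a text-only dataset and mapping the layer afterwards mirrors the `text_set` / `training_set` split in the diff above.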

View file

@@ -0,0 +1,5 @@
+from .cache import *
+from .collections import *
+from .connection import *
+from .datatypes import *
+from .queries import *

View file

@@ -0,0 +1,66 @@
+import typing as t
+import logging
+import shutil
+import pathlib
+import pickle
+
+from .datatypes import Review
+
+log = logging.getLogger(__name__)
+
+
+DatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]]
+
+
+def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
+    """
+    Store the contents of the given `Review` iterator to different files in a directory at the given path.
+    """
+    path = pathlib.Path(path)
+
+    if path.exists():
+        raise FileExistsError("Specified cache path already exists.")
+
+    # Create the cache directory
+    log.debug("Creating cache directory: %s", path)
+    path.mkdir(parents=True)
+
+    # Write the documents to path/{index}.pickle
+    for index, document in enumerate(reviews):
+        document_path = path.joinpath(f"{index}.pickle")
+
+        log.debug("Storing pickle file: %s", document_path)
+        with open(document_path, "wb") as file:
+            pickle.dump(document, file)
+
+
+def load_cache(path: str | pathlib.Path) -> DatasetFunc:
+    """
+    Load the contents of a cache directory, returning a function that iterates over them.
+    """
+    path = pathlib.Path(path)
+
+    if not path.exists():
+        log.error("Specified cache directory does not exist: %s", path)
+        raise FileNotFoundError("The specified path does not exist.")
+
+    def data_cache_loader():
+        document_paths = path.iterdir()
+        for document_path in document_paths:
+            document_path = pathlib.Path(document_path)
+            if not str(document_path).endswith(".pickle"):
+                log.debug("Ignoring non-pickle file: %s", document_path)
+                continue
+
+            log.debug("Loading pickle file: %s", document_path)
+            with open(document_path, "rb") as file:
+                result: Review = pickle.load(file)
+                yield result
+
+    return data_cache_loader
+
+
+__all__ = (
+    "DatasetFunc",
+    "store_cache",
+    "load_cache",
+)
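Assuming the new modules are re-exported by the `database` package as the `__init__.py` above suggests, the cache helpers would be used roughly like this (hypothetical paths and reviews, not code from the commit):

```python
from unimore_bda_6.database import Review, store_cache, load_cache

# Hypothetical example data.
reviews = [
    Review(text="Works perfectly, would buy again.", category=5.0),
    Review(text="Broke after two days.", category=1.0),
]

# store_cache() pickles each review to <path>/<index>.pickle,
# and refuses to overwrite an existing directory.
store_cache(iter(reviews), "./data/example-cache")

# load_cache() returns a DatasetFunc: a zero-argument callable that
# yields the cached reviews again each time it is called.
dataset_func = load_cache("./data/example-cache")
for review in dataset_func():
    print(review)
```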

View file

@@ -0,0 +1,41 @@
+import contextlib
+import pymongo.collection
+import typing as t
+import bson
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class MongoReview(t.TypedDict):
+    """
+    A review as it is stored on MongoDB.
+
+    .. warning:: Do not instantiate: this is only for type hints!
+    """
+    _id: bson.ObjectId
+    reviewerID: str
+    asin: str
+    reviewerName: str
+    helpful: tuple[int, int]
+    reviewText: str
+    overall: float
+    summary: str
+    unixReviewTime: int
+    reviewTime: str
+
+
+def reviews_collection(db: pymongo.MongoClient) -> pymongo.collection.Collection[MongoReview]:
+    """
+    Access the ``reviews`` collection in the ``reviews`` database of the given MongoDB client, and return it.
+    """
+    log.debug("Accessing the reviews collection...")
+    collection = db.reviews.reviews
+    log.debug("Collection accessed successfully: %s", collection)
+    return collection
+
+
+__all__ = (
+    "MongoReview",
+    "reviews_collection",
+)

View file

@@ -0,0 +1,32 @@
+import pymongo
+import contextlib
+import typing as t
+import logging
+
+from ..config import MONGO_HOST, MONGO_PORT
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
+    """
+    Create a new MongoDB client and yield it.
+    """
+    log.debug("Opening connection to MongoDB...")
+    client: pymongo.MongoClient = pymongo.MongoClient(
+        host=MONGO_HOST.__wrapped__,
+        port=MONGO_PORT.__wrapped__,
+    )
+    log.info("Opened connection to MongoDB!")
+
+    yield client
+
+    log.info("Closing connection to MongoDB...")
+    client.close()
+    log.debug("Closed connection to MongoDB!")
+
+
+__all__ = (
+    "mongo_client_from_config",
+)

View file

@@ -0,0 +1,49 @@
+import tensorflow
+
+from .collections import MongoReview
+
+
+Text = str
+Category = float
+
+
+class Review:
+    def __init__(self, text: Text, category: Category):
+        self.text: str = text
+        self.category: float = category
+
+    @classmethod
+    def from_mongoreview(cls, review: MongoReview):
+        return cls(
+            text=review["reviewText"],
+            category=review["overall"],
+        )
+
+    def __repr__(self):
+        return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
+
+    def __getitem__(self, item):
+        if item == 0 or item == "text":
+            return self.text
+        elif item == 1 or item == "category":
+            return self.category
+        else:
+            raise KeyError(item)
+
+    def to_tensor_text(self) -> tensorflow.Tensor:
+        return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
+
+    def to_tensor_category(self) -> tensorflow.Tensor:
+        return tensorflow.convert_to_tensor(self.category / 5.0, dtype=tensorflow.float32)
+
+    def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
+        return (
+            self.to_tensor_text(),
+            self.to_tensor_category(),
+        )
+
+
+__all__ = (
+    "Text",
+    "Category",
+    "Review",
+)

View file

@@ -1,101 +1,14 @@
-import typing as t
-import pymongo
-import pymongo.collection
-import contextlib
-import bson
 import logging
-import tensorflow
+import pymongo
+import typing as t
 
-from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE
+from ..config import WORKING_SET_SIZE
+from .collections import MongoReview
+from .datatypes import Review
 
 log = logging.getLogger(__name__)
 
 
-class MongoReview(t.TypedDict):
-    """
-    A review as it is stored on MongoDB.
-
-    .. warning:: Do not instantiate: this is only for type hints!
-    """
-    _id: bson.ObjectId
-    reviewerID: str
-    asin: str
-    reviewerName: str
-    helpful: tuple[int, int]
-    reviewText: str
-    overall: float
-    summary: str
-    unixReviewTime: int
-    reviewTime: str
-
-
-Text = str
-Category = float
-
-
-class Review:
-    def __init__(self, text: Text, category: Category):
-        self.text: Text = text
-        self.category: Category = category
-
-    @classmethod
-    def from_mongoreview(cls, review: MongoReview):
-        return cls(
-            text=review["reviewText"],
-            category=review["overall"],
-        )
-
-    def __repr__(self):
-        return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
-
-    def __getitem__(self, item):
-        if item == 0 or item == "text":
-            return self.text
-        elif item == 1 or item == "category":
-            return self.category
-        else:
-            raise KeyError(item)
-
-    def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
-        return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string), tensorflow.convert_to_tensor(self.category, dtype=tensorflow.string)
-
-
-@contextlib.contextmanager
-def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
-    """
-    Create a new MongoDB client and yield it.
-    """
-    log.debug("Opening connection to MongoDB...")
-    client: pymongo.MongoClient = pymongo.MongoClient(
-        host=MONGO_HOST.__wrapped__,
-        port=MONGO_PORT.__wrapped__,
-    )
-    log.info("Opened connection to MongoDB!")
-
-    yield client
-
-    log.info("Closing connection to MongoDB...")
-    client.close()
-    log.debug("Closed connection to MongoDB!")
-
-
-@contextlib.contextmanager
-def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[MongoReview]:
-    """
-    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
-    """
-    with mongo_client_from_config() as db:
-        log.debug("Accessing the reviews collection...")
-        collection = db.reviews.reviews
-        log.debug("Collection accessed successfully: %s", collection)
-        yield collection
-
-
-class DatasetFunc(t.Protocol):
-    def __call__(self) -> t.Iterator[Review]:
-        pass
-
-
 def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
     """
     Get ``amount`` random reviews from the ``reviews`` collection.
@@ -108,6 +21,7 @@ def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.
     ])
     cursor = map(Review.from_mongoreview, cursor)
     return cursor
@@ -123,7 +37,6 @@ def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating:
         {"$sample": {"size": amount}},
     ])
-    cursor = map(Review.from_mongoreview, cursor)
     return cursor
@@ -145,6 +58,7 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
     ])
     cursor = map(Review.from_mongoreview, cursor)
     return cursor
@@ -191,16 +105,11 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
     ])
     cursor = map(Review.from_mongoreview, cursor)
     return cursor
 
 
 __all__ = (
-    "Text",
-    "Category",
-    "Review",
-    "DatasetFunc",
-    "mongo_client_from_config",
-    "mongo_reviews_collection_from_config",
     "sample_reviews",
     "sample_reviews_by_rating",
     "sample_reviews_polar",

View file

@@ -8,4 +8,6 @@ class LowercaseTokenizer(BaseTokenizer):
         return text.lower().split()
 
     def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
-        return tensorflow.strings.lower(text)
+        text = tensorflow.strings.lower(text)
+        text = tensorflow.expand_dims(text, -1, name="tokens")
+        return text