1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-21 23:44:19 +00:00

stuff's working

This commit is contained in:
Steffo 2023-02-08 10:54:14 +01:00
parent c31743f066
commit 4d6c8f0fee
Signed by: steffo
GPG key ID: 2A24051445686895
10 changed files with 230 additions and 87 deletions

View file

@ -5,9 +5,9 @@
<option name="PARENT_ENVS" value="true" /> <option name="PARENT_ENVS" value="true" />
<envs> <envs>
<env name="CONFIRM_OVERWRITE" value="False" /> <env name="CONFIRM_OVERWRITE" value="False" />
<env name="DATA_SET_SIZE" value="2500" />
<env name="NLTK_DATA" value="./data/nltk" /> <env name="NLTK_DATA" value="./data/nltk" />
<env name="PYTHONUNBUFFERED" value="1" /> <env name="PYTHONUNBUFFERED" value="1" />
<env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
<env name="WORKING_SET_SIZE" value="1000000" /> <env name="WORKING_SET_SIZE" value="1000000" />
<env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" /> <env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
</envs> </envs>

View file

@ -2,10 +2,11 @@ import logging
import tensorflow import tensorflow
from .config import config, DATA_SET_SIZE from .config import config, DATA_SET_SIZE
from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache, delete_cache
from .analysis.nltk_sentiment import NLTKSentimentAnalyzer from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
from .analysis.tf_text import TensorflowSentimentAnalyzer from .analysis.tf_text import TensorflowSentimentAnalyzer
from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer from .analysis.base import TrainingFailedError
from .tokenizer import LowercaseTokenizer
from .log import install_log_handler from .log import install_log_handler
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -17,6 +18,12 @@ def main():
else: else:
log.debug("Tensorflow successfully found GPU acceleration!") log.debug("Tensorflow successfully found GPU acceleration!")
try:
delete_cache("./data/training")
delete_cache("./data/evaluation")
except FileNotFoundError:
pass
for dataset_func in [sample_reviews_polar, sample_reviews_varied]: for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
for SentimentAnalyzer in [TensorflowSentimentAnalyzer, NLTKSentimentAnalyzer]: for SentimentAnalyzer in [TensorflowSentimentAnalyzer, NLTKSentimentAnalyzer]:
for Tokenizer in [ for Tokenizer in [
@ -25,46 +32,50 @@ def main():
# PottsTokenizerWithNegation, # PottsTokenizerWithNegation,
LowercaseTokenizer, LowercaseTokenizer,
]: ]:
tokenizer = Tokenizer() while True:
model = SentimentAnalyzer(tokenizer=tokenizer)
with mongo_client_from_config() as db:
log.debug("Finding the reviews MongoDB collection...")
collection = reviews_collection(db)
try: try:
training_cache = load_cache("./data/training") tokenizer = Tokenizer()
evaluation_cache = load_cache("./data/evaluation") model = SentimentAnalyzer(tokenizer=tokenizer)
except FileNotFoundError:
log.debug("Gathering datasets...")
reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
log.debug("Caching datasets...") with mongo_client_from_config() as db:
store_cache(reviews_training, "./data/training") log.debug("Finding the reviews MongoDB collection...")
store_cache(reviews_evaluation, "./data/evaluation") collection = reviews_collection(db)
del reviews_training
del reviews_evaluation
training_cache = load_cache("./data/training") try:
evaluation_cache = load_cache("./data/evaluation") training_cache = load_cache("./data/training")
log.debug("Caches stored and loaded successfully!") evaluation_cache = load_cache("./data/evaluation")
except FileNotFoundError:
log.debug("Gathering datasets...")
reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
log.debug("Caching datasets...")
store_cache(reviews_training, "./data/training")
store_cache(reviews_evaluation, "./data/evaluation")
del reviews_training
del reviews_evaluation
training_cache = load_cache("./data/training")
evaluation_cache = load_cache("./data/evaluation")
log.debug("Caches stored and loaded successfully!")
else:
log.debug("Caches loaded successfully!")
log.info("Training model: %s", model)
model.train(training_cache)
log.info("Evaluating model: %s", model)
evaluation_results = model.evaluate(evaluation_cache)
log.info("%s", evaluation_results)
except TrainingFailedError:
log.error("Training failed, restarting with a different dataset.")
continue
else: else:
log.debug("Caches loaded successfully!") log.info("Training")
break
log.info("Training model: %s", model) finally:
model.train(training_cache) delete_cache("./data/training")
log.info("Evaluating model: %s", model) delete_cache("./data/evaluation")
evaluation_results = model.evaluate(evaluation_cache)
log.info("%s", evaluation_results)
# try:
# print("Manual testing for %s" % model)
# print("Input an empty string to continue to the next model.")
# while inp := input():
# print(model.use(inp))
# except KeyboardInterrupt:
# pass
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -11,12 +11,13 @@ log = logging.getLogger(__name__)
class EvaluationResults: class EvaluationResults:
correct: int correct: int
evaluated: int evaluated: int
score: float
def __repr__(self): def __repr__(self):
return f"<EvaluationResults: {self.correct}/{self.evaluated}, {self.correct / self.evaluated * 100:.2f}>" return f"<EvaluationResults: score of {self.score} out of {self.evaluated} evaluated tuples>"
def __str__(self): def __str__(self):
return f"{self.correct} / {self.evaluated} - {self.correct / self.evaluated * 100:.2f} %" return f"{self.evaluated} evaluated, {self.correct} correct, {self.correct / self.evaluated * 100:.2} % accuracy, {self.score:.2} score, {self.score / self.evaluated * 100:.2} scoreaccuracy"
class BaseSentimentAnalyzer(metaclass=abc.ABCMeta): class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
@ -40,15 +41,18 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
evaluated: int = 0 evaluated: int = 0
correct: int = 0 correct: int = 0
score: float = 0.0
for review in dataset_func(): for review in dataset_func():
resulting_category = self.use(review.text) resulting_category = self.use(review.text)
evaluated += 1 evaluated += 1
correct += 1 if resulting_category == review.category else 0 correct += 1 if resulting_category == review.category else 0
score += 1 - (abs(resulting_category - review.category) / 4)
if not evaluated % 100: if not evaluated % 100:
log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100) temp_results = EvaluationResults(correct=correct, evaluated=evaluated, score=score)
log.debug(f"{temp_results!s}")
return EvaluationResults(correct=correct, evaluated=evaluated) return EvaluationResults(correct=correct, evaluated=evaluated, score=score)
@abc.abstractmethod @abc.abstractmethod
def use(self, text: Text) -> Category: def use(self, text: Text) -> Category:
@ -70,8 +74,15 @@ class NotTrainedError(Exception):
""" """
class TrainingFailedError(Exception):
    """
    The model wasn't able to complete the training and should not be used anymore.

    Raised by an analyzer's ``train`` when training stops before completing
    (for example, when fewer epochs than requested were computed); callers are
    expected to discard the model instance and build a fresh one before
    retrying.
    """
__all__ = ( __all__ = (
"BaseSentimentAnalyzer", "BaseSentimentAnalyzer",
"AlreadyTrainedError", "AlreadyTrainedError",
"NotTrainedError", "NotTrainedError",
"TrainingFailedError",
) )

View file

@ -1,82 +1,119 @@
import tensorflow import tensorflow
import logging
from ..database import Text, Category, DatasetFunc from ..database import Text, Category, DatasetFunc
from ..config import DATA_SET_SIZE from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError from ..tokenizer import BaseTokenizer
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError
log = logging.getLogger(__name__)
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer): class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
def __init__(self, *args, **kwargs): def __init__(self, tokenizer: BaseTokenizer):
super().__init__() super().__init__()
self.trained: bool = False self.trained: bool = False
self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer() self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer(tokenizer)
self.model: tensorflow.keras.Sequential = self._build_model() self.model: tensorflow.keras.Sequential = self._build_model()
self.history: tensorflow.keras.callbacks.History | None = None
def _build_dataset(self, dataset_func: DatasetFunc) -> tensorflow.data.Dataset: @staticmethod
def _build_dataset(dataset_func: DatasetFunc) -> tensorflow.data.Dataset:
def dataset_func_with_tensor_tuple(): def dataset_func_with_tensor_tuple():
for review in dataset_func(): for review in dataset_func():
yield review.to_tensor_tuple() yield review.to_tensor_tuple()
return tensorflow.data.Dataset.from_generator( log.debug("Creating dataset...")
dataset = tensorflow.data.Dataset.from_generator(
dataset_func_with_tensor_tuple, dataset_func_with_tensor_tuple,
output_signature=( output_signature=(
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"), tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
tensorflow.TensorSpec(shape=(5,), dtype=tensorflow.float32, name="category"), tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="category"),
) )
) )
def _build_model(self) -> tensorflow.keras.Sequential: log.debug("Caching dataset...")
return tensorflow.keras.Sequential([ dataset = dataset.cache()
log.debug("Configuring dataset prefetch...")
dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)
return dataset
@staticmethod
def _build_model() -> tensorflow.keras.Sequential:
log.debug("Creating %s model...", tensorflow.keras.Sequential)
model = tensorflow.keras.Sequential([
tensorflow.keras.layers.Embedding( tensorflow.keras.layers.Embedding(
input_dim=self.MAX_FEATURES + 1, input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
output_dim=self.EMBEDDING_DIM, output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
), ),
# tensorflow.keras.layers.Dropout(0.2), tensorflow.keras.layers.Dropout(0.2),
tensorflow.keras.layers.GlobalAveragePooling1D(), tensorflow.keras.layers.GlobalAveragePooling1D(),
# tensorflow.keras.layers.Dropout(0.2), tensorflow.keras.layers.Dropout(0.2),
tensorflow.keras.layers.Dense(5, activation="softmax"), tensorflow.keras.layers.Dense(5, activation="softmax"),
]) ])
log.debug("Compiling model: %s", model)
model.compile(
optimizer=tensorflow.keras.optimizers.Adam(global_clipnorm=1.0),
loss=tensorflow.keras.losses.CategoricalCrossentropy(),
metrics=[
tensorflow.keras.metrics.CategoricalAccuracy(),
]
)
log.debug("Compiled model: %s", model)
return model
def _build_vectorizer(self) -> tensorflow.keras.layers.TextVectorization: @staticmethod
return tensorflow.keras.layers.TextVectorization(max_tokens=self.MAX_FEATURES) def _build_vectorizer(tokenizer: BaseTokenizer) -> tensorflow.keras.layers.TextVectorization:
return tensorflow.keras.layers.TextVectorization(
def __vectorize_data(self, text, category): standardize=tokenizer.tokenize_tensorflow,
text = tensorflow.expand_dims(text, -1) # TODO: ?????? max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__
return self.text_vectorization_layer(text), category )
MAX_FEATURES = 2500
EMBEDDING_DIM = 24
"""
Count of possible "semantic meanings" of words, represented as dimensions of a tensor.
"""
EPOCHS = 3
def train(self, dataset_func: DatasetFunc) -> None: def train(self, dataset_func: DatasetFunc) -> None:
if self.trained: if self.trained:
log.error("Tried to train an already trained model.")
raise AlreadyTrainedError() raise AlreadyTrainedError()
log.debug("Building dataset...")
training_set = self._build_dataset(dataset_func) training_set = self._build_dataset(dataset_func)
log.debug("Built dataset: %s", training_set)
log.debug("Preparing training_set for %s...", self.text_vectorization_layer.adapt)
only_text_set = training_set.map(lambda text, category: text) only_text_set = training_set.map(lambda text, category: text)
log.debug("Adapting text_vectorization_layer: %s", self.text_vectorization_layer)
self.text_vectorization_layer.adapt(only_text_set) self.text_vectorization_layer.adapt(only_text_set)
training_set = training_set.map(self.__vectorize_data) log.debug("Adapted text_vectorization_layer: %s", self.text_vectorization_layer)
# self.model.compile(loss=tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer="adam", metrics=["accuracy"]) log.debug("Preparing training_set for %s...", self.model.fit)
self.model.compile(loss=tensorflow.keras.losses.MeanAbsoluteError(), optimizer="adam", metrics=["accuracy"]) training_set = training_set.map(lambda text, category: (self.text_vectorization_layer(text), category))
log.info("Training: %s", self.model)
self.history: tensorflow.keras.callbacks.History | None = self.model.fit(
training_set,
epochs=TENSORFLOW_EPOCHS.__wrapped__,
callbacks=[
tensorflow.keras.callbacks.TerminateOnNaN()
])
log.info("Trained: %s", self.model)
self.model.fit(training_set, epochs=self.EPOCHS) if len(self.history.epoch) < TENSORFLOW_EPOCHS.__wrapped__:
log.error("Model %s training failed: only %d epochs computed", self.model, len(self.history.epoch))
raise TrainingFailedError()
else:
log.info("Model %s training succeeded!", self.model)
self.trained = True self.trained = True
def use(self, text: Text) -> Category: def use(self, text: Text) -> Category:
if not self.trained: if not self.trained:
log.error("Tried to use a non-trained model.")
raise NotTrainedError() raise NotTrainedError()
vector = self.text_vectorization_layer(tensorflow.expand_dims(text, -1)) vector = self.text_vectorization_layer(text)
prediction = self.model.predict(vector) prediction = self.model.predict(vector, verbose=False)
max_i = None max_i = None
max_p = None max_p = None
@ -84,5 +121,6 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
if max_p is None or p > max_p: if max_p is None or p > max_p:
max_i = i max_i = i
max_p = p max_p = p
result = float(max_i) + 1.0
return float(max_i) + 1.0 return result

View file

@ -49,10 +49,55 @@ def DATA_SET_SIZE(val: str | None) -> int:
""" """
The number of reviews from each category to fetch for the datasets. The number of reviews from each category to fetch for the datasets.
Defaults to `1000`. Defaults to `1750`.
""" """
if val is None: if val is None:
return 1000 return 1750
try:
return int(val)
except ValueError:
raise cfig.InvalidValueError("Not an int.")
@config.optional()
def TENSORFLOW_MAX_FEATURES(val: str | None) -> int:
    """
    The maximum number of features to use in Tensorflow models.

    Defaults to `30000`.
    """
    # NOTE(review): the docstring above is kept verbatim because the config
    # framework may surface it to users as the variable's description — confirm
    # before rewording it.
    # `val` is presumably the raw string read from the environment, or `None`
    # when the variable is not set — verify against the `config` object.
    if val is None:
        return 30000
    try:
        return int(val)
    except ValueError as err:
        # Chain explicitly so the traceback shows the failed int() conversion
        # as the direct cause of the configuration error.
        raise cfig.InvalidValueError("Not an int.") from err
@config.optional()
def TENSORFLOW_EMBEDDING_SIZE(val: str | None) -> int:
    """
    The size of the embeddings tensor to use in Tensorflow models.

    Defaults to `12`.
    """
    # NOTE(review): the docstring above is kept verbatim because the config
    # framework may surface it to users as the variable's description — confirm
    # before rewording it.
    # `val` is presumably the raw string read from the environment, or `None`
    # when the variable is not set — verify against the `config` object.
    if val is None:
        return 12
    try:
        return int(val)
    except ValueError as err:
        # Chain explicitly so the traceback shows the failed int() conversion
        # as the direct cause of the configuration error.
        raise cfig.InvalidValueError("Not an int.") from err
@config.optional()
def TENSORFLOW_EPOCHS(val: str | None) -> int:
"""
The number of epochs to train Tensorflow models for.
Defaults to `15`.
"""
if val is None:
return 15
try: try:
return int(val) return int(val)
except ValueError: except ValueError:
@ -65,6 +110,9 @@ __all__ = (
"MONGO_PORT", "MONGO_PORT",
"WORKING_SET_SIZE", "WORKING_SET_SIZE",
"DATA_SET_SIZE", "DATA_SET_SIZE",
"TENSORFLOW_MAX_FEATURES",
"TENSORFLOW_EMBEDDING_SIZE",
"TENSORFLOW_EPOCHS",
) )

View file

@ -36,7 +36,7 @@ def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
def load_cache(path: str | pathlib.Path) -> DatasetFunc: def load_cache(path: str | pathlib.Path) -> DatasetFunc:
""" """
Load the contents of a directory Load the contents of a directory into a `Review` iterator.
""" """
path = pathlib.Path(path) path = pathlib.Path(path)
@ -47,8 +47,10 @@ def load_cache(path: str | pathlib.Path) -> DatasetFunc:
document_paths = path.iterdir() document_paths = path.iterdir()
for document_path in document_paths: for document_path in document_paths:
document_path = pathlib.Path(document_path) document_path = pathlib.Path(document_path)
if not str(document_path).endswith(".pickle"): if not str(document_path).endswith(".pickle"):
log.debug("Ignoring non-pickle file: %s", document_path) log.debug("Ignoring non-pickle file: %s", document_path)
continue
log.debug("Loading pickle file: %s", document_path) log.debug("Loading pickle file: %s", document_path)
with open(document_path, "rb") as file: with open(document_path, "rb") as file:
@ -58,8 +60,22 @@ def load_cache(path: str | pathlib.Path) -> DatasetFunc:
return data_cache_loader return data_cache_loader
def delete_cache(path: str | pathlib.Path) -> None:
"""
Delete the given cache directory.
"""
path = pathlib.Path(path)
if not path.exists():
raise FileNotFoundError("The specified path does not exist.")
log.warning("Deleting cache directory: %s", path)
shutil.rmtree(path)
__all__ = ( __all__ = (
"DatasetFunc", "DatasetFunc",
"store_cache", "store_cache",
"load_cache", "load_cache",
"delete_cache",
) )

View file

@ -1,4 +1,3 @@
import contextlib
import pymongo.collection import pymongo.collection
import typing as t import typing as t
import bson import bson
@ -30,8 +29,8 @@ def reviews_collection(db: pymongo.MongoClient) -> pymongo.collection.Collection
Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it. Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
""" """
log.debug("Accessing the reviews collection...") log.debug("Accessing the reviews collection...")
collection = db.reviews.reviews collection: pymongo.collection.Collection[MongoReview] = db.reviews.reviews
log.debug("Collection accessed successfully: %s", collection) log.debug("Collection accessed successfully: %s", collection.name)
return collection return collection

View file

@ -1,5 +1,8 @@
import tensorflow import tensorflow
from .collections import MongoReview from .collections import MongoReview
import logging
log = logging.getLogger(__name__)
Text = str Text = str
@ -33,19 +36,21 @@ class Review:
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string) return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
def to_tensor_category(self) -> tensorflow.Tensor: def to_tensor_category(self) -> tensorflow.Tensor:
return tensorflow.convert_to_tensor([ return tensorflow.convert_to_tensor([[
1.0 if self.category == 1.0 else 0.0, 1.0 if self.category == 1.0 else 0.0,
1.0 if self.category == 2.0 else 0.0, 1.0 if self.category == 2.0 else 0.0,
1.0 if self.category == 3.0 else 0.0, 1.0 if self.category == 3.0 else 0.0,
1.0 if self.category == 4.0 else 0.0, 1.0 if self.category == 4.0 else 0.0,
1.0 if self.category == 5.0 else 0.0, 1.0 if self.category == 5.0 else 0.0,
], dtype=tensorflow.float32) ]], dtype=tensorflow.float32)
def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]: def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
return ( t = (
self.to_tensor_text(), self.to_tensor_text(),
self.to_tensor_category(), self.to_tensor_category(),
) )
log.debug("Converted %s", t)
return t
__all__ = ( __all__ = (

View file

@ -54,6 +54,12 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
{"$match": {"overall": 5.0}}, {"$match": {"overall": 5.0}},
{"$sample": {"size": amount}}, {"$sample": {"size": amount}},
], ],
}},
{"$addFields": {
"sortKey": {"$rand": {}},
}},
{"$sort": {
"sortKey": 1,
}} }}
]) ])
@ -101,6 +107,12 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
], ],
}} }}
], ],
}},
{"$addFields": {
"sortKey": {"$rand": {}},
}},
{"$sort": {
"sortKey": 1,
}} }}
]) ])

View file

@ -15,7 +15,7 @@ def install_log_handler(loggers: list[logging.Logger] = None):
for logger in loggers: for logger in loggers:
coloredlogs.install( coloredlogs.install(
logger=logger, logger=logger,
level="INFO", level="DEBUG",
fmt="{asctime} | {name:<32} | {levelname:>8} | {message}", fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
style="{", style="{",
level_styles=dict( level_styles=dict(
@ -34,6 +34,9 @@ def install_log_handler(loggers: list[logging.Logger] = None):
) )
this_log.debug("Installed custom log handler on: %s", logger) this_log.debug("Installed custom log handler on: %s", logger)
logging.getLogger("unimore_bda_6.database.cache").setLevel("INFO")
logging.getLogger("unimore_bda_6.database.datatypes").setLevel("INFO")
_passage_counts = collections.defaultdict(lambda: 0) _passage_counts = collections.defaultdict(lambda: 0)