mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-22 07:54:19 +00:00
stuff's working
This commit is contained in:
parent
c31743f066
commit
4d6c8f0fee
10 changed files with 230 additions and 87 deletions
|
@ -5,9 +5,9 @@
|
||||||
<option name="PARENT_ENVS" value="true" />
|
<option name="PARENT_ENVS" value="true" />
|
||||||
<envs>
|
<envs>
|
||||||
<env name="CONFIRM_OVERWRITE" value="False" />
|
<env name="CONFIRM_OVERWRITE" value="False" />
|
||||||
<env name="DATA_SET_SIZE" value="2500" />
|
|
||||||
<env name="NLTK_DATA" value="./data/nltk" />
|
<env name="NLTK_DATA" value="./data/nltk" />
|
||||||
<env name="PYTHONUNBUFFERED" value="1" />
|
<env name="PYTHONUNBUFFERED" value="1" />
|
||||||
|
<env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
|
||||||
<env name="WORKING_SET_SIZE" value="1000000" />
|
<env name="WORKING_SET_SIZE" value="1000000" />
|
||||||
<env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
|
<env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
|
||||||
</envs>
|
</envs>
|
||||||
|
|
|
@ -2,10 +2,11 @@ import logging
|
||||||
import tensorflow
|
import tensorflow
|
||||||
|
|
||||||
from .config import config, DATA_SET_SIZE
|
from .config import config, DATA_SET_SIZE
|
||||||
from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache
|
from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache, delete_cache
|
||||||
from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
|
from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
|
||||||
from .analysis.tf_text import TensorflowSentimentAnalyzer
|
from .analysis.tf_text import TensorflowSentimentAnalyzer
|
||||||
from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
|
from .analysis.base import TrainingFailedError
|
||||||
|
from .tokenizer import LowercaseTokenizer
|
||||||
from .log import install_log_handler
|
from .log import install_log_handler
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
@ -17,6 +18,12 @@ def main():
|
||||||
else:
|
else:
|
||||||
log.debug("Tensorflow successfully found GPU acceleration!")
|
log.debug("Tensorflow successfully found GPU acceleration!")
|
||||||
|
|
||||||
|
try:
|
||||||
|
delete_cache("./data/training")
|
||||||
|
delete_cache("./data/evaluation")
|
||||||
|
except FileNotFoundError:
|
||||||
|
pass
|
||||||
|
|
||||||
for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
|
for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
|
||||||
for SentimentAnalyzer in [TensorflowSentimentAnalyzer, NLTKSentimentAnalyzer]:
|
for SentimentAnalyzer in [TensorflowSentimentAnalyzer, NLTKSentimentAnalyzer]:
|
||||||
for Tokenizer in [
|
for Tokenizer in [
|
||||||
|
@ -25,46 +32,50 @@ def main():
|
||||||
# PottsTokenizerWithNegation,
|
# PottsTokenizerWithNegation,
|
||||||
LowercaseTokenizer,
|
LowercaseTokenizer,
|
||||||
]:
|
]:
|
||||||
tokenizer = Tokenizer()
|
while True:
|
||||||
model = SentimentAnalyzer(tokenizer=tokenizer)
|
|
||||||
|
|
||||||
with mongo_client_from_config() as db:
|
|
||||||
log.debug("Finding the reviews MongoDB collection...")
|
|
||||||
collection = reviews_collection(db)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
training_cache = load_cache("./data/training")
|
tokenizer = Tokenizer()
|
||||||
evaluation_cache = load_cache("./data/evaluation")
|
model = SentimentAnalyzer(tokenizer=tokenizer)
|
||||||
except FileNotFoundError:
|
|
||||||
log.debug("Gathering datasets...")
|
|
||||||
reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
|
|
||||||
reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
|
|
||||||
|
|
||||||
log.debug("Caching datasets...")
|
with mongo_client_from_config() as db:
|
||||||
store_cache(reviews_training, "./data/training")
|
log.debug("Finding the reviews MongoDB collection...")
|
||||||
store_cache(reviews_evaluation, "./data/evaluation")
|
collection = reviews_collection(db)
|
||||||
del reviews_training
|
|
||||||
del reviews_evaluation
|
|
||||||
|
|
||||||
training_cache = load_cache("./data/training")
|
try:
|
||||||
evaluation_cache = load_cache("./data/evaluation")
|
training_cache = load_cache("./data/training")
|
||||||
log.debug("Caches stored and loaded successfully!")
|
evaluation_cache = load_cache("./data/evaluation")
|
||||||
|
except FileNotFoundError:
|
||||||
|
log.debug("Gathering datasets...")
|
||||||
|
reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
|
||||||
|
reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
|
||||||
|
|
||||||
|
log.debug("Caching datasets...")
|
||||||
|
store_cache(reviews_training, "./data/training")
|
||||||
|
store_cache(reviews_evaluation, "./data/evaluation")
|
||||||
|
del reviews_training
|
||||||
|
del reviews_evaluation
|
||||||
|
|
||||||
|
training_cache = load_cache("./data/training")
|
||||||
|
evaluation_cache = load_cache("./data/evaluation")
|
||||||
|
log.debug("Caches stored and loaded successfully!")
|
||||||
|
else:
|
||||||
|
log.debug("Caches loaded successfully!")
|
||||||
|
|
||||||
|
log.info("Training model: %s", model)
|
||||||
|
model.train(training_cache)
|
||||||
|
log.info("Evaluating model: %s", model)
|
||||||
|
evaluation_results = model.evaluate(evaluation_cache)
|
||||||
|
log.info("%s", evaluation_results)
|
||||||
|
|
||||||
|
except TrainingFailedError:
|
||||||
|
log.error("Training failed, restarting with a different dataset.")
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
log.debug("Caches loaded successfully!")
|
log.info("Training")
|
||||||
|
break
|
||||||
log.info("Training model: %s", model)
|
finally:
|
||||||
model.train(training_cache)
|
delete_cache("./data/training")
|
||||||
log.info("Evaluating model: %s", model)
|
delete_cache("./data/evaluation")
|
||||||
evaluation_results = model.evaluate(evaluation_cache)
|
|
||||||
log.info("%s", evaluation_results)
|
|
||||||
|
|
||||||
# try:
|
|
||||||
# print("Manual testing for %s" % model)
|
|
||||||
# print("Input an empty string to continue to the next model.")
|
|
||||||
# while inp := input():
|
|
||||||
# print(model.use(inp))
|
|
||||||
# except KeyboardInterrupt:
|
|
||||||
# pass
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -11,12 +11,13 @@ log = logging.getLogger(__name__)
|
||||||
class EvaluationResults:
|
class EvaluationResults:
|
||||||
correct: int
|
correct: int
|
||||||
evaluated: int
|
evaluated: int
|
||||||
|
score: float
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return f"<EvaluationResults: {self.correct}/{self.evaluated}, {self.correct / self.evaluated * 100:.2f}>"
|
return f"<EvaluationResults: score of {self.score} out of {self.evaluated} evaluated tuples>"
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f"{self.correct} / {self.evaluated} - {self.correct / self.evaluated * 100:.2f} %"
|
return f"{self.evaluated} evaluated, {self.correct} correct, {self.correct / self.evaluated * 100:.2} % accuracy, {self.score:.2} score, {self.score / self.evaluated * 100:.2} scoreaccuracy"
|
||||||
|
|
||||||
|
|
||||||
class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
|
class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
|
||||||
|
@ -40,15 +41,18 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
|
||||||
|
|
||||||
evaluated: int = 0
|
evaluated: int = 0
|
||||||
correct: int = 0
|
correct: int = 0
|
||||||
|
score: float = 0.0
|
||||||
|
|
||||||
for review in dataset_func():
|
for review in dataset_func():
|
||||||
resulting_category = self.use(review.text)
|
resulting_category = self.use(review.text)
|
||||||
evaluated += 1
|
evaluated += 1
|
||||||
correct += 1 if resulting_category == review.category else 0
|
correct += 1 if resulting_category == review.category else 0
|
||||||
|
score += 1 - (abs(resulting_category - review.category) / 4)
|
||||||
if not evaluated % 100:
|
if not evaluated % 100:
|
||||||
log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
|
temp_results = EvaluationResults(correct=correct, evaluated=evaluated, score=score)
|
||||||
|
log.debug(f"{temp_results!s}")
|
||||||
|
|
||||||
return EvaluationResults(correct=correct, evaluated=evaluated)
|
return EvaluationResults(correct=correct, evaluated=evaluated, score=score)
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def use(self, text: Text) -> Category:
|
def use(self, text: Text) -> Category:
|
||||||
|
@ -70,8 +74,15 @@ class NotTrainedError(Exception):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class TrainingFailedError(Exception):
|
||||||
|
"""
|
||||||
|
The model wasn't able to complete the training and should not be used anymore.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
"BaseSentimentAnalyzer",
|
"BaseSentimentAnalyzer",
|
||||||
"AlreadyTrainedError",
|
"AlreadyTrainedError",
|
||||||
"NotTrainedError",
|
"NotTrainedError",
|
||||||
|
"TrainingFailedError",
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,82 +1,119 @@
|
||||||
import tensorflow
|
import tensorflow
|
||||||
|
import logging
|
||||||
|
|
||||||
from ..database import Text, Category, DatasetFunc
|
from ..database import Text, Category, DatasetFunc
|
||||||
from ..config import DATA_SET_SIZE
|
from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS
|
||||||
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
|
from ..tokenizer import BaseTokenizer
|
||||||
|
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, tokenizer: BaseTokenizer):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.trained: bool = False
|
self.trained: bool = False
|
||||||
|
|
||||||
self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer()
|
self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer(tokenizer)
|
||||||
self.model: tensorflow.keras.Sequential = self._build_model()
|
self.model: tensorflow.keras.Sequential = self._build_model()
|
||||||
|
self.history: tensorflow.keras.callbacks.History | None = None
|
||||||
|
|
||||||
def _build_dataset(self, dataset_func: DatasetFunc) -> tensorflow.data.Dataset:
|
@staticmethod
|
||||||
|
def _build_dataset(dataset_func: DatasetFunc) -> tensorflow.data.Dataset:
|
||||||
def dataset_func_with_tensor_tuple():
|
def dataset_func_with_tensor_tuple():
|
||||||
for review in dataset_func():
|
for review in dataset_func():
|
||||||
yield review.to_tensor_tuple()
|
yield review.to_tensor_tuple()
|
||||||
|
|
||||||
return tensorflow.data.Dataset.from_generator(
|
log.debug("Creating dataset...")
|
||||||
|
dataset = tensorflow.data.Dataset.from_generator(
|
||||||
dataset_func_with_tensor_tuple,
|
dataset_func_with_tensor_tuple,
|
||||||
output_signature=(
|
output_signature=(
|
||||||
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
|
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
|
||||||
tensorflow.TensorSpec(shape=(5,), dtype=tensorflow.float32, name="category"),
|
tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="category"),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def _build_model(self) -> tensorflow.keras.Sequential:
|
log.debug("Caching dataset...")
|
||||||
return tensorflow.keras.Sequential([
|
dataset = dataset.cache()
|
||||||
|
|
||||||
|
log.debug("Configuring dataset prefetch...")
|
||||||
|
dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)
|
||||||
|
|
||||||
|
return dataset
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _build_model() -> tensorflow.keras.Sequential:
|
||||||
|
log.debug("Creating %s model...", tensorflow.keras.Sequential)
|
||||||
|
model = tensorflow.keras.Sequential([
|
||||||
tensorflow.keras.layers.Embedding(
|
tensorflow.keras.layers.Embedding(
|
||||||
input_dim=self.MAX_FEATURES + 1,
|
input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
|
||||||
output_dim=self.EMBEDDING_DIM,
|
output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
|
||||||
),
|
),
|
||||||
# tensorflow.keras.layers.Dropout(0.2),
|
tensorflow.keras.layers.Dropout(0.2),
|
||||||
tensorflow.keras.layers.GlobalAveragePooling1D(),
|
tensorflow.keras.layers.GlobalAveragePooling1D(),
|
||||||
# tensorflow.keras.layers.Dropout(0.2),
|
tensorflow.keras.layers.Dropout(0.2),
|
||||||
tensorflow.keras.layers.Dense(5, activation="softmax"),
|
tensorflow.keras.layers.Dense(5, activation="softmax"),
|
||||||
])
|
])
|
||||||
|
log.debug("Compiling model: %s", model)
|
||||||
|
model.compile(
|
||||||
|
optimizer=tensorflow.keras.optimizers.Adam(global_clipnorm=1.0),
|
||||||
|
loss=tensorflow.keras.losses.CategoricalCrossentropy(),
|
||||||
|
metrics=[
|
||||||
|
tensorflow.keras.metrics.CategoricalAccuracy(),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
log.debug("Compiled model: %s", model)
|
||||||
|
return model
|
||||||
|
|
||||||
def _build_vectorizer(self) -> tensorflow.keras.layers.TextVectorization:
|
@staticmethod
|
||||||
return tensorflow.keras.layers.TextVectorization(max_tokens=self.MAX_FEATURES)
|
def _build_vectorizer(tokenizer: BaseTokenizer) -> tensorflow.keras.layers.TextVectorization:
|
||||||
|
return tensorflow.keras.layers.TextVectorization(
|
||||||
def __vectorize_data(self, text, category):
|
standardize=tokenizer.tokenize_tensorflow,
|
||||||
text = tensorflow.expand_dims(text, -1) # TODO: ??????
|
max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__
|
||||||
return self.text_vectorization_layer(text), category
|
)
|
||||||
|
|
||||||
MAX_FEATURES = 2500
|
|
||||||
EMBEDDING_DIM = 24
|
|
||||||
"""
|
|
||||||
Count of possible "semantic meanings" of words, represented as dimensions of a tensor.
|
|
||||||
"""
|
|
||||||
|
|
||||||
EPOCHS = 3
|
|
||||||
|
|
||||||
def train(self, dataset_func: DatasetFunc) -> None:
|
def train(self, dataset_func: DatasetFunc) -> None:
|
||||||
if self.trained:
|
if self.trained:
|
||||||
|
log.error("Tried to train an already trained model.")
|
||||||
raise AlreadyTrainedError()
|
raise AlreadyTrainedError()
|
||||||
|
|
||||||
|
log.debug("Building dataset...")
|
||||||
training_set = self._build_dataset(dataset_func)
|
training_set = self._build_dataset(dataset_func)
|
||||||
|
log.debug("Built dataset: %s", training_set)
|
||||||
|
|
||||||
|
log.debug("Preparing training_set for %s...", self.text_vectorization_layer.adapt)
|
||||||
only_text_set = training_set.map(lambda text, category: text)
|
only_text_set = training_set.map(lambda text, category: text)
|
||||||
|
log.debug("Adapting text_vectorization_layer: %s", self.text_vectorization_layer)
|
||||||
self.text_vectorization_layer.adapt(only_text_set)
|
self.text_vectorization_layer.adapt(only_text_set)
|
||||||
training_set = training_set.map(self.__vectorize_data)
|
log.debug("Adapted text_vectorization_layer: %s", self.text_vectorization_layer)
|
||||||
|
|
||||||
# self.model.compile(loss=tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer="adam", metrics=["accuracy"])
|
log.debug("Preparing training_set for %s...", self.model.fit)
|
||||||
self.model.compile(loss=tensorflow.keras.losses.MeanAbsoluteError(), optimizer="adam", metrics=["accuracy"])
|
training_set = training_set.map(lambda text, category: (self.text_vectorization_layer(text), category))
|
||||||
|
log.info("Training: %s", self.model)
|
||||||
|
self.history: tensorflow.keras.callbacks.History | None = self.model.fit(
|
||||||
|
training_set,
|
||||||
|
epochs=TENSORFLOW_EPOCHS.__wrapped__,
|
||||||
|
callbacks=[
|
||||||
|
tensorflow.keras.callbacks.TerminateOnNaN()
|
||||||
|
])
|
||||||
|
log.info("Trained: %s", self.model)
|
||||||
|
|
||||||
self.model.fit(training_set, epochs=self.EPOCHS)
|
if len(self.history.epoch) < TENSORFLOW_EPOCHS.__wrapped__:
|
||||||
|
log.error("Model %s training failed: only %d epochs computed", self.model, len(self.history.epoch))
|
||||||
|
raise TrainingFailedError()
|
||||||
|
else:
|
||||||
|
log.info("Model %s training succeeded!", self.model)
|
||||||
|
|
||||||
self.trained = True
|
self.trained = True
|
||||||
|
|
||||||
def use(self, text: Text) -> Category:
|
def use(self, text: Text) -> Category:
|
||||||
if not self.trained:
|
if not self.trained:
|
||||||
|
log.error("Tried to use a non-trained model.")
|
||||||
raise NotTrainedError()
|
raise NotTrainedError()
|
||||||
|
|
||||||
vector = self.text_vectorization_layer(tensorflow.expand_dims(text, -1))
|
vector = self.text_vectorization_layer(text)
|
||||||
|
|
||||||
prediction = self.model.predict(vector)
|
prediction = self.model.predict(vector, verbose=False)
|
||||||
|
|
||||||
max_i = None
|
max_i = None
|
||||||
max_p = None
|
max_p = None
|
||||||
|
@ -84,5 +121,6 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
if max_p is None or p > max_p:
|
if max_p is None or p > max_p:
|
||||||
max_i = i
|
max_i = i
|
||||||
max_p = p
|
max_p = p
|
||||||
|
result = float(max_i) + 1.0
|
||||||
|
|
||||||
return float(max_i) + 1.0
|
return result
|
||||||
|
|
|
@ -49,10 +49,55 @@ def DATA_SET_SIZE(val: str | None) -> int:
|
||||||
"""
|
"""
|
||||||
The number of reviews from each category to fetch for the datasets.
|
The number of reviews from each category to fetch for the datasets.
|
||||||
|
|
||||||
Defaults to `1000`.
|
Defaults to `1750`.
|
||||||
"""
|
"""
|
||||||
if val is None:
|
if val is None:
|
||||||
return 1000
|
return 1750
|
||||||
|
try:
|
||||||
|
return int(val)
|
||||||
|
except ValueError:
|
||||||
|
raise cfig.InvalidValueError("Not an int.")
|
||||||
|
|
||||||
|
|
||||||
|
@config.optional()
|
||||||
|
def TENSORFLOW_MAX_FEATURES(val: str | None) -> int:
|
||||||
|
"""
|
||||||
|
The maximum number of features to use in Tensorflow models.
|
||||||
|
|
||||||
|
Defaults to `30000`.
|
||||||
|
"""
|
||||||
|
if val is None:
|
||||||
|
return 30000
|
||||||
|
try:
|
||||||
|
return int(val)
|
||||||
|
except ValueError:
|
||||||
|
raise cfig.InvalidValueError("Not an int.")
|
||||||
|
|
||||||
|
|
||||||
|
@config.optional()
|
||||||
|
def TENSORFLOW_EMBEDDING_SIZE(val: str | None) -> int:
|
||||||
|
"""
|
||||||
|
The size of the embeddings tensor to use in Tensorflow models.
|
||||||
|
|
||||||
|
Defaults to `12`.
|
||||||
|
"""
|
||||||
|
if val is None:
|
||||||
|
return 12
|
||||||
|
try:
|
||||||
|
return int(val)
|
||||||
|
except ValueError:
|
||||||
|
raise cfig.InvalidValueError("Not an int.")
|
||||||
|
|
||||||
|
|
||||||
|
@config.optional()
|
||||||
|
def TENSORFLOW_EPOCHS(val: str | None) -> int:
|
||||||
|
"""
|
||||||
|
The number of epochs to train Tensorflow models for.
|
||||||
|
|
||||||
|
Defaults to `15`.
|
||||||
|
"""
|
||||||
|
if val is None:
|
||||||
|
return 15
|
||||||
try:
|
try:
|
||||||
return int(val)
|
return int(val)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
@ -65,6 +110,9 @@ __all__ = (
|
||||||
"MONGO_PORT",
|
"MONGO_PORT",
|
||||||
"WORKING_SET_SIZE",
|
"WORKING_SET_SIZE",
|
||||||
"DATA_SET_SIZE",
|
"DATA_SET_SIZE",
|
||||||
|
"TENSORFLOW_MAX_FEATURES",
|
||||||
|
"TENSORFLOW_EMBEDDING_SIZE",
|
||||||
|
"TENSORFLOW_EPOCHS",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
|
||||||
|
|
||||||
def load_cache(path: str | pathlib.Path) -> DatasetFunc:
|
def load_cache(path: str | pathlib.Path) -> DatasetFunc:
|
||||||
"""
|
"""
|
||||||
Load the contents of a directory
|
Load the contents of a directory into a `Review` iterator.
|
||||||
"""
|
"""
|
||||||
path = pathlib.Path(path)
|
path = pathlib.Path(path)
|
||||||
|
|
||||||
|
@ -47,8 +47,10 @@ def load_cache(path: str | pathlib.Path) -> DatasetFunc:
|
||||||
document_paths = path.iterdir()
|
document_paths = path.iterdir()
|
||||||
for document_path in document_paths:
|
for document_path in document_paths:
|
||||||
document_path = pathlib.Path(document_path)
|
document_path = pathlib.Path(document_path)
|
||||||
|
|
||||||
if not str(document_path).endswith(".pickle"):
|
if not str(document_path).endswith(".pickle"):
|
||||||
log.debug("Ignoring non-pickle file: %s", document_path)
|
log.debug("Ignoring non-pickle file: %s", document_path)
|
||||||
|
continue
|
||||||
|
|
||||||
log.debug("Loading pickle file: %s", document_path)
|
log.debug("Loading pickle file: %s", document_path)
|
||||||
with open(document_path, "rb") as file:
|
with open(document_path, "rb") as file:
|
||||||
|
@ -58,8 +60,22 @@ def load_cache(path: str | pathlib.Path) -> DatasetFunc:
|
||||||
return data_cache_loader
|
return data_cache_loader
|
||||||
|
|
||||||
|
|
||||||
|
def delete_cache(path: str | pathlib.Path) -> None:
|
||||||
|
"""
|
||||||
|
Delete the given cache directory.
|
||||||
|
"""
|
||||||
|
path = pathlib.Path(path)
|
||||||
|
|
||||||
|
if not path.exists():
|
||||||
|
raise FileNotFoundError("The specified path does not exist.")
|
||||||
|
|
||||||
|
log.warning("Deleting cache directory: %s", path)
|
||||||
|
shutil.rmtree(path)
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
"DatasetFunc",
|
"DatasetFunc",
|
||||||
"store_cache",
|
"store_cache",
|
||||||
"load_cache",
|
"load_cache",
|
||||||
|
"delete_cache",
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import contextlib
|
|
||||||
import pymongo.collection
|
import pymongo.collection
|
||||||
import typing as t
|
import typing as t
|
||||||
import bson
|
import bson
|
||||||
|
@ -30,8 +29,8 @@ def reviews_collection(db: pymongo.MongoClient) -> pymongo.collection.Collection
|
||||||
Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
|
Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
|
||||||
"""
|
"""
|
||||||
log.debug("Accessing the reviews collection...")
|
log.debug("Accessing the reviews collection...")
|
||||||
collection = db.reviews.reviews
|
collection: pymongo.collection.Collection[MongoReview] = db.reviews.reviews
|
||||||
log.debug("Collection accessed successfully: %s", collection)
|
log.debug("Collection accessed successfully: %s", collection.name)
|
||||||
return collection
|
return collection
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
import tensorflow
|
import tensorflow
|
||||||
from .collections import MongoReview
|
from .collections import MongoReview
|
||||||
|
import logging
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
Text = str
|
Text = str
|
||||||
|
@ -33,19 +36,21 @@ class Review:
|
||||||
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
|
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
|
||||||
|
|
||||||
def to_tensor_category(self) -> tensorflow.Tensor:
|
def to_tensor_category(self) -> tensorflow.Tensor:
|
||||||
return tensorflow.convert_to_tensor([
|
return tensorflow.convert_to_tensor([[
|
||||||
1.0 if self.category == 1.0 else 0.0,
|
1.0 if self.category == 1.0 else 0.0,
|
||||||
1.0 if self.category == 2.0 else 0.0,
|
1.0 if self.category == 2.0 else 0.0,
|
||||||
1.0 if self.category == 3.0 else 0.0,
|
1.0 if self.category == 3.0 else 0.0,
|
||||||
1.0 if self.category == 4.0 else 0.0,
|
1.0 if self.category == 4.0 else 0.0,
|
||||||
1.0 if self.category == 5.0 else 0.0,
|
1.0 if self.category == 5.0 else 0.0,
|
||||||
], dtype=tensorflow.float32)
|
]], dtype=tensorflow.float32)
|
||||||
|
|
||||||
def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
|
def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
|
||||||
return (
|
t = (
|
||||||
self.to_tensor_text(),
|
self.to_tensor_text(),
|
||||||
self.to_tensor_category(),
|
self.to_tensor_category(),
|
||||||
)
|
)
|
||||||
|
log.debug("Converted %s", t)
|
||||||
|
return t
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
|
|
|
@ -54,6 +54,12 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
|
||||||
{"$match": {"overall": 5.0}},
|
{"$match": {"overall": 5.0}},
|
||||||
{"$sample": {"size": amount}},
|
{"$sample": {"size": amount}},
|
||||||
],
|
],
|
||||||
|
}},
|
||||||
|
{"$addFields": {
|
||||||
|
"sortKey": {"$rand": {}},
|
||||||
|
}},
|
||||||
|
{"$sort": {
|
||||||
|
"sortKey": 1,
|
||||||
}}
|
}}
|
||||||
])
|
])
|
||||||
|
|
||||||
|
@ -101,6 +107,12 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
|
||||||
],
|
],
|
||||||
}}
|
}}
|
||||||
],
|
],
|
||||||
|
}},
|
||||||
|
{"$addFields": {
|
||||||
|
"sortKey": {"$rand": {}},
|
||||||
|
}},
|
||||||
|
{"$sort": {
|
||||||
|
"sortKey": 1,
|
||||||
}}
|
}}
|
||||||
])
|
])
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ def install_log_handler(loggers: list[logging.Logger] = None):
|
||||||
for logger in loggers:
|
for logger in loggers:
|
||||||
coloredlogs.install(
|
coloredlogs.install(
|
||||||
logger=logger,
|
logger=logger,
|
||||||
level="INFO",
|
level="DEBUG",
|
||||||
fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
|
fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
|
||||||
style="{",
|
style="{",
|
||||||
level_styles=dict(
|
level_styles=dict(
|
||||||
|
@ -34,6 +34,9 @@ def install_log_handler(loggers: list[logging.Logger] = None):
|
||||||
)
|
)
|
||||||
this_log.debug("Installed custom log handler on: %s", logger)
|
this_log.debug("Installed custom log handler on: %s", logger)
|
||||||
|
|
||||||
|
logging.getLogger("unimore_bda_6.database.cache").setLevel("INFO")
|
||||||
|
logging.getLogger("unimore_bda_6.database.datatypes").setLevel("INFO")
|
||||||
|
|
||||||
|
|
||||||
_passage_counts = collections.defaultdict(lambda: 0)
|
_passage_counts = collections.defaultdict(lambda: 0)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue