# Mirror of https://github.com/Steffo99/unimore-bda-6.git
# File: unimore_bda_6/analysis/tf_text.py
import abc
import typing as t
import numpy
2023-02-04 00:36:42 +00:00
import tensorflow
2023-02-08 09:54:14 +00:00
import logging
2023-02-04 00:36:42 +00:00
2023-02-12 04:11:58 +00:00
from ..database import CachedDatasetFunc, TextReview, TokenizedReview
2023-02-08 09:54:14 +00:00
from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS
from ..tokenizer import BaseTokenizer
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError
log = logging.getLogger(__name__)
2023-02-04 00:36:42 +00:00
2023-02-08 18:46:05 +00:00
# Report at import time whether hardware acceleration is available,
# since training is dramatically slower without it.
gpus = tensorflow.config.list_physical_devices(device_type="GPU")
if gpus:
    log.debug("Tensorflow successfully found GPU acceleration!")
else:
    log.warning("Tensorflow reports no GPU acceleration available.")
2023-02-12 04:11:58 +00:00
# Type of a function converting a TextReview into the tensor(s) fed to a model.
ConversionFunc = t.Callable[[TextReview], tensorflow.Tensor | tuple]
2023-02-09 17:54:58 +00:00
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
    """
    Base class for a sentiment analyzer using `tensorflow`.

    Subclasses must implement `._build_model`, `._rating_to_input`,
    `._ratingtensor_shape` and `._prediction_to_rating`.
    """

    def __init__(self, *, tokenizer: BaseTokenizer):
        super().__init__(tokenizer=tokenizer)

        # Set to True by a successful `.train`; `.use` refuses to run before that.
        self.trained: bool = False
        # Set to True if training aborted early (e.g. NaN loss); the model is then unusable.
        self.failed: bool = False

        # Layer mapping token strings to integer indices; adapted to the
        # training vocabulary inside `.train`.
        self.string_lookup_layer: tensorflow.keras.layers.StringLookup = tensorflow.keras.layers.StringLookup(max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__)
        self.model: tensorflow.keras.Sequential = self._build_model()
        # Keras training history, filled in by `.train`.
        self.history: tensorflow.keras.callbacks.History | None = None

    @abc.abstractmethod
    def _build_model(self) -> tensorflow.keras.Sequential:
        """
        Create the `tensorflow.keras.Sequential` model that should be executed by this sentiment analyzer.
        """
        raise NotImplementedError()

    def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
        """
        Create a `tensorflow.data.Dataset` from the given `CachedDatasetFunc`.

        Each element of the dataset is a ``(tokens, rating)`` pair of tensors.
        """

        def dataset_generator():
            for text_review in dataset_func():
                tokenized: TokenizedReview = self.tokenizer.tokenize_review(text_review)
                tokens: tensorflow.Tensor = self._tokens_to_tensor(tokenized.tokens)
                rating: tensorflow.Tensor = self._rating_to_input(tokenized.rating)
                yield tokens, rating

        log.debug("Creating dataset...")
        dataset = tensorflow.data.Dataset.from_generator(
            dataset_generator,
            output_signature=(
                tensorflow.TensorSpec(shape=(1, None,), dtype=tensorflow.string, name="tokens"),
                # The rating tensor shape depends on the concrete subclass.
                self._ratingtensor_shape(),
            ),
        )

        log.debug("Caching dataset...")
        dataset = dataset.cache()

        log.debug("Configuring dataset prefetch...")
        dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)

        return dataset

    def _adapt_textvectorization(self, dataset: tensorflow.data.Dataset) -> None:
        """
        Adapt the `.string_lookup_layer` to the given ``(tokens, rating)`` dataset.
        """
        log.debug("Preparing dataset to adapt %s...", self.string_lookup_layer)
        # Drop the ratings: only the token tensors form the vocabulary.
        dataset = dataset.map(lambda text, category: text)

        log.debug("Adapting %s...", self.string_lookup_layer)
        self.string_lookup_layer.adapt(dataset)

    def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
        """
        Train the model on the training set, validating on the validation set.

        :raises AlreadyTrainedError: if the model was already trained (successfully or not).
        :raises TrainingFailedError: if training stopped before completing all epochs.
        """
        if self.failed:
            log.error("Tried to train a failed model.")
            raise AlreadyTrainedError("Cannot re-train a failed model.")
        if self.trained:
            log.error("Tried to train an already trained model.")
            raise AlreadyTrainedError("Cannot re-train an already trained model.")

        log.debug("Building training dataset...")
        training_set = self._build_dataset(training_dataset_func)

        log.debug("Building validation dataset...")
        validation_set = self._build_dataset(validation_dataset_func)

        # Fix: reuse the dedicated helper instead of duplicating its
        # map-and-adapt logic inline.
        log.debug("Adapting lookup layer to the vocabulary...")
        self._adapt_textvectorization(training_set)

        log.info("Training: %s", self.model)
        self.history = self.model.fit(
            training_set,
            validation_data=validation_set,
            epochs=TENSORFLOW_EPOCHS.__wrapped__,
            callbacks=[
                # Abort training immediately if the loss diverges to NaN.
                tensorflow.keras.callbacks.TerminateOnNaN()
            ],
        )

        # Fewer epochs than requested means TerminateOnNaN fired: training diverged.
        if len(self.history.epoch) < TENSORFLOW_EPOCHS.__wrapped__:
            log.error("Model %s training failed: only %d epochs computed", self.model, len(self.history.epoch))
            self.failed = True
            raise TrainingFailedError()
        else:
            log.info("Model %s training succeeded!", self.model)
            self.trained = True

    @staticmethod
    def _tokens_to_tensor(tokens: t.Iterator[str]) -> tensorflow.Tensor:
        """
        Convert an iterator of tokens to a `tensorflow.Tensor` of shape ``(1, n_tokens)``.
        """
        tensor = tensorflow.convert_to_tensor(
            [list(tokens)],
            dtype=tensorflow.string,
            name="tokens"
        )
        return tensor

    def use(self, text: str) -> float:
        """
        Tokenize the given text and return the rating predicted by the trained model.

        :raises NotTrainedError: if the model has not been successfully trained yet.
        """
        if self.failed:
            raise NotTrainedError("Cannot use a failed model.")
        if not self.trained:
            raise NotTrainedError("Cannot use a non-trained model.")

        tokens = self.tokenizer.tokenize(text)
        tokens = self._tokens_to_tensor(tokens)
        prediction = self.model.predict(tokens, verbose=False)
        rating = self._prediction_to_rating(prediction)
        return rating

    @abc.abstractmethod
    def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
        """
        Convert a review rating to the `tensorflow.Tensor` the model is trained against.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
        """
        Return the `tensorflow.TensorSpec` of the tensor produced by `._rating_to_input`.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
        """
        Convert the results of `tensorflow.keras.Sequential.predict` into a review rating.
        """
        raise NotImplementedError()
2023-02-09 17:54:58 +00:00
class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
    """
    A `tensorflow`-based sentiment analyzer that considers each star rating as a separate category.
    """

    def _build_model(self) -> tensorflow.keras.Sequential:
        """
        Build the categorizer network: lookup -> embedding -> average pooling
        -> dense, ending in a 5-way softmax (one output per star rating).
        """
        log.debug("Creating sequential categorizer model...")
        model = tensorflow.keras.Sequential([
            self.string_lookup_layer,
            tensorflow.keras.layers.Embedding(
                # +1 accounts for the OOV index added by the StringLookup layer.
                input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
                output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
            ),
            tensorflow.keras.layers.Dropout(0.25),
            tensorflow.keras.layers.GlobalAveragePooling1D(),
            tensorflow.keras.layers.Dropout(0.25),
            tensorflow.keras.layers.Dense(8),
            tensorflow.keras.layers.Dropout(0.25),
            tensorflow.keras.layers.Dense(5, activation="softmax"),
        ])

        log.debug("Compiling model: %s", model)
        model.compile(
            # clipnorm keeps gradient norms bounded to reduce divergence.
            optimizer=tensorflow.keras.optimizers.Adam(clipnorm=1.0),
            loss=tensorflow.keras.losses.CategoricalCrossentropy(),
            metrics=[
                tensorflow.keras.metrics.CategoricalAccuracy(),
            ]
        )

        log.debug("Compiled model: %s", model)
        return model

    def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
        """
        One-hot encode the star rating into a ``(1, 5)`` float tensor.
        """
        tensor = tensorflow.convert_to_tensor(
            # Comprehension replaces five duplicated hand-written lines.
            [[1.0 if rating == star else 0.0 for star in (1.0, 2.0, 3.0, 4.0, 5.0)]],
            dtype=tensorflow.float32,
            name="rating_one_hot"
        )
        return tensor

    def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
        """
        Shape of the one-hot rating tensor produced by `._rating_to_input`.
        """
        spec = tensorflow.TensorSpec(shape=(1, 5), dtype=tensorflow.float32, name="rating_one_hot")
        return spec

    def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
        """
        Return the star rating whose softmax output is highest.

        Fix: the original hand-rolled argmax loop shadowed the ``prediction``
        parameter with its own loop variable; `numpy.argmax` (first-max, same
        tie-breaking as the original strict ``>`` comparison) replaces it.
        """
        return float(numpy.argmax(prediction[0])) + 1.0
2023-02-09 17:54:58 +00:00
2023-02-10 04:52:13 +00:00
class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
    """
    A `tensorflow`-based sentiment analyzer that uses the floating point value rating to get as close as possible to the correct category.
    """

    def _build_model(self) -> tensorflow.keras.Sequential:
        """
        Build the regressor network: lookup -> embedding -> average pooling
        -> dense, ending in a single sigmoid output in ``[0, 1]``.
        """
        log.debug("Creating sequential categorizer model...")
        model = tensorflow.keras.Sequential([
            self.string_lookup_layer,
            tensorflow.keras.layers.Embedding(
                # +1 accounts for the OOV index added by the StringLookup layer.
                input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
                output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
            ),
            tensorflow.keras.layers.Dropout(0.25),
            tensorflow.keras.layers.GlobalAveragePooling1D(),
            tensorflow.keras.layers.Dropout(0.25),
            tensorflow.keras.layers.Dense(8),
            tensorflow.keras.layers.Dropout(0.25),
            tensorflow.keras.layers.Dense(1, activation=tensorflow.keras.activations.sigmoid),
        ])

        log.debug("Compiling model: %s", model)
        model.compile(
            # clipnorm keeps gradient norms bounded to reduce divergence.
            optimizer=tensorflow.keras.optimizers.Adam(clipnorm=1.0),
            loss=tensorflow.keras.losses.MeanAbsoluteError(),
        )

        log.debug("Compiled model: %s", model)
        return model

    def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
        """
        Normalize the 1..5 star rating into ``[0, 1]`` and wrap it in a ``(1,)`` tensor.
        """
        normalized_rating = (rating - 1) / 4
        tensor = tensorflow.convert_to_tensor(
            [normalized_rating],
            dtype=tensorflow.float32,
            name="rating_value"
        )
        return tensor

    def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
        """
        Shape of the normalized rating tensor produced by `._rating_to_input`.
        """
        spec = tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="rating_value")
        return spec

    def _prediction_to_rating(self, prediction: numpy.ndarray) -> float:
        """
        Snap the sigmoid output to the nearest pole: 1 star if below 0.5, else 5 stars.

        Fix: parameter annotation was `numpy.array` (a factory function);
        the correct type is `numpy.ndarray`.
        """
        rating: float = prediction[0, 0]
        rating = 1.0 if rating < 0.5 else 5.0
        return rating
2023-02-10 04:52:13 +00:00
2023-02-09 17:54:58 +00:00
# Public API of this module.
__all__ = (
    "TensorflowSentimentAnalyzer",
    "TensorflowCategorySentimentAnalyzer",
    "TensorflowPolarSentimentAnalyzer",
)