1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-22 16:04:18 +00:00
bda-6-steffo/unimore_bda_6/analysis/tf_text.py

89 lines
3.2 KiB
Python
Raw Normal View History

2023-02-04 00:36:42 +00:00
import tensorflow
from ..database import Text, Category, DatasetFunc
from ..config import DATA_SET_SIZE
2023-02-04 00:36:42 +00:00
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
    """
    Sentiment analyzer built on a small Keras ``Sequential`` model.

    The model is an embedding layer followed by global average pooling and a
    softmax ``Dense`` layer with one output per category. Text is converted to
    token ids by a ``TextVectorization`` layer that is adapted to the training
    set in `.train`.
    """

    # Vocabulary cap handed to the TextVectorization layer as ``max_tokens``;
    # the embedding table is sized ``MAX_FEATURES + 1``.
    MAX_FEATURES = 2500

    # Count of possible "semantic meanings" of words, represented as dimensions of a tensor.
    EMBEDDING_DIM = 24

    # Number of passes over the training set made by `.train`.
    EPOCHS = 3

    # Length of the category vector in the dataset signature and number of
    # units in the output layer; both must agree, so they share this constant.
    CATEGORY_COUNT = 5

    def __init__(self, *args, **kwargs):
        # NOTE(review): positional and keyword arguments are accepted but are
        # not forwarded to the base class — confirm this is intentional.
        super().__init__()
        self.trained: bool = False
        self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer()
        self.model: tensorflow.keras.Sequential = self._build_model()

    def _build_dataset(self, dataset_func: DatasetFunc) -> tensorflow.data.Dataset:
        """
        Wrap ``dataset_func`` in a `tensorflow.data.Dataset` of ``(text, category)`` pairs.

        :param dataset_func: Callable taking no arguments and returning an iterable of
            objects exposing ``to_tensor_tuple()``.
        :return: A dataset whose elements are a scalar string tensor and a
            ``float32`` vector of length `.CATEGORY_COUNT`.
        """

        def dataset_func_with_tensor_tuple():
            # The generator function itself is handed to TensorFlow, so
            # ``dataset_func`` is not invoked by this method directly.
            for review in dataset_func():
                yield review.to_tensor_tuple()

        return tensorflow.data.Dataset.from_generator(
            dataset_func_with_tensor_tuple,
            output_signature=(
                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
                tensorflow.TensorSpec(shape=(self.CATEGORY_COUNT,), dtype=tensorflow.float32, name="category"),
            ),
        )

    def _build_model(self) -> tensorflow.keras.Sequential:
        """
        Create the (still uncompiled) classification model.

        :return: Embedding -> global average pooling -> softmax over `.CATEGORY_COUNT` outputs.
        """
        return tensorflow.keras.Sequential([
            tensorflow.keras.layers.Embedding(
                input_dim=self.MAX_FEATURES + 1,
                output_dim=self.EMBEDDING_DIM,
            ),
            tensorflow.keras.layers.GlobalAveragePooling1D(),
            tensorflow.keras.layers.Dense(self.CATEGORY_COUNT, activation="softmax"),
        ])

    def _build_vectorizer(self) -> tensorflow.keras.layers.TextVectorization:
        """
        Create the text vectorization layer, limited to `.MAX_FEATURES` tokens.
        """
        return tensorflow.keras.layers.TextVectorization(max_tokens=self.MAX_FEATURES)

    def __vectorize_data(self, text, category):
        """
        Map a ``(text, category)`` dataset element to ``(token ids, category)``.
        """
        # ``text`` is declared as a scalar string in `._build_dataset`; the
        # trailing axis turns it into a one-element vector before vectorization.
        # NOTE(review): presumably this stands in for a batch dimension, since
        # the dataset is never batched — confirm.
        text = tensorflow.expand_dims(text, -1)
        return self.text_vectorization_layer(text), category

    def train(self, dataset_func: DatasetFunc) -> None:
        """
        Adapt the vectorizer to the training texts, then fit the model on them.

        :param dataset_func: Source of the training samples, see `._build_dataset`.
        :raises AlreadyTrainedError: If this analyzer has already been trained.
        """
        if self.trained:
            raise AlreadyTrainedError()

        training_set = self._build_dataset(dataset_func)

        # The vectorizer learns its vocabulary from the texts alone.
        only_text_set = training_set.map(lambda text, category: text)
        self.text_vectorization_layer.adapt(only_text_set)

        training_set = training_set.map(self.__vectorize_data)

        # NOTE(review): mean absolute error is an unusual loss for a softmax
        # classifier; a categorical cross-entropy may be a better fit. Left
        # unchanged here so that training behaviour stays the same.
        self.model.compile(loss=tensorflow.keras.losses.MeanAbsoluteError(), optimizer="adam", metrics=["accuracy"])
        self.model.fit(training_set, epochs=self.EPOCHS)

        # Only flagged once fitting completed without raising.
        self.trained = True

    def use(self, text: Text) -> Category:
        """
        Predict the category of a single text.

        :param text: The text to classify.
        :return: One-based index of the highest-scoring model output, as a float
            (``1.0`` for the first output, ``2.0`` for the second, and so on).
        :raises NotTrainedError: If `.train` has not completed yet.
        """
        if not self.trained:
            raise NotTrainedError()

        vector = self.text_vectorization_layer(tensorflow.expand_dims(text, -1))
        prediction = self.model.predict(vector)

        # Only one text was submitted, so only the first row is of interest.
        scores = prediction[0]
        # ``max`` keeps the first of several equal maxima, like a strict ``>`` scan.
        best_index = max(range(len(scores)), key=lambda index: scores[index])
        return float(best_index) + 1.0