2023-02-04 00:36:42 +00:00
|
|
|
import tensorflow
|
|
|
|
|
2023-02-06 00:12:30 +00:00
|
|
|
from ..database import Text, Category, DatasetFunc
|
2023-02-07 09:22:09 +00:00
|
|
|
from ..config import DATA_SET_SIZE
|
2023-02-04 00:36:42 +00:00
|
|
|
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
|
|
|
|
|
|
|
|
|
|
|
|
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
    """
    Sentiment analyzer backed by a small Keras model.

    Text is tokenized by a ``TextVectorization`` layer, embedded, averaged over the token axis
    and classified into one of five categories by a softmax layer.

    An instance can be trained exactly once with `train`; afterwards `use` predicts the category
    of a single text.
    """

    # Upper bound on the vocabulary size learned by the text vectorization layer.
    MAX_FEATURES = 2500

    # Count of possible "semantic meanings" of words, represented as dimensions of a tensor.
    EMBEDDING_DIM = 24

    # Number of passes over the training set performed by `train`.
    EPOCHS = 3

    def __init__(self, *args, **kwargs):
        # NOTE(review): positional and keyword arguments are accepted but not forwarded to the base class.
        super().__init__()
        self.trained: bool = False
        self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer()
        self.model: tensorflow.keras.Sequential = self._build_model()

    def _build_dataset(self, dataset_func: DatasetFunc) -> tensorflow.data.Dataset:
        """
        Wrap `dataset_func` in a `tensorflow.data.Dataset` of ``(text, category)`` pairs.

        The dataset is backed by a generator, so `dataset_func` is called again every time the
        dataset is iterated.
        """

        def dataset_func_with_tensor_tuple():
            for review in dataset_func():
                yield review.to_tensor_tuple()

        return tensorflow.data.Dataset.from_generator(
            dataset_func_with_tensor_tuple,
            output_signature=(
                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
                tensorflow.TensorSpec(shape=(5,), dtype=tensorflow.float32, name="category"),
            ),
        )

    def _build_model(self) -> tensorflow.keras.Sequential:
        """
        Create the (not yet compiled) classification model.

        Dropout layers around the pooling layer were tried and are currently disabled.
        """
        return tensorflow.keras.Sequential([
            tensorflow.keras.layers.Embedding(
                # One extra row on top of the vocabulary limit of the vectorizer.
                input_dim=self.MAX_FEATURES + 1,
                output_dim=self.EMBEDDING_DIM,
            ),
            tensorflow.keras.layers.GlobalAveragePooling1D(),
            tensorflow.keras.layers.Dense(5, activation="softmax"),
        ])

    def _build_vectorizer(self) -> tensorflow.keras.layers.TextVectorization:
        """
        Create the text vectorization layer; its vocabulary is learned later, in `train`.
        """
        return tensorflow.keras.layers.TextVectorization(max_tokens=self.MAX_FEATURES)

    def __vectorize_data(self, text, category):
        """
        Map a ``(text, category)`` dataset element to ``(token ids, category)``.
        """
        # The dataset yields scalar strings (shape ``()``); add a trailing axis so the layer
        # receives a rank-1 tensor. Presumably required by the vectorization layer - confirm.
        text = tensorflow.expand_dims(text, -1)
        return self.text_vectorization_layer(text), category

    def train(self, dataset_func: DatasetFunc) -> None:
        """
        Fit the vectorizer vocabulary and the model on the reviews produced by `dataset_func`.

        :raises AlreadyTrainedError: If this analyzer has been trained before.
        """
        if self.trained:
            raise AlreadyTrainedError()

        training_set = self._build_dataset(dataset_func)

        # The vocabulary is learned from the texts alone, so drop the categories first.
        only_text_set = training_set.map(lambda text, category: text)
        self.text_vectorization_layer.adapt(only_text_set)
        training_set = training_set.map(self.__vectorize_data)

        # NOTE(review): mean absolute error is an unusual loss for a softmax classifier; categorical
        # cross-entropy is the conventional choice if the categories are one-hot encoded. Confirm the
        # encoding produced by `to_tensor_tuple` before changing it.
        self.model.compile(loss=tensorflow.keras.losses.MeanAbsoluteError(), optimizer="adam", metrics=["accuracy"])
        self.model.fit(training_set, epochs=self.EPOCHS)

        # Only mark the analyzer as trained once fitting completed without raising.
        self.trained = True

    def use(self, text: Text) -> Category:
        """
        Predict the category of `text`.

        :returns: The position of the highest model output, shifted to start at ``1.0``.
        :raises NotTrainedError: If `train` has not been completed yet.
        """
        if not self.trained:
            raise NotTrainedError()

        vector = self.text_vectorization_layer(tensorflow.expand_dims(text, -1))
        prediction = self.model.predict(vector)

        # A single text was submitted, so only the first row of the prediction is relevant.
        # `argmax` returns the first position of the maximum, like the manual scan it replaces.
        best_index = int(prediction[0].argmax())

        return float(best_index) + 1.0
|