Mirror of https://github.com/Steffo99/unimore-bda-6.git

Now I understand text vectorization (but this still does not work)

Steffo 2023-02-06 01:12:30 +01:00
parent 3abba24ca2
commit e9a4421acd
Signed by: steffo
GPG key ID: 2A24051445686895
2 changed files with 36 additions and 45 deletions

Changed file 1 (PyCharm run configuration XML):

@@ -5,7 +5,7 @@
     <option name="PARENT_ENVS" value="true" />
     <envs>
       <env name="CONFIRM_OVERWRITE" value="False" />
-      <env name="DATA_SET_SIZE" value="750" />
+      <env name="DATA_SET_SIZE" value="100" />
       <env name="NLTK_DATA" value="./data/nltk" />
       <env name="PYTHONUNBUFFERED" value="1" />
       <env name="WORKING_SET_SIZE" value="1000000" />

Changed file 2 (the TensorFlow sentiment analyzer module):

@@ -1,47 +1,23 @@
 import tensorflow
-import itertools
-import typing as t
 
-from ..database import Text, Category, Review, DatasetFunc
-from ..tokenizer import BaseTokenizer
+from ..database import Text, Category, DatasetFunc
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 
 
 class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
-    def __init__(self, *, tokenizer: BaseTokenizer):
+    def __init__(self, *args, **kwargs):
         super().__init__()
-        self.trained = False
-        self.neural_network: tensorflow.keras.Sequential | None = None
-        self.tokenizer: BaseTokenizer = tokenizer
-
-    MAX_FEATURES = 20000
-    EMBEDDING_DIM = 16
-    EPOCHS = 10
-
-    def train(self, dataset_func: DatasetFunc) -> None:
-        if self.trained:
-            raise AlreadyTrainedError()
-
-        def dataset_func_with_tensor_text():
-            for review in dataset_func():
-                yield review.to_tensor_text()
-
-        text_set = tensorflow.data.Dataset.from_generator(
-            dataset_func_with_tensor_text,
-            output_signature=tensorflow.TensorSpec(shape=(), dtype=tensorflow.string)
-        )
-
-        text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
-            max_tokens=self.MAX_FEATURES,
-            standardize=self.tokenizer.tokenize_tensorflow,
-        )
-        text_vectorization_layer.adapt(text_set)
+        self.trained: bool = False
+        # TODO
+        self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_vectorizer()
+        self.model: tensorflow.keras.Sequential = self._build_model()
 
+    def _build_dataset(self, dataset_func: DatasetFunc) -> tensorflow.data.Dataset:
         def dataset_func_with_tensor_tuple():
             for review in dataset_func():
                 yield review.to_tensor_tuple()
 
-        training_set = tensorflow.data.Dataset.from_generator(
+        return tensorflow.data.Dataset.from_generator(
             dataset_func_with_tensor_tuple,
             output_signature=(
                 tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
@@ -49,26 +25,41 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
             )
         )
 
-        # I have no idea of what I'm doing here
-        self.neural_network = tensorflow.keras.Sequential([
-            tensorflow.keras.layers.Embedding(self.MAX_FEATURES + 1, self.EMBEDDING_DIM),
+    def _build_model(self) -> tensorflow.keras.Sequential:
+        return tensorflow.keras.Sequential([
+            tensorflow.keras.layers.Embedding(input_dim=self.MAX_FEATURES + 1, output_dim=self.EMBEDDING_DIM),
             tensorflow.keras.layers.Dropout(0.2),
             tensorflow.keras.layers.GlobalAveragePooling1D(),
             tensorflow.keras.layers.Dropout(0.2),
             tensorflow.keras.layers.Dense(1),
         ])
 
-        self.neural_network.compile(
-            loss=tensorflow.losses.BinaryCrossentropy(from_logits=True),  # Only works with two tags
-            metrics=tensorflow.metrics.BinaryAccuracy(threshold=0.0)
-        )
-
-        training_set = training_set.map(text_vectorization_layer)
-
-        self.neural_network.fit(
-            training_set,
-            epochs=self.EPOCHS,
-        )
+    def _build_vectorizer(self) -> tensorflow.keras.layers.TextVectorization:
+        return tensorflow.keras.layers.TextVectorization(max_tokens=self.MAX_FEATURES)
+
+    def __vectorize_data(self, text, category):
+        text = tensorflow.expand_dims(text, -1)  # TODO: ??????
+        return self.text_vectorization_layer(text), category
+
+    MAX_FEATURES = 1000
+    EMBEDDING_DIM = 16
+    EPOCHS = 10
+
+    def train(self, dataset_func: DatasetFunc) -> None:
+        if self.trained:
+            raise AlreadyTrainedError()
+
+        training_set = self._build_dataset(dataset_func)
+        only_text_set = training_set.map(lambda text, category: text)
+        self.text_vectorization_layer.adapt(only_text_set)
+        training_set = training_set.map(self.__vectorize_data)
+
+        self.model.compile(loss=tensorflow.keras.losses.CosineSimilarity(axis=0), metrics=["accuracy"])
+        history = self.model.fit(training_set, epochs=self.EPOCHS)
+        ...
 
         self.trained = True
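
One plausible reason the commit message still reports failure: the model ends in a single Dense(1) logit, yet the new compile call pairs it with CosineSimilarity(axis=0), which compares vectors along the batch axis rather than scoring each 0/1 label individually. A minimal sketch of the conventional pairing for this kind of head, close to what the removed lines above used (the stand-in model is illustrative, not the project's):

import tensorflow

# Illustrative stand-in with the same single-logit head as the model in this diff.
model = tensorflow.keras.Sequential([
    tensorflow.keras.layers.Embedding(input_dim=1001, output_dim=16),
    tensorflow.keras.layers.GlobalAveragePooling1D(),
    tensorflow.keras.layers.Dense(1),
])

# Conventional loss/metric pairing for one logit and 0/1 labels, as in the
# removed compile() call above, plus an explicit optimizer.
model.compile(
    optimizer="adam",
    loss=tensorflow.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tensorflow.keras.metrics.BinaryAccuracy(threshold=0.0)],
)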
@@ -76,5 +67,5 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
         if not self.trained:
             raise NotTrainedError()
 
-        prediction = self.neural_network.predict(text)
+        prediction = self.model.predict(text)
         breakpoint()
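
For reference, the adapt-then-map pattern this commit converges on usually looks like the self-contained sketch below (toy strings and labels invented for illustration, not the project's pipeline). It also covers the inference side: since the vectorization layer is no longer part of the model, the raw text handed to predict() above would need to pass through the same layer first.

import tensorflow

MAX_FEATURES = 1000  # mirrors the constant in the diff
EMBEDDING_DIM = 16

# Toy (text, label) pairs standing in for the review dataset.
texts = ["good product", "terrible quality", "works great", "broke instantly"]
labels = [[1.0], [0.0], [1.0], [0.0]]
dataset = tensorflow.data.Dataset.from_tensor_slices((texts, labels)).batch(2)

# Learn the vocabulary from the text column only, then vectorize every batch.
vectorizer = tensorflow.keras.layers.TextVectorization(max_tokens=MAX_FEATURES)
vectorizer.adapt(dataset.map(lambda text, label: text))
vectorized = dataset.map(lambda text, label: (vectorizer(text), label))

model = tensorflow.keras.Sequential([
    tensorflow.keras.layers.Embedding(input_dim=MAX_FEATURES + 1, output_dim=EMBEDDING_DIM),
    tensorflow.keras.layers.GlobalAveragePooling1D(),
    tensorflow.keras.layers.Dense(1),
])
model.compile(optimizer="adam", loss=tensorflow.keras.losses.BinaryCrossentropy(from_logits=True))
model.fit(vectorized, epochs=3)

# At inference time the raw string must be vectorized the same way.
logits = model.predict(vectorizer(tensorflow.constant(["good product"])))

An alternative worth noting: prepend the adapted TextVectorization layer to the Sequential model itself, so that predict() can accept raw strings directly.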