mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-22 07:54:19 +00:00
back to i have no idea of what's happening, but at least it works
This commit is contained in:
parent
e9a4421acd
commit
c31743f066
6 changed files with 42 additions and 20 deletions
|
@ -5,10 +5,11 @@
|
||||||
<option name="PARENT_ENVS" value="true" />
|
<option name="PARENT_ENVS" value="true" />
|
||||||
<envs>
|
<envs>
|
||||||
<env name="CONFIRM_OVERWRITE" value="False" />
|
<env name="CONFIRM_OVERWRITE" value="False" />
|
||||||
<env name="DATA_SET_SIZE" value="100" />
|
<env name="DATA_SET_SIZE" value="2500" />
|
||||||
<env name="NLTK_DATA" value="./data/nltk" />
|
<env name="NLTK_DATA" value="./data/nltk" />
|
||||||
<env name="PYTHONUNBUFFERED" value="1" />
|
<env name="PYTHONUNBUFFERED" value="1" />
|
||||||
<env name="WORKING_SET_SIZE" value="1000000" />
|
<env name="WORKING_SET_SIZE" value="1000000" />
|
||||||
|
<env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
|
||||||
</envs>
|
</envs>
|
||||||
<option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
|
<option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
|
||||||
<option name="SDK_NAME" value="Poetry (unimore-bda-6)" />
|
<option name="SDK_NAME" value="Poetry (unimore-bda-6)" />
|
||||||
|
|
|
@ -1,9 +1,8 @@
|
||||||
import abc
|
import abc
|
||||||
import logging
|
import logging
|
||||||
import typing as t
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
|
|
||||||
from ..database import Text, Category, Review, DatasetFunc
|
from ..database import Text, Category, DatasetFunc
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import tensorflow
|
import tensorflow
|
||||||
|
|
||||||
from ..database import Text, Category, DatasetFunc
|
from ..database import Text, Category, DatasetFunc
|
||||||
|
from ..config import DATA_SET_SIZE
|
||||||
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
|
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
|
||||||
|
|
||||||
|
|
||||||
|
@ -21,17 +22,20 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
dataset_func_with_tensor_tuple,
|
dataset_func_with_tensor_tuple,
|
||||||
output_signature=(
|
output_signature=(
|
||||||
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
|
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
|
||||||
tensorflow.TensorSpec(shape=(), dtype=tensorflow.float32, name="category"),
|
tensorflow.TensorSpec(shape=(5,), dtype=tensorflow.float32, name="category"),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def _build_model(self) -> tensorflow.keras.Sequential:
|
def _build_model(self) -> tensorflow.keras.Sequential:
|
||||||
return tensorflow.keras.Sequential([
|
return tensorflow.keras.Sequential([
|
||||||
tensorflow.keras.layers.Embedding(input_dim=self.MAX_FEATURES + 1, output_dim=self.EMBEDDING_DIM),
|
tensorflow.keras.layers.Embedding(
|
||||||
tensorflow.keras.layers.Dropout(0.2),
|
input_dim=self.MAX_FEATURES + 1,
|
||||||
|
output_dim=self.EMBEDDING_DIM,
|
||||||
|
),
|
||||||
|
# tensorflow.keras.layers.Dropout(0.2),
|
||||||
tensorflow.keras.layers.GlobalAveragePooling1D(),
|
tensorflow.keras.layers.GlobalAveragePooling1D(),
|
||||||
tensorflow.keras.layers.Dropout(0.2),
|
# tensorflow.keras.layers.Dropout(0.2),
|
||||||
tensorflow.keras.layers.Dense(1),
|
tensorflow.keras.layers.Dense(5, activation="softmax"),
|
||||||
])
|
])
|
||||||
|
|
||||||
def _build_vectorizer(self) -> tensorflow.keras.layers.TextVectorization:
|
def _build_vectorizer(self) -> tensorflow.keras.layers.TextVectorization:
|
||||||
|
@ -41,9 +45,13 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
text = tensorflow.expand_dims(text, -1) # TODO: ??????
|
text = tensorflow.expand_dims(text, -1) # TODO: ??????
|
||||||
return self.text_vectorization_layer(text), category
|
return self.text_vectorization_layer(text), category
|
||||||
|
|
||||||
MAX_FEATURES = 1000
|
MAX_FEATURES = 2500
|
||||||
EMBEDDING_DIM = 16
|
EMBEDDING_DIM = 24
|
||||||
EPOCHS = 10
|
"""
|
||||||
|
Count of possible "semantic meanings" of words, represented as dimensions of a tensor.
|
||||||
|
"""
|
||||||
|
|
||||||
|
EPOCHS = 3
|
||||||
|
|
||||||
def train(self, dataset_func: DatasetFunc) -> None:
|
def train(self, dataset_func: DatasetFunc) -> None:
|
||||||
if self.trained:
|
if self.trained:
|
||||||
|
@ -55,11 +63,10 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
self.text_vectorization_layer.adapt(only_text_set)
|
self.text_vectorization_layer.adapt(only_text_set)
|
||||||
training_set = training_set.map(self.__vectorize_data)
|
training_set = training_set.map(self.__vectorize_data)
|
||||||
|
|
||||||
self.model.compile(loss=tensorflow.keras.losses.CosineSimilarity(axis=0), metrics=["accuracy"])
|
# self.model.compile(loss=tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer="adam", metrics=["accuracy"])
|
||||||
|
self.model.compile(loss=tensorflow.keras.losses.MeanAbsoluteError(), optimizer="adam", metrics=["accuracy"])
|
||||||
|
|
||||||
history = self.model.fit(training_set, epochs=self.EPOCHS)
|
self.model.fit(training_set, epochs=self.EPOCHS)
|
||||||
|
|
||||||
...
|
|
||||||
|
|
||||||
self.trained = True
|
self.trained = True
|
||||||
|
|
||||||
|
@ -67,5 +74,15 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
if not self.trained:
|
if not self.trained:
|
||||||
raise NotTrainedError()
|
raise NotTrainedError()
|
||||||
|
|
||||||
prediction = self.model.predict(text)
|
vector = self.text_vectorization_layer(tensorflow.expand_dims(text, -1))
|
||||||
breakpoint()
|
|
||||||
|
prediction = self.model.predict(vector)
|
||||||
|
|
||||||
|
max_i = None
|
||||||
|
max_p = None
|
||||||
|
for i, p in enumerate(iter(prediction[0])):
|
||||||
|
if max_p is None or p > max_p:
|
||||||
|
max_i = i
|
||||||
|
max_p = p
|
||||||
|
|
||||||
|
return float(max_i) + 1.0
|
||||||
|
|
|
@ -41,7 +41,6 @@ def load_cache(path: str | pathlib.Path) -> DatasetFunc:
|
||||||
path = pathlib.Path(path)
|
path = pathlib.Path(path)
|
||||||
|
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
log.error("Specified cache directory does not exist: %s", path)
|
|
||||||
raise FileNotFoundError("The specified path does not exist.")
|
raise FileNotFoundError("The specified path does not exist.")
|
||||||
|
|
||||||
def data_cache_loader():
|
def data_cache_loader():
|
||||||
|
|
|
@ -33,7 +33,13 @@ class Review:
|
||||||
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
|
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
|
||||||
|
|
||||||
def to_tensor_category(self) -> tensorflow.Tensor:
|
def to_tensor_category(self) -> tensorflow.Tensor:
|
||||||
return tensorflow.convert_to_tensor(self.category / 5.0, dtype=tensorflow.float32)
|
return tensorflow.convert_to_tensor([
|
||||||
|
1.0 if self.category == 1.0 else 0.0,
|
||||||
|
1.0 if self.category == 2.0 else 0.0,
|
||||||
|
1.0 if self.category == 3.0 else 0.0,
|
||||||
|
1.0 if self.category == 4.0 else 0.0,
|
||||||
|
1.0 if self.category == 5.0 else 0.0,
|
||||||
|
], dtype=tensorflow.float32)
|
||||||
|
|
||||||
def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
|
def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
|
||||||
return (
|
return (
|
||||||
|
|
|
@ -15,7 +15,7 @@ def install_log_handler(loggers: list[logging.Logger] = None):
|
||||||
for logger in loggers:
|
for logger in loggers:
|
||||||
coloredlogs.install(
|
coloredlogs.install(
|
||||||
logger=logger,
|
logger=logger,
|
||||||
level="DEBUG",
|
level="INFO",
|
||||||
fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
|
fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
|
||||||
style="{",
|
style="{",
|
||||||
level_styles=dict(
|
level_styles=dict(
|
||||||
|
|
Loading…
Reference in a new issue