mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-22 07:54:19 +00:00
back to i have no idea of what's happening, but at least it works
This commit is contained in:
parent
e9a4421acd
commit
c31743f066
6 changed files with 42 additions and 20 deletions
|
@ -5,10 +5,11 @@
|
|||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="CONFIRM_OVERWRITE" value="False" />
|
||||
<env name="DATA_SET_SIZE" value="100" />
|
||||
<env name="DATA_SET_SIZE" value="2500" />
|
||||
<env name="NLTK_DATA" value="./data/nltk" />
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
<env name="WORKING_SET_SIZE" value="1000000" />
|
||||
<env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
|
||||
<option name="SDK_NAME" value="Poetry (unimore-bda-6)" />
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
import abc
|
||||
import logging
|
||||
import typing as t
|
||||
import dataclasses
|
||||
|
||||
from ..database import Text, Category, Review, DatasetFunc
|
||||
from ..database import Text, Category, DatasetFunc
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import tensorflow
|
||||
|
||||
from ..database import Text, Category, DatasetFunc
|
||||
from ..config import DATA_SET_SIZE
|
||||
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
|
||||
|
||||
|
||||
|
@ -21,17 +22,20 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
|||
dataset_func_with_tensor_tuple,
|
||||
output_signature=(
|
||||
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
|
||||
tensorflow.TensorSpec(shape=(), dtype=tensorflow.float32, name="category"),
|
||||
tensorflow.TensorSpec(shape=(5,), dtype=tensorflow.float32, name="category"),
|
||||
)
|
||||
)
|
||||
|
||||
def _build_model(self) -> tensorflow.keras.Sequential:
|
||||
return tensorflow.keras.Sequential([
|
||||
tensorflow.keras.layers.Embedding(input_dim=self.MAX_FEATURES + 1, output_dim=self.EMBEDDING_DIM),
|
||||
tensorflow.keras.layers.Dropout(0.2),
|
||||
tensorflow.keras.layers.Embedding(
|
||||
input_dim=self.MAX_FEATURES + 1,
|
||||
output_dim=self.EMBEDDING_DIM,
|
||||
),
|
||||
# tensorflow.keras.layers.Dropout(0.2),
|
||||
tensorflow.keras.layers.GlobalAveragePooling1D(),
|
||||
tensorflow.keras.layers.Dropout(0.2),
|
||||
tensorflow.keras.layers.Dense(1),
|
||||
# tensorflow.keras.layers.Dropout(0.2),
|
||||
tensorflow.keras.layers.Dense(5, activation="softmax"),
|
||||
])
|
||||
|
||||
def _build_vectorizer(self) -> tensorflow.keras.layers.TextVectorization:
|
||||
|
@ -41,9 +45,13 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
|||
text = tensorflow.expand_dims(text, -1) # TODO: ??????
|
||||
return self.text_vectorization_layer(text), category
|
||||
|
||||
MAX_FEATURES = 1000
|
||||
EMBEDDING_DIM = 16
|
||||
EPOCHS = 10
|
||||
MAX_FEATURES = 2500
|
||||
EMBEDDING_DIM = 24
|
||||
"""
|
||||
Count of possible "semantic meanings" of words, represented as dimensions of a tensor.
|
||||
"""
|
||||
|
||||
EPOCHS = 3
|
||||
|
||||
def train(self, dataset_func: DatasetFunc) -> None:
|
||||
if self.trained:
|
||||
|
@ -55,11 +63,10 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
|||
self.text_vectorization_layer.adapt(only_text_set)
|
||||
training_set = training_set.map(self.__vectorize_data)
|
||||
|
||||
self.model.compile(loss=tensorflow.keras.losses.CosineSimilarity(axis=0), metrics=["accuracy"])
|
||||
# self.model.compile(loss=tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer="adam", metrics=["accuracy"])
|
||||
self.model.compile(loss=tensorflow.keras.losses.MeanAbsoluteError(), optimizer="adam", metrics=["accuracy"])
|
||||
|
||||
history = self.model.fit(training_set, epochs=self.EPOCHS)
|
||||
|
||||
...
|
||||
self.model.fit(training_set, epochs=self.EPOCHS)
|
||||
|
||||
self.trained = True
|
||||
|
||||
|
@ -67,5 +74,15 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
|||
if not self.trained:
|
||||
raise NotTrainedError()
|
||||
|
||||
prediction = self.model.predict(text)
|
||||
breakpoint()
|
||||
vector = self.text_vectorization_layer(tensorflow.expand_dims(text, -1))
|
||||
|
||||
prediction = self.model.predict(vector)
|
||||
|
||||
max_i = None
|
||||
max_p = None
|
||||
for i, p in enumerate(iter(prediction[0])):
|
||||
if max_p is None or p > max_p:
|
||||
max_i = i
|
||||
max_p = p
|
||||
|
||||
return float(max_i) + 1.0
|
||||
|
|
|
@ -41,7 +41,6 @@ def load_cache(path: str | pathlib.Path) -> DatasetFunc:
|
|||
path = pathlib.Path(path)
|
||||
|
||||
if not path.exists():
|
||||
log.error("Specified cache directory does not exist: %s", path)
|
||||
raise FileNotFoundError("The specified path does not exist.")
|
||||
|
||||
def data_cache_loader():
|
||||
|
|
|
@ -33,7 +33,13 @@ class Review:
|
|||
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
|
||||
|
||||
def to_tensor_category(self) -> tensorflow.Tensor:
|
||||
return tensorflow.convert_to_tensor(self.category / 5.0, dtype=tensorflow.float32)
|
||||
return tensorflow.convert_to_tensor([
|
||||
1.0 if self.category == 1.0 else 0.0,
|
||||
1.0 if self.category == 2.0 else 0.0,
|
||||
1.0 if self.category == 3.0 else 0.0,
|
||||
1.0 if self.category == 4.0 else 0.0,
|
||||
1.0 if self.category == 5.0 else 0.0,
|
||||
], dtype=tensorflow.float32)
|
||||
|
||||
def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
|
||||
return (
|
||||
|
|
|
@ -15,7 +15,7 @@ def install_log_handler(loggers: list[logging.Logger] = None):
|
|||
for logger in loggers:
|
||||
coloredlogs.install(
|
||||
logger=logger,
|
||||
level="DEBUG",
|
||||
level="INFO",
|
||||
fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
|
||||
style="{",
|
||||
level_styles=dict(
|
||||
|
|
Loading…
Reference in a new issue