Add polar model

2024-11-22 07:54:19 +00:00 · 2023-02-10 05:52:13 +01:00 · 2023-02-10 05:52:13 +01:00 · c979699ff1
commit c979699ff1
parent 4f40aa44b4
5 changed files with 68 additions and 11 deletions
--- a/.idea/runConfigurations/unimore_bda_6.xml
+++ b/.idea/runConfigurations/unimore_bda_6.xml
@ -4,10 +4,14 @@
    <option name="INTERPRETER_OPTIONS" value="" />
    <option name="PARENT_ENVS" value="true" />
    <envs>
-      <env name="PYTHONUNBUFFERED" value="1" />
      <env name="CONFIRM_OVERWRITE" value="False" />
      <env name="NLTK_DATA" value="./data/nltk" />
+      <env name="PYTHONUNBUFFERED" value="1" />
+      <env name="TENSORFLOW_EPOCHS" value="4" />
+      <env name="EVALUATION_SET_SIZE" value="100" />
      <env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
+      <env name="TRAINING_SET_SIZE" value="1000" />
+      <env name="VALIDATION_SET_SIZE" value="100" />
      <env name="WORKING_SET_SIZE" value="1000000" />
      <env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
    </envs>
--- a/unimore_bda_6/main.py
+++ b/unimore_bda_6/main.py
@ -6,8 +6,7 @@ install_general_log_handlers()

 from .config import config
 from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
-from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
-from .analysis.tf_text import TensorflowCategorySentimentAnalyzer
+from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
 from .analysis.base import TrainingFailedError
 from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
 from .gathering import Caches
@ -39,6 +38,7 @@ def main():
            slog.debug("Selected sample_func: %s", sample_func.__name__)

            for SentimentAnalyzer in [
+                TensorflowPolarSentimentAnalyzer,
                TensorflowCategorySentimentAnalyzer,
                NLTKSentimentAnalyzer,
            ]:
--- a/unimore_bda_6/analysis/base.py
+++ b/unimore_bda_6/analysis/base.py
@ -50,7 +50,7 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
        for review in evaluation_dataset_func():
            resulting_category = self.use(review.text)
            evaluated += 1
-            correct += 1 if resulting_category == review.category else 0
+            correct += 1 if round(resulting_category) == round(review.category) else 0
            score += 1 - (abs(resulting_category - review.category) / 4)

        return EvaluationResults(correct=correct, evaluated=evaluated, score=score)
--- a/unimore_bda_6/analysis/tf_text.py
+++ b/unimore_bda_6/analysis/tf_text.py
@ -164,7 +164,7 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
        vector = self.text_vectorization_layer(text)
        prediction = self.model.predict(vector, verbose=False)

-        return prediction
+        return self._translate_prediction(prediction)


 class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
@ -175,10 +175,10 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
    def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
        return build_dataset(
            dataset_func=dataset_func,
-            conversion_func=Review.to_tensor_tuple,
+            conversion_func=Review.to_tensor_tuple_category,
            output_signature=(
                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
-                tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="review_one_hot"),
+                tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="category_one_hot"),
            ),
        )

@ -218,7 +218,53 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
        return result


+class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
+    """
+    A `tensorflow`-based sentiment analyzer that uses the floating point value rating to get as close as possible to the correct category.
+
+    The model is trained as a regression on the normalized rating (``category / 5``, see `Review.to_tensor_normvalue`),
+    and its single output is scaled back by 5 before being returned as a category.
+    """
+
+    def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
+        """
+        Build a dataset of ``(text, normalized category)`` pairs from the given dataset function.
+
+        Each element is produced by `Review.to_tensor_tuple_normvalue`.
+        """
+        return build_dataset(
+            dataset_func=dataset_func,
+            conversion_func=Review.to_tensor_tuple_normvalue,
+            output_signature=(
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
+                tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="category"),
+            ),
+        )
+
+    def _build_model(self) -> tensorflow.keras.Sequential:
+        """
+        Create and compile the model: embedding, dropout, global average pooling, dropout, and a single linear output unit.
+
+        The model is compiled with the Adam optimizer (global gradient norm clipped to 1.0) and a mean squared error loss.
+        """
+        log.debug("Creating sequential categorizer model...")
+        model = tensorflow.keras.Sequential([
+            tensorflow.keras.layers.Embedding(
+                input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
+                output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
+            ),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.GlobalAveragePooling1D(),
+            tensorflow.keras.layers.Dropout(0.25),
+            # Single unit without activation: the output is the predicted normalized rating.
+            tensorflow.keras.layers.Dense(1),
+        ])
+
+        log.debug("Compiling model: %s", model)
+        model.compile(
+            optimizer=tensorflow.keras.optimizers.Adam(global_clipnorm=1.0),
+            loss=tensorflow.keras.losses.MeanSquaredError(),
+            metrics=[
+                tensorflow.keras.metrics.MeanAbsoluteError(),
+                # NOTE(review): with a single output unit this metric presumably only reflects the sign agreement
+                # between prediction and target - confirm it is still useful here.
+                tensorflow.keras.metrics.CosineSimilarity(),
+            ]
+        )
+
+        log.debug("Compiled model: %s", model)
+        return model
+
+    def _translate_prediction(self, a: numpy.array) -> Category:
+        """
+        Convert the raw model output into a category on the same scale as `Review.category`.
+
+        :param a: The array returned by the model prediction; only the element at ``[0, 0]`` is read.
+        :return: The predicted category as a `float`.
+        """
+        # The training targets are `category / 5`, so the output has to be multiplied by 5
+        # to be comparable with the unnormalized `review.category` used during evaluation.
+        return float(a[0, 0]) * 5
+
+
 __all__ = (
    "TensorflowSentimentAnalyzer",
    "TensorflowCategorySentimentAnalyzer",
+    "TensorflowPolarSentimentAnalyzer",
 )
--- a/unimore_bda_6/database/datatypes.py
+++ b/unimore_bda_6/database/datatypes.py
@ -40,6 +40,15 @@ class Review:
    def to_tensor_text(self) -> tensorflow.Tensor:
        return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)

+    def to_tensor_normvalue(self) -> tensorflow.Tensor:
+        """
+        Convert the category of this review into a one-element `tensorflow.float32` tensor containing ``category / 5``.
+
+        NOTE(review): presumably categories range from 1.0 to 5.0, which would make the result range from 0.2 to 1.0 - confirm.
+        Consumers have to multiply predictions by 5 to get back to the category scale.
+        """
+        return tensorflow.convert_to_tensor([self.category / 5], dtype=tensorflow.float32)
+
+    def to_tensor_tuple_normvalue(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
+        """
+        Convert this review into a ``(text, normalized category)`` tuple of tensors.
+
+        The first element comes from `to_tensor_text`, the second from `to_tensor_normvalue`.
+        """
+        return (
+            self.to_tensor_text(),
+            self.to_tensor_normvalue(),
+        )
+
    def to_tensor_category(self) -> tensorflow.Tensor:
        return tensorflow.convert_to_tensor([[
            1.0 if self.category == 1.0 else 0.0,
@ -49,13 +58,11 @@ class Review:
            1.0 if self.category == 5.0 else 0.0,
        ]], dtype=tensorflow.float32)

-    def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
-        t = (
+    def to_tensor_tuple_category(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
+        return (
            self.to_tensor_text(),
            self.to_tensor_category(),
        )
-        log.debug("Converted %s", t)
-        return t


 __all__ = (