From 4f40aa44b4af17d138dc82961d850e1516e90502 Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi <me@steffo.eu>
Date: Fri, 10 Feb 2023 05:18:24 +0100
Subject: [PATCH] Fix leftover bugs in tokenizer calls and TensorSpec shapes

---
 unimore_bda_6/__main__.py                | 6 +++---
 unimore_bda_6/analysis/nltk_sentiment.py | 2 +-
 unimore_bda_6/analysis/tf_text.py        | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index 7d3b7f0..5111065 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -39,19 +39,19 @@ def main():
             slog.debug("Selected sample_func: %s", sample_func.__name__)
 
             for SentimentAnalyzer in [
-                NLTKSentimentAnalyzer,
                 TensorflowCategorySentimentAnalyzer,
+                NLTKSentimentAnalyzer,
             ]:
 
                 slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
                 slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
 
                 for Tokenizer in [
-                    PottsTokenizer,
-                    PottsTokenizerWithNegation,
                     PlainTokenizer,
                     LowercaseTokenizer,
                     NLTKWordTokenizer,
+                    PottsTokenizer,
+                    PottsTokenizerWithNegation,
                 ]:
 
                     slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
diff --git a/unimore_bda_6/analysis/nltk_sentiment.py b/unimore_bda_6/analysis/nltk_sentiment.py
index f3d4b86..4c063b7 100644
--- a/unimore_bda_6/analysis/nltk_sentiment.py
+++ b/unimore_bda_6/analysis/nltk_sentiment.py
@@ -40,7 +40,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
         Convert the `Text` of a `DataTuple` to a `TokenBag`.
         """
         count_passage(log, "tokenize_datatuple", 100)
-        return self.tokenizer.tokenize_plain(datatuple.text), datatuple.category
+        return self.tokenizer.tokenize_and_split_plain(datatuple.text), datatuple.category
 
     def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
         """
diff --git a/unimore_bda_6/analysis/tf_text.py b/unimore_bda_6/analysis/tf_text.py
index 840bfac..8d3442f 100644
--- a/unimore_bda_6/analysis/tf_text.py
+++ b/unimore_bda_6/analysis/tf_text.py
@@ -71,7 +71,7 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
         """
         log.debug("Creating TextVectorization layer...")
         layer = tensorflow.keras.layers.TextVectorization(
-            standardize=self.tokenizer.tokenize_tensorflow,
+            standardize=self.tokenizer.tokenize_tensorflow_and_expand_dims,
             max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__
         )
         log.debug("Created TextVectorization layer: %s", layer)
@@ -177,8 +177,8 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
             dataset_func=dataset_func,
             conversion_func=Review.to_tensor_tuple,
             output_signature=(
-                tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.string, name="text"),
-                tensorflow.TensorSpec(shape=(5,), dtype=tensorflow.float32, name="review_one_hot"),
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
+                tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="review_one_hot"),
             ),
         )