diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index 6f75361..c59ac1d 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -6,7 +6,7 @@ install_general_log_handlers()
from .config import config
from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
-from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
+from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer, ThreeCheat
from .analysis.base import TrainingFailedError
from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
from .gathering import Caches
@@ -32,25 +32,29 @@ def main():
reviews = reviews_collection(db)
- for sample_func in [sample_reviews_varied, sample_reviews_polar]:
+ for sample_func in [
+ sample_reviews_polar,
+ sample_reviews_varied,
+ ]:
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
slog.debug("Selected sample_func: %s", sample_func.__name__)
for SentimentAnalyzer in [
+ # ThreeCheat,
TensorflowPolarSentimentAnalyzer,
TensorflowCategorySentimentAnalyzer,
- # NLTKSentimentAnalyzer,
+ NLTKSentimentAnalyzer,
]:
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
for Tokenizer in [
- PottsTokenizerWithNegation,
PottsTokenizer,
- HuggingBertTokenizer,
PlainTokenizer,
+ HuggingBertTokenizer,
+ PottsTokenizerWithNegation,
LowercaseTokenizer,
NLTKWordTokenizer,
]:
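
For orientation, a minimal sketch of what one iteration of this grid does, assuming only the public names from this diff; the tiny in-memory dataset function is a stand-in for the cached dataset functions built by `.gathering.Caches`:

    from unimore_bda_6.analysis import TensorflowPolarSentimentAnalyzer
    from unimore_bda_6.database import TextReview
    from unimore_bda_6.tokenizer import PlainTokenizer

    def tiny_dataset_func():
        # Stand-in for the CachedDatasetFunc callables used by train/evaluate
        yield TextReview(rating=5.0, text="Amazing film, would watch again!")
        yield TextReview(rating=1.0, text="Horrible film, fell asleep twice.")

    analyzer = TensorflowPolarSentimentAnalyzer(tokenizer=PlainTokenizer())
    analyzer.train(tiny_dataset_func, tiny_dataset_func)
    print(analyzer.use("A beautiful film!"))
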
diff --git a/unimore_bda_6/analysis/__init__.py b/unimore_bda_6/analysis/__init__.py
index 0ecbd3e..8472678 100644
--- a/unimore_bda_6/analysis/__init__.py
+++ b/unimore_bda_6/analysis/__init__.py
@@ -5,3 +5,4 @@ This module contains all implemented types of sentiment analyzers.
from .base import *
from .nltk_sentiment import *
from .tf_text import *
+from .cheating import *
diff --git a/unimore_bda_6/analysis/base.py b/unimore_bda_6/analysis/base.py
index db66dcc..666237f 100644
--- a/unimore_bda_6/analysis/base.py
+++ b/unimore_bda_6/analysis/base.py
@@ -4,7 +4,7 @@ import abc
import logging
import dataclasses
-from ..database import Text, Category, CachedDatasetFunc
+from ..database import CachedDatasetFunc, TextReview, TokenizedReview
from ..tokenizer import BaseTokenizer
log = logging.getLogger(__name__)
@@ -15,12 +15,11 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
Abstract base class for sentiment analyzers implemented in this project.
"""
- # noinspection PyUnusedLocal
def __init__(self, *, tokenizer: BaseTokenizer):
- pass
+ self.tokenizer: BaseTokenizer = tokenizer
def __repr__(self):
- return f"<{self.__class__.__qualname__}>"
+ return f"<{self.__class__.__qualname__} with {self.tokenizer} tokenizer>"
@abc.abstractmethod
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
@@ -30,34 +29,34 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
raise NotImplementedError()
@abc.abstractmethod
- def use(self, text: Text) -> Category:
+ def use(self, text: str) -> float:
"""
- Run the model on the given input.
+ Run the model on the given input, and return the predicted rating.
"""
raise NotImplementedError()
def evaluate(self, evaluation_dataset_func: CachedDatasetFunc) -> EvaluationResults:
"""
Evaluate the model by repeatedly calling `.use` on every text of the evaluation dataset and comparing the predicted rating with the expected one.
-
- Returns a tuple with the number of correct results and the number of evaluated results.
"""
evaluated: int = 0
- correct: int = 0
- score: float = 0.0
+
+ perfect: int = 0
+
+ squared_error: float = 0.0
for review in evaluation_dataset_func():
resulting_category = self.use(review.text)
- log.debug("Evaluation step: expected %d, received %d, review was %s", review.category, resulting_category, review.text[:80])
+ log.debug("Evaluation step: %d for %s", resulting_category, review)
evaluated += 1
try:
- correct += 1 if resulting_category == review.category else 0
- score += 1 - (abs(resulting_category - review.category) / 4)
+ perfect += 1 if resulting_category == review.rating else 0
+ squared_error += (resulting_category - review.rating) ** 2
except ValueError:
log.warning("Model execution on %s resulted in a NaN value: %s", review, resulting_category)
- return EvaluationResults(correct=correct, evaluated=evaluated, score=score)
+ return EvaluationResults(perfect=perfect, evaluated=evaluated, mse=squared_error / evaluated)
@dataclasses.dataclass
@@ -66,15 +65,26 @@ class EvaluationResults:
Container for the results of a dataset evaluation.
"""
- correct: int
evaluated: int
- score: float
+ """
+ The number of reviews that were evaluated.
+ """
+
+ perfect: int
+ """
+ The number of reviews for which the model returned the correct rating.
+ """
+
+ mse: float
+ """
+ The mean squared error between the predicted and the expected ratings.
+ """
def __repr__(self):
return f""
def __str__(self):
- return f"{self.evaluated} evaluated, {self.correct} correct, {self.correct / self.evaluated:.2%} accuracy, {self.score:.2f} score, {self.score / self.evaluated:.2%} scoreaccuracy"
+ return f"Evaluation results:\t{self.evaluated}\tevaluated\t{self.perfect}\tperfect\t{self.perfect / self.evaluated:.2%}\taccuracy\t{self.mse / self.evaluated:.2%}\tmean squared error"
class AlreadyTrainedError(Exception):
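
As a quick sanity check on the two metrics `evaluate` now reports (exact-match accuracy and mean squared error), a standalone toy computation with made-up numbers:

    predictions = [3.0, 4.0, 1.0, 5.0]
    expected = [3.0, 5.0, 2.0, 5.0]

    evaluated = len(expected)
    # Accuracy counts exact matches; MSE penalizes squared distance
    perfect = sum(1 for p, e in zip(predictions, expected) if p == e)
    mse = sum((p - e) ** 2 for p, e in zip(predictions, expected)) / evaluated

    print(f"{evaluated} evaluated, {perfect} perfect, {perfect / evaluated:.2%} accuracy, {mse:.2f} MSE")
    # 4 evaluated, 2 perfect, 50.00% accuracy, 0.50 MSE
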
diff --git a/unimore_bda_6/analysis/cheating.py b/unimore_bda_6/analysis/cheating.py
new file mode 100644
index 0000000..16f7498
--- /dev/null
+++ b/unimore_bda_6/analysis/cheating.py
@@ -0,0 +1,21 @@
+from .base import BaseSentimentAnalyzer
+from ..database.cache import CachedDatasetFunc
+
+
+class ThreeCheat(BaseSentimentAnalyzer):
+ """
+ A sentiment analyzer that always predicts a 3.0* rating.
+
+ Why? To test the scoring!
+ """
+
+ def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
+ pass
+
+ def use(self, text: str) -> float:
+ return 3.0
+
+
+__all__ = (
+ "ThreeCheat",
+)
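
A note on why this baseline is useful: on a perfectly balanced rating distribution, such as the one `sample_reviews_varied` produces, the constant predictor's MSE can be computed by hand, giving a floor that real models must beat:

    ratings = [1.0, 2.0, 3.0, 4.0, 5.0]
    # Squared errors of a constant 3.0* prediction: 4, 1, 0, 1, 4
    mse = sum((3.0 - rating) ** 2 for rating in ratings) / len(ratings)
    print(mse)  # (4 + 1 + 0 + 1 + 4) / 5 = 2.0
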
diff --git a/unimore_bda_6/analysis/nltk_sentiment.py b/unimore_bda_6/analysis/nltk_sentiment.py
index 2f33b1b..e90e9c3 100644
--- a/unimore_bda_6/analysis/nltk_sentiment.py
+++ b/unimore_bda_6/analysis/nltk_sentiment.py
@@ -6,7 +6,7 @@ import logging
import typing as t
import itertools
-from ..database import Text, Category, Review, CachedDatasetFunc
+from ..database import TextReview, CachedDatasetFunc, TokenizedReview
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
from ..log import count_passage
from ..tokenizer import BaseTokenizer
@@ -23,31 +23,17 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
"""
def __init__(self, *, tokenizer: BaseTokenizer) -> None:
- if not tokenizer.supports_plain():
- raise TypeError("Tokenizer does not support NLTK")
-
super().__init__(tokenizer=tokenizer)
self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
self.trained: bool = False
- self.tokenizer: BaseTokenizer = tokenizer
- def __repr__(self):
- return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
-
- def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
- """
- Convert the `Text` of a `DataTuple` to a `TokenBag`.
- """
- count_passage(log, "tokenize_datatuple", 100)
- return self.tokenizer.tokenize_and_split_plain(datatuple.text), datatuple.category
-
- def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
+ def _add_feature_unigrams(self, dataset: t.Iterator[TokenizedReview]) -> None:
"""
Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model.
"""
# Ignore the rating and only access the tokens
- tokenbags = map(lambda d: d[0], dataset)
+ tokenbags = map(lambda r: r.tokens, dataset)
# Get all words in the documents
all_words = self.model.all_words(tokenbags, labeled=False)
# Create unigram `contains(*)` features from the previously gathered words
@@ -55,59 +41,48 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
# Add the feature extractor to the model
self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
- def _add_feature_extractors(self, dataset: t.Iterator[tuple[TokenBag, Category]]):
+ def _add_feature_extractors(self, dataset: t.Iterator[TextReview]):
"""
Register new feature extractors on the `.model`.
"""
+ # Tokenize the reviews
+ dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
# Add the unigrams feature
self._add_feature_unigrams(dataset)
- def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
+ def __extract_features(self, review: TextReview) -> tuple[Features, float]:
"""
Convert a `TextReview` into a `(Features, float)` tuple.
Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
"""
- count_passage(log, "extract_features", 100)
- return self.model.extract_features(data[0]), data[1]
+ review: TokenizedReview = self.tokenizer.tokenize_review(review)
+ return self.model.extract_features(review.tokens), review.rating
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
# Forbid retraining the model
if self.trained:
raise AlreadyTrainedError()
- # Get a generator
- dataset: t.Generator[Review] = training_dataset_func()
-
- # Tokenize the dataset
- dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, dataset)
-
- # Cleanly duplicate the dataset iterator
- # Reduce average memory footprint, but not maximum
- dataset_1, dataset_2 = itertools.tee(dataset, 2)
- dataset_1: t.Iterator[tuple[TokenBag, Category]]
- dataset_2: t.Iterator[tuple[TokenBag, Category]]
-
# Add the feature extractors to the model
- self._add_feature_extractors(dataset_1)
- del dataset_1 # Delete exausted iterator
+ self._add_feature_extractors(training_dataset_func())
# Extract features from the dataset
- dataset_2: t.Iterator[tuple[Features, Category]] = map(self.__extract_features, dataset_2)
+ featureset: t.Iterator[tuple[Features, float]] = map(self.__extract_features, training_dataset_func())
# Train the classifier with the extracted features and category
- self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
+ self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
# Toggle the trained flag
self.trained = True
- def use(self, text: Text) -> Category:
+ def use(self, text: str) -> float:
# Require the model to be trained
if not self.trained:
raise NotTrainedError()
# Tokenize the input
- tokens = self.tokenizer.tokenize_and_split_plain(text)
+ tokens = self.tokenizer.tokenize(text)
# Run the classification method
return self.model.classify(instance=tokens)
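
For reference, the NLTK pipeline used above, as a self-contained sketch with made-up data: unigram `contains(*)` features feeding a `NaiveBayesClassifier`, with the float ratings used directly as class labels.

    import nltk.classify
    import nltk.sentiment
    import nltk.sentiment.util

    train = [(["great", "film"], 5.0), (["boring", "film"], 1.0)]

    model = nltk.sentiment.SentimentAnalyzer()
    # Gather the vocabulary and register the unigram feature extractor
    all_words = model.all_words([tokens for tokens, _ in train], labeled=False)
    unigrams = model.unigram_word_feats(words=all_words, min_freq=0)
    model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)

    # Extract features and train the classifier, as in train() above
    featureset = [(model.extract_features(tokens), rating) for tokens, rating in train]
    model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
    print(model.classify(instance=["great", "movie"]))  # 5.0
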
diff --git a/unimore_bda_6/analysis/tf_text.py b/unimore_bda_6/analysis/tf_text.py
index d6ff3fb..5b047b7 100644
--- a/unimore_bda_6/analysis/tf_text.py
+++ b/unimore_bda_6/analysis/tf_text.py
@@ -5,7 +5,7 @@ import numpy
import tensorflow
import logging
-from ..database import Text, Category, CachedDatasetFunc, Review
+from ..database import CachedDatasetFunc, TextReview, TokenizedReview
from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS
from ..tokenizer import BaseTokenizer
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError
@@ -19,31 +19,7 @@ else:
log.debug("Tensorflow successfully found GPU acceleration!")
-ConversionFunc = t.Callable[[Review], tensorflow.Tensor | tuple]
-
-
-def build_dataset(dataset_func: CachedDatasetFunc, conversion_func: ConversionFunc, output_signature: tensorflow.TensorSpec | tuple) -> tensorflow.data.Dataset:
- """
- Convert a `CachedDatasetFunc` to a `tensorflow.data.Dataset`.
- """
-
- def dataset_generator():
- for review in dataset_func():
- yield conversion_func(review)
-
- log.debug("Creating dataset...")
- dataset = tensorflow.data.Dataset.from_generator(
- dataset_generator,
- output_signature=output_signature,
- )
-
- log.debug("Caching dataset...")
- dataset = dataset.cache()
-
- log.debug("Configuring dataset prefetch...")
- dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)
-
- return dataset
+ConversionFunc = t.Callable[[TextReview], tensorflow.Tensor | tuple]
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
@@ -52,31 +28,15 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
"""
def __init__(self, *, tokenizer: BaseTokenizer):
- if not tokenizer.supports_tensorflow():
- raise TypeError("Tokenizer does not support Tensorflow")
-
super().__init__(tokenizer=tokenizer)
self.trained: bool = False
self.failed: bool = False
- self.tokenizer: BaseTokenizer = tokenizer
- self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_text_vectorization_layer()
+ self.string_lookup_layer: tensorflow.keras.layers.StringLookup = tensorflow.keras.layers.StringLookup(max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__)
self.model: tensorflow.keras.Sequential = self._build_model()
self.history: tensorflow.keras.callbacks.History | None = None
- def _build_text_vectorization_layer(self) -> tensorflow.keras.layers.TextVectorization:
- """
- Create a `tensorflow`-compatible `TextVectorization` layer.
- """
- log.debug("Creating TextVectorization layer...")
- layer = tensorflow.keras.layers.TextVectorization(
- standardize=self.tokenizer.tokenize_tensorflow_and_expand_dims,
- max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__
- )
- log.debug("Created TextVectorization layer: %s", layer)
- return layer
-
@abc.abstractmethod
def _build_model(self) -> tensorflow.keras.Sequential:
"""
@@ -84,33 +44,44 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
"""
raise NotImplementedError()
- @abc.abstractmethod
def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
"""
Create a `tensorflow.data.Dataset` from the given `CachedDatasetFunc`.
"""
- raise NotImplementedError()
+
+ def dataset_generator():
+ for review in dataset_func():
+ review: TextReview
+ review: TokenizedReview = self.tokenizer.tokenize_review(review)
+ tokens: tensorflow.Tensor = self._tokens_to_tensor(review.tokens)
+ rating: tensorflow.Tensor = self._rating_to_input(review.rating)
+ yield tokens, rating
+
+ log.debug("Creating dataset...")
+ dataset = tensorflow.data.Dataset.from_generator(
+ dataset_generator,
+ output_signature=(
+ tensorflow.TensorSpec(shape=(1, None,), dtype=tensorflow.string, name="tokens"),
+ self._ratingtensor_shape(),
+ ),
+ )
+
+ log.debug("Caching dataset...")
+ dataset = dataset.cache()
+
+ log.debug("Configuring dataset prefetch...")
+ dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)
+
+ return dataset
def _adapt_textvectorization(self, dataset: tensorflow.data.Dataset) -> None:
"""
Adapt the `.text_vectorization_layer` to the given dataset.
"""
- log.debug("Preparing dataset to adapt %s...", self.text_vectorization_layer)
+ log.debug("Preparing dataset to adapt %s...", self.string_lookup_layer)
dataset = dataset.map(lambda text, category: text)
- log.debug("Adapting %s...", self.text_vectorization_layer)
- self.text_vectorization_layer.adapt(dataset)
-
- def _vectorize_dataset(self, dataset: tensorflow.data.Dataset) -> tensorflow.data.Dataset:
- """
- Apply the `.text_vectorization_layer` to the text in the dataset.
- """
- def vectorize_entry(text, category):
- return self.text_vectorization_layer(text), category
-
- log.debug("Vectorizing dataset: %s", dataset)
- dataset = dataset.map(vectorize_entry)
- log.debug("Vectorized dataset: %s", dataset)
- return dataset
+ log.debug("Adapting %s...", self.string_lookup_layer)
+ self.string_lookup_layer.adapt(dataset)
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
if self.failed:
@@ -120,13 +91,17 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
log.error("Tried to train an already trained model.")
raise AlreadyTrainedError("Cannot re-train an already trained model.")
+ log.debug("Building training dataset...")
training_set = self._build_dataset(training_dataset_func)
+
+ log.debug("Building validation dataset...")
validation_set = self._build_dataset(validation_dataset_func)
- self._adapt_textvectorization(training_set)
+ log.debug("Building vocabulary...")
+ vocabulary = training_set.map(lambda tokens, rating: tokens)
- training_set = self._vectorize_dataset(training_set)
- validation_set = self._vectorize_dataset(validation_set)
+ log.debug("Adapting lookup layer to the vocabulary...")
+ self.string_lookup_layer.adapt(vocabulary)
log.info("Training: %s", self.model)
self.history: tensorflow.keras.callbacks.History | None = self.model.fit(
@@ -146,25 +121,50 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
log.info("Model %s training succeeded!", self.model)
self.trained = True
- @abc.abstractmethod
- def _translate_prediction(self, a: numpy.array) -> Category:
+ @staticmethod
+ def _tokens_to_tensor(tokens: t.Iterator[str]) -> tensorflow.Tensor:
"""
- Convert the results of `tensorflow.keras.Sequential.predict` into a `.Category`.
+ Convert an iterator of tokens to a `tensorflow.Tensor`.
+ """
+ tensor = tensorflow.convert_to_tensor(
+ [list(tokens)],
+ dtype=tensorflow.string,
+ name="tokens"
+ )
+ return tensor
+
+ def use(self, text: str) -> float:
+ if self.failed:
+ raise NotTrainedError("Cannot use a failed model.")
+ if not self.trained:
+ raise NotTrainedError("Cannot use a non-trained model.")
+
+ tokens = self.tokenizer.tokenize(text)
+ tokens = self._tokens_to_tensor(tokens)
+ prediction = self.model.predict(tokens, verbose=False)
+ prediction = self._prediction_to_rating(prediction)
+ return prediction
+
+ @abc.abstractmethod
+ def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
+ """
+ Convert a review rating to a `tensorflow.Tensor`.
"""
raise NotImplementedError()
- def use(self, text: Text) -> Category:
- if self.failed:
- log.error("Tried to use a failed model.")
- raise NotTrainedError("Cannot use a failed model.")
- if not self.trained:
- log.error("Tried to use a non-trained model.")
- raise NotTrainedError("Cannot use a non-trained model.")
+ @abc.abstractmethod
+ def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
+ """
+ Return the `tensorflow.TensorSpec` of the rating tensors produced by `._rating_to_input` and expected by the model.
+ """
+ raise NotImplementedError()
- vector = self.text_vectorization_layer(text)
- prediction = self.model.predict(vector, verbose=False)
-
- return self._translate_prediction(prediction)
+ @abc.abstractmethod
+ def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
+ """
+ Convert the results of `tensorflow.keras.Sequential.predict` into a review rating.
+ """
+ raise NotImplementedError()
class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
@@ -172,19 +172,10 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
A `tensorflow`-based sentiment analyzer that considers each star rating as a separate category.
"""
- def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
- return build_dataset(
- dataset_func=dataset_func,
- conversion_func=Review.to_tensor_tuple_category,
- output_signature=(
- tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
- tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="category_one_hot"),
- ),
- )
-
def _build_model(self) -> tensorflow.keras.Sequential:
log.debug("Creating sequential categorizer model...")
model = tensorflow.keras.Sequential([
+ self.string_lookup_layer,
tensorflow.keras.layers.Embedding(
input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
@@ -209,15 +200,35 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
log.debug("Compiled model: %s", model)
return model
- def _translate_prediction(self, a: numpy.array) -> Category:
- max_i = None
- max_p = None
- for i, p in enumerate(iter(a[0])):
- if max_p is None or p > max_p:
- max_i = i
- max_p = p
- result = float(max_i) + 1.0
- return float(round(result))
+ def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
+ tensor = tensorflow.convert_to_tensor(
+ [[
+ 1.0 if rating == 1.0 else 0.0,
+ 1.0 if rating == 2.0 else 0.0,
+ 1.0 if rating == 3.0 else 0.0,
+ 1.0 if rating == 4.0 else 0.0,
+ 1.0 if rating == 5.0 else 0.0,
+ ]],
+ dtype=tensorflow.float32,
+ name="rating_one_hot"
+ )
+ return tensor
+
+ def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
+ spec = tensorflow.TensorSpec(shape=(1, 5), dtype=tensorflow.float32, name="rating_one_hot")
+ return spec
+
+ def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
+ best_prediction = None
+ best_prediction_index = None
+
+ # Hand-rolled argmax over the five outputs; see the numpy sketch after this class
+ for index, value in enumerate(prediction[0]):
+ if best_prediction is None or value > best_prediction:
+ best_prediction = value
+ best_prediction_index = index
+
+ result = float(best_prediction_index) + 1.0
+ return result
class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
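
The `_prediction_to_rating` loop above is a hand-rolled argmax over the five one-hot outputs; an equivalent sketch with `numpy`, using a made-up prediction:

    import numpy

    # Predicted rating = index of the highest one-hot output, plus one
    prediction = numpy.array([[0.1, 0.2, 0.1, 0.5, 0.1]])
    rating = float(numpy.argmax(prediction[0])) + 1.0
    print(rating)  # 4.0
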
@@ -225,19 +236,10 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
A `tensorflow`-based sentiment analyzer that uses the floating point value rating to get as close as possible to the correct category.
"""
- def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
- return build_dataset(
- dataset_func=dataset_func,
- conversion_func=Review.to_tensor_tuple_normvalue,
- output_signature=(
- tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
- tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="category"),
- ),
- )
-
def _build_model(self) -> tensorflow.keras.Sequential:
log.debug("Creating sequential categorizer model...")
model = tensorflow.keras.Sequential([
+ self.string_lookup_layer,
tensorflow.keras.layers.Embedding(
input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
@@ -245,7 +247,9 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
tensorflow.keras.layers.Dropout(0.25),
tensorflow.keras.layers.GlobalAveragePooling1D(),
tensorflow.keras.layers.Dropout(0.25),
- tensorflow.keras.layers.Dense(1, activation="sigmoid"),
+ tensorflow.keras.layers.Dense(8),
+ tensorflow.keras.layers.Dropout(0.25),
+ tensorflow.keras.layers.Dense(1, activation=tensorflow.keras.activations.sigmoid),
])
log.debug("Compiling model: %s", model)
@@ -257,11 +261,23 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
log.debug("Compiled model: %s", model)
return model
- def _translate_prediction(self, a: numpy.array) -> Category:
- a: float = a[0, 0]
- a = a * 2 + 1
- a = float(round(a))
- return a
+ def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
+ normalized_rating = (rating - 1) / 4
+ tensor = tensorflow.convert_to_tensor(
+ [normalized_rating],
+ dtype=tensorflow.float32,
+ name="rating_value"
+ )
+ return tensor
+
+ def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
+ spec = tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="rating_value")
+ return spec
+
+ def _prediction_to_rating(self, prediction: numpy.ndarray) -> float:
+ rating: float = prediction[0, 0]
+ rating = 1.0 if rating < 0.5 else 5.0
+ return rating
__all__ = (
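
To summarize the two rating encodings introduced above, a plain-Python sketch (no new names assumed): the category analyzer one-hot encodes the five star values, while the polar analyzer rescales 1..5 into 0..1 for its sigmoid output.

    def one_hot(rating: float) -> list[float]:
        # Encoding used by TensorflowCategorySentimentAnalyzer._rating_to_input
        return [1.0 if rating == star else 0.0 for star in (1.0, 2.0, 3.0, 4.0, 5.0)]

    def normalize(rating: float) -> float:
        # Encoding used by TensorflowPolarSentimentAnalyzer._rating_to_input
        return (rating - 1) / 4

    print(one_hot(4.0))    # [0.0, 0.0, 0.0, 1.0, 0.0]
    print(normalize(4.0))  # 0.75

Note that the polar `_prediction_to_rating` snaps the sigmoid output to 1.0 or 5.0 at the 0.5 threshold, rather than inverting the normalization with `rating * 4 + 1`.
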
diff --git a/unimore_bda_6/database/cache.py b/unimore_bda_6/database/cache.py
index e6a3d45..4a18153 100644
--- a/unimore_bda_6/database/cache.py
+++ b/unimore_bda_6/database/cache.py
@@ -4,15 +4,15 @@ import shutil
import pathlib
import pickle
-from .datatypes import Review
+from .datatypes import TextReview
log = logging.getLogger(__name__)
-CachedDatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]]
+CachedDatasetFunc = t.Callable[[], t.Generator[TextReview, t.Any, None]]
-def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
+def store_cache(reviews: t.Iterator[TextReview], path: str | pathlib.Path) -> None:
"""
Store the contents of the given `TextReview` iterator to different files in a directory at the given path.
"""
@@ -54,7 +54,7 @@ def load_cache(path: str | pathlib.Path) -> CachedDatasetFunc:
log.debug("Loading pickle file: %s", document_path)
with open(document_path, "rb") as file:
- result: Review = pickle.load(file)
+ result: TextReview = pickle.load(file)
yield result
return data_cache_loader
diff --git a/unimore_bda_6/database/datatypes.py b/unimore_bda_6/database/datatypes.py
index 5e34047..922087e 100644
--- a/unimore_bda_6/database/datatypes.py
+++ b/unimore_bda_6/database/datatypes.py
@@ -1,75 +1,80 @@
-import tensorflow
+import abc
+import typing as t
+
from .collections import MongoReview
-import logging
-
-log = logging.getLogger(__name__)
-Text = str
-Category = float
+class Review(metaclass=abc.ABCMeta):
+ """
+ Base class for state and methods common to both review types.
+ """
+
+ __slots__ = (
+ "rating",
+ )
+
+ def __init__(self, *, rating: float):
+ self.rating: float = rating
+ """
+ The star rating of the review, from ``1.0`` to ``5.0``.
+ """
-class Review:
+class TextReview(Review):
+ """
+ Optimized container for a review with the text still intact.
+
+ Uses `__slots__` for better performance.
+ """
+
__slots__ = (
"text",
- "category",
+ "rating",
)
- def __init__(self, text: Text, category: Category):
+ def __init__(self, *, rating: float, text: str):
+ super().__init__(rating=rating)
+
self.text: str = text
- self.category: float = category
+ """
+ The contents of the review.
+ """
@classmethod
- def from_mongoreview(cls, review: MongoReview):
+ def from_mongoreview(cls, review: MongoReview) -> "TextReview":
+ """
+ Create a new `TextReview` object from a `MongoReview` `dict`.
+ """
return cls(
text=review["reviewText"],
- category=review["overall"],
+ rating=review["overall"],
)
def __repr__(self):
- return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
+ return f"<{self.__class__.__qualname__}: ({self.rating}*) {self.text[:80]}>"
- def __getitem__(self, item):
- if item == 0 or item == "text":
- return self.text
- elif item == 1 or item == "category":
- return self.category
- else:
- raise KeyError(item)
- def normvalue(self) -> float:
- return (self.category - 1) / 2
+class TokenizedReview(Review):
+ """
+ Optimized container for a review with a tokenized text.
- def to_tensor_text(self) -> tensorflow.Tensor:
- return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
+ Uses `__slots__` for better performance.
+ """
- def to_tensor_normvalue(self) -> tensorflow.Tensor:
- return tensorflow.convert_to_tensor([self.normvalue()], dtype=tensorflow.float32)
+ __slots__ = (
+ "tokens",
+ )
- def to_tensor_tuple_normvalue(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
- return (
- self.to_tensor_text(),
- self.to_tensor_normvalue(),
- )
+ def __init__(self, *, rating: float, tokens: t.Iterator[str]):
+ super().__init__(rating=rating)
- def to_tensor_category(self) -> tensorflow.Tensor:
- return tensorflow.convert_to_tensor([[
- 1.0 if self.category == 1.0 else 0.0,
- 1.0 if self.category == 2.0 else 0.0,
- 1.0 if self.category == 3.0 else 0.0,
- 1.0 if self.category == 4.0 else 0.0,
- 1.0 if self.category == 5.0 else 0.0,
- ]], dtype=tensorflow.float32)
+ self.tokens: list[str] = list(tokens)
+ """
+ List of all tokens in the review text.
+ """
- def to_tensor_tuple_category(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
- return (
- self.to_tensor_text(),
- self.to_tensor_category(),
- )
+ def __repr__(self):
+ return f"<{self.__class__.__qualname__}: ({self.rating}*) [{len(self.tokens)} tokens]>"
__all__ = (
- "Text",
- "Category",
- "Review",
+ "TextReview",
+ "TokenizedReview",
)
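
A round trip between the two new review types, assuming only the classes from this diff; note that `TokenizedReview` materializes the token iterator into a list:

    from unimore_bda_6.database.datatypes import TextReview, TokenizedReview

    review = TextReview(rating=5.0, text="Great film, would watch again!")
    tokenized = TokenizedReview(rating=review.rating, tokens=iter(review.text.split()))
    print(repr(tokenized))  # <TokenizedReview: (5.0*) [5 tokens]>
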
diff --git a/unimore_bda_6/database/queries.py b/unimore_bda_6/database/queries.py
index 1d0801c..530b96d 100644
--- a/unimore_bda_6/database/queries.py
+++ b/unimore_bda_6/database/queries.py
@@ -4,15 +4,15 @@ import typing as t
from ..config import WORKING_SET_SIZE
from .collections import MongoReview
-from .datatypes import Review
+from .datatypes import TextReview
log = logging.getLogger(__name__)
-SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[Review]]
+SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[TextReview]]
-def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
"""
Get ``amount`` random reviews from the ``reviews`` collection.
"""
@@ -23,12 +23,12 @@ def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.
{"$sample": {"size": amount}},
])
- cursor = map(Review.from_mongoreview, cursor)
+ cursor = map(TextReview.from_mongoreview, cursor)
return cursor
-def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
+def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[TextReview]:
"""
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
"""
@@ -43,7 +43,7 @@ def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating:
return cursor
-def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
category_amount = amount // 2
log.debug("Getting a sample of %d polar reviews...", category_amount * 2)
@@ -68,12 +68,12 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
}}
])
- cursor = map(Review.from_mongoreview, cursor)
+ cursor = map(TextReview.from_mongoreview, cursor)
return cursor
-def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
category_amount = amount // 5
log.debug("Getting a sample of %d varied reviews...", category_amount * 5)
@@ -123,7 +123,7 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
}}
])
- cursor = map(Review.from_mongoreview, cursor)
+ cursor = map(TextReview.from_mongoreview, cursor)
return cursor
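
For context, rating-filtered sampling presumably pairs a `$match` stage with the `$sample` stage shown above; a hedged standalone sketch (the connection URI and collection names are assumptions, and the real pipeline body is not shown in this hunk):

    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017")  # assumed URI
    collection = client["reviews"]["reviews"]  # assumed database/collection names

    # Filter to 5* reviews, then draw a random sample of 10
    cursor = collection.aggregate([
        {"$match": {"overall": 5.0}},
        {"$sample": {"size": 10}},
    ])
    for document in cursor:
        print(document["overall"], document["reviewText"][:80])
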
diff --git a/unimore_bda_6/gathering.py b/unimore_bda_6/gathering.py
index 81951a4..4706804 100644
--- a/unimore_bda_6/gathering.py
+++ b/unimore_bda_6/gathering.py
@@ -5,7 +5,7 @@ import logging
import pymongo
from .config import TRAINING_SET_SIZE, VALIDATION_SET_SIZE, EVALUATION_SET_SIZE
-from .database import SampleFunc, CachedDatasetFunc, mongo_client_from_config, reviews_collection, store_cache, load_cache, delete_cache
+from .database import SampleFunc, CachedDatasetFunc, store_cache, load_cache, delete_cache
log = logging.getLogger(__name__)
diff --git a/unimore_bda_6/tokenizer/base.py b/unimore_bda_6/tokenizer/base.py
index 01e8d41..4608255 100644
--- a/unimore_bda_6/tokenizer/base.py
+++ b/unimore_bda_6/tokenizer/base.py
@@ -1,51 +1,26 @@
-import tensorflow
+import typing as t
+import abc
+from ..database.datatypes import TextReview, TokenizedReview
-class BaseTokenizer:
+class BaseTokenizer(metaclass=abc.ABCMeta):
"""
The base for all tokenizers in this project.
"""
def __repr__(self):
- return f"{self.__class__.__qualname__}()"
+ return f"<{self.__class__.__qualname__}>"
- @staticmethod
- def __not_implemented(f):
- f.__notimplemented__ = True
- return f
-
- def supports_plain(self) -> bool:
- return not getattr(self.tokenize_plain, "__notimplemented__", False)
-
- def supports_tensorflow(self) -> bool:
- return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)
-
- @__not_implemented
- def tokenize_plain(self, text: str) -> str:
+ @abc.abstractmethod
+ def tokenize(self, text: str) -> t.Iterator[str]:
"""
Convert a text `str` into an iterator of token `str`s.
"""
raise NotImplementedError()
- def tokenize_and_split_plain(self, text: str) -> list[str]:
+ def tokenize_review(self, review: TextReview) -> TokenizedReview:
"""
- Run `.tokenize_plain`, then split the result using `str.split`.
+ Apply `.tokenize` to the text of a `TextReview`, converting it into a `TokenizedReview`.
"""
- text = self.tokenize_plain(text)
- text = text.split()
- return text
-
- @__not_implemented
- def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
- """
- Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
- """
- raise NotImplementedError()
-
- def tokenize_tensorflow_and_expand_dims(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
- """
- Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
- """
- text = self.tokenize_tensorflow(text)
- text = tensorflow.expand_dims(text, -1, name="tokens")
- return text
+ tokens = self.tokenize(review.text)
+ return TokenizedReview(rating=review.rating, tokens=tokens)
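
Under the new single-method interface, a concrete tokenizer only needs to implement `tokenize`; `tokenize_review` comes for free. A minimal hypothetical subclass:

    import typing as t
    from unimore_bda_6.tokenizer.base import BaseTokenizer
    from unimore_bda_6.database.datatypes import TextReview

    class WhitespaceLowerTokenizer(BaseTokenizer):  # hypothetical example class
        def tokenize(self, text: str) -> t.Iterator[str]:
            return iter(text.lower().split())

    review = TextReview(rating=4.0, text="Nice CAMERA")
    print(WhitespaceLowerTokenizer().tokenize_review(review).tokens)  # ['nice', 'camera']
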
diff --git a/unimore_bda_6/tokenizer/hugging.py b/unimore_bda_6/tokenizer/hugging.py
index 40cc682..6cc23c8 100644
--- a/unimore_bda_6/tokenizer/hugging.py
+++ b/unimore_bda_6/tokenizer/hugging.py
@@ -1,10 +1,15 @@
import abc
import tokenizers
+import typing as t
from .base import BaseTokenizer
class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
+ """
+ Abstract tokenizer to implement any tokenizer based on HuggingFace `tokenizers.Tokenizer`.
+ """
+
def __init__(self):
super().__init__()
self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()
@@ -12,11 +17,15 @@ class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
raise NotImplementedError()
- def tokenize_plain(self, text: str) -> str:
- return " ".join(self.hug.encode(text).tokens)
+ def tokenize(self, text: str) -> t.Iterator[str]:
+ return self.hug.encode(text).tokens
class HuggingBertTokenizer(HuggingTokenizer):
+ """
+ Tokenizer based on the `bert-base-cased <https://huggingface.co/bert-base-cased>`_ tokenizer.
+ """
+
def _build_hugging_tokenizer(self):
return tokenizers.Tokenizer.from_pretrained("bert-base-cased")
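
What `HuggingBertTokenizer.tokenize` does under the hood, as a standalone snippet (the exact token split shown is illustrative):

    import tokenizers

    # Encode with the pretrained bert-base-cased tokenizer, keep the token strings
    hug = tokenizers.Tokenizer.from_pretrained("bert-base-cased")
    print(hug.encode("I loved this film!").tokens)
    # e.g. ['[CLS]', 'I', 'loved', 'this', 'film', '!', '[SEP]']
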
diff --git a/unimore_bda_6/tokenizer/lower.py b/unimore_bda_6/tokenizer/lower.py
index 94fbdf2..3132f9d 100644
--- a/unimore_bda_6/tokenizer/lower.py
+++ b/unimore_bda_6/tokenizer/lower.py
@@ -1,17 +1,14 @@
-import tensorflow
+import typing as t
from .base import BaseTokenizer
class LowercaseTokenizer(BaseTokenizer):
"""
- Tokenizer which converts the words to lowercase before splitting them via spaces.
+ Tokenizer which converts the words to lowercase before splitting them with `str.split`.
"""
- def tokenize_plain(self, text: str) -> str:
+ def tokenize(self, text: str) -> t.Iterator[str]:
text = text.lower()
- return text
-
- def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
- text = tensorflow.strings.lower(text)
- return text
+ tokens = text.split()
+ return tokens
diff --git a/unimore_bda_6/tokenizer/nltk_word_tokenize.py b/unimore_bda_6/tokenizer/nltk_word_tokenize.py
index f96c8b0..9c909a0 100644
--- a/unimore_bda_6/tokenizer/nltk_word_tokenize.py
+++ b/unimore_bda_6/tokenizer/nltk_word_tokenize.py
@@ -9,7 +9,7 @@ class NLTKWordTokenizer(BaseTokenizer):
Tokenizer based on `nltk.word_tokenize`.
"""
- def tokenize_plain(self, text: str) -> str:
+ def tokenize(self, text: str) -> list[str]:
tokens = nltk.word_tokenize(text)
nltk.sentiment.util.mark_negation(tokens, shallow=True)
return " ".join(tokens)
diff --git a/unimore_bda_6/tokenizer/plain.py b/unimore_bda_6/tokenizer/plain.py
index b771401..a5db51c 100644
--- a/unimore_bda_6/tokenizer/plain.py
+++ b/unimore_bda_6/tokenizer/plain.py
@@ -1,15 +1,13 @@
-import tensorflow
+import typing as t
from .base import BaseTokenizer
class PlainTokenizer(BaseTokenizer):
"""
- Tokenizer which just splits the text into tokens by separating them at whitespaces.
+ Tokenizer which just splits the text into tokens by separating them at whitespaces with `str.split`.
"""
- def tokenize_plain(self, text: str) -> str:
- return text
-
- def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
- return text
+ def tokenize(self, text: str) -> t.Iterator[str]:
+ tokens = text.split()
+ return tokens
diff --git a/unimore_bda_6/tokenizer/potts.py b/unimore_bda_6/tokenizer/potts.py
index bfec472..0473425 100644
--- a/unimore_bda_6/tokenizer/potts.py
+++ b/unimore_bda_6/tokenizer/potts.py
@@ -1,4 +1,3 @@
-import tensorflow
import re
import html.entities
import typing as t
@@ -11,7 +10,7 @@ class PottsTokenizer(BaseTokenizer):
"""
Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.
- This module is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
+ This class is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
"""
# noinspection RegExpRepeatedSpace
@@ -76,7 +75,7 @@ class PottsTokenizer(BaseTokenizer):
amp = "&"
@classmethod
- def __html2string(cls, s: str) -> str:
+ def html_entities_to_chr(cls, s: str) -> str:
"""
Class method that replaces all the HTML entities in ``s`` with their corresponding characters.
"""
@@ -102,24 +101,41 @@ class PottsTokenizer(BaseTokenizer):
s = s.replace(cls.amp, " and ")
return s
- def tokenize_plain(self, text: str) -> str:
+ @classmethod
+ def lower_but_preserve_emoticons(cls, word: str) -> str:
+ """
+ Class method which lowercases the word, unless it matches `.emoticon_re`.
+ """
+ if cls.emoticon_re.search(word):
+ return word
+ else:
+ return word.lower()
+
+ def tokenize(self, text: str) -> t.Iterator[str]:
# Fix HTML character entitites
- s = self.__html2string(text)
+ text = self.html_entities_to_chr(text)
# Tokenize
- words = self.words_re.findall(s)
+ tokens = self.words_re.findall(text)
# Possibly alter the case, but avoid changing emoticons like :D into :d
- words = list(map(lambda x: x if self.emoticon_re.search(x) else x.lower(), words))
- # Re-join words
- result = " ".join(words)
+ tokens = map(self.lower_but_preserve_emoticons, tokens)
# Return the result
- return result
+ return tokens
class PottsTokenizerWithNegation(PottsTokenizer):
- def tokenize_plain(self, text: str) -> str:
- words = super().tokenize_plain(text).split()
+ """
+ Version of `.PottsTokenizer` which after tokenizing applies `nltk.sentiment.util.mark_negation`.
+ """
+
+ def tokenize(self, text: str) -> t.Iterator[str]:
+ # Apply the base tokenization
+ words = super().tokenize(text)
+ # Convert the iterator to a list (sigh), since mark_negation mutates it in place
+ words = list(words)
+ # Use nltk to mark negation
nltk.sentiment.util.mark_negation(words, shallow=True)
- return " ".join(words)
+ # Return the result
+ return words
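
For reference, what `mark_negation` adds on top of the base Potts tokenization; a small standalone example:

    import nltk.sentiment.util

    # Tokens between a negation word and the next clause punctuation get a _NEG suffix
    tokens = ["i", "did", "not", "like", "this", "film", "."]
    nltk.sentiment.util.mark_negation(tokens, shallow=True)
    print(tokens)  # ['i', 'did', 'not', 'like_NEG', 'this_NEG', 'film_NEG', '.']
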
__all__ = (