Mirror of https://github.com/Steffo99/unimore-bda-6.git

CODE IS DONE

This commit is contained in:
Steffo 2023-02-12 05:11:58 +01:00
parent ae2cf563e6
commit 4e8aa68db3
Signed by: steffo
GPG key ID: 2A24051445686895
17 changed files with 331 additions and 304 deletions

View file

@ -5,14 +5,14 @@
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="CONFIRM_OVERWRITE" value="False" />
<env name="EVALUATION_SET_SIZE" value="100" />
<env name="EVALUATION_SET_SIZE" value="4000" />
<env name="NLTK_DATA" value="./data/nltk" />
<env name="PYTHONUNBUFFERED" value="1" />
<env name="TENSORFLOW_EMBEDDING_SIZE" value="64" />
<env name="TENSORFLOW_MAX_FEATURES" value="1000000" />
<env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
<env name="TRAINING_SET_SIZE" value="2000" />
<env name="VALIDATION_SET_SIZE" value="25" />
<env name="TRAINING_SET_SIZE" value="4000" />
<env name="VALIDATION_SET_SIZE" value="100" />
<env name="WORKING_SET_SIZE" value="100000" />
<env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
</envs>

View file

@ -6,7 +6,7 @@ install_general_log_handlers()
from .config import config
from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer, ThreeCheat
from .analysis.base import TrainingFailedError
from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
from .gathering import Caches
@ -32,25 +32,29 @@ def main():
reviews = reviews_collection(db)
for sample_func in [sample_reviews_varied, sample_reviews_polar]:
for sample_func in [
sample_reviews_polar,
sample_reviews_varied,
]:
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
slog.debug("Selected sample_func: %s", sample_func.__name__)
for SentimentAnalyzer in [
# ThreeCheat,
TensorflowPolarSentimentAnalyzer,
TensorflowCategorySentimentAnalyzer,
# NLTKSentimentAnalyzer,
NLTKSentimentAnalyzer,
]:
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
for Tokenizer in [
PottsTokenizerWithNegation,
PottsTokenizer,
HuggingBertTokenizer,
PlainTokenizer,
HuggingBertTokenizer,
PottsTokenizerWithNegation,
LowercaseTokenizer,
NLTKWordTokenizer,
]:

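For a rough sense of scale, the nested loops above enumerate every combination of sample function, analyzer, and tokenizer; a small sketch of the resulting grid (names copied from the lists above, counts only, no training involved):

import itertools

# Run grid implied by the nested loops in __main__ (names only).
sample_funcs = ["sample_reviews_polar", "sample_reviews_varied"]
analyzers = ["TensorflowPolarSentimentAnalyzer", "TensorflowCategorySentimentAnalyzer", "NLTKSentimentAnalyzer"]
tokenizers = [
    "PlainTokenizer", "HuggingBertTokenizer", "PottsTokenizerWithNegation",
    "PottsTokenizer", "LowercaseTokenizer", "NLTKWordTokenizer",
]

runs = list(itertools.product(sample_funcs, analyzers, tokenizers))
print(len(runs))  # 36 combinations per full run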
View file

@ -5,3 +5,4 @@ This module contains all implemented types of sentiment analyzers.
from .base import *
from .nltk_sentiment import *
from .tf_text import *
from .cheating import *

View file

@ -4,7 +4,7 @@ import abc
import logging
import dataclasses
from ..database import Text, Category, CachedDatasetFunc
from ..database import CachedDatasetFunc, TextReview, TokenizedReview
from ..tokenizer import BaseTokenizer
log = logging.getLogger(__name__)
@ -15,12 +15,11 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
Abstract base class for sentiment analyzers implemented in this project.
"""
# noinspection PyUnusedLocal
def __init__(self, *, tokenizer: BaseTokenizer):
pass
self.tokenizer: BaseTokenizer = tokenizer
def __repr__(self):
return f"<{self.__class__.__qualname__}>"
return f"<{self.__class__.__qualname__} with {self.tokenizer} tokenizer>"
@abc.abstractmethod
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
@ -30,34 +29,34 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
raise NotImplementedError()
@abc.abstractmethod
def use(self, text: Text) -> Category:
def use(self, text: str) -> float:
"""
Run the model on the given input.
Run the model on the given input, and return the predicted rating.
"""
raise NotImplementedError()
def evaluate(self, evaluation_dataset_func: CachedDatasetFunc) -> EvaluationResults:
"""
Perform a model evaluation by repeatedly calling `.use` on every text of the evaluation dataset and comparing the predicted rating with the expected one.
Returns an `EvaluationResults` object with the number of perfect predictions and the mean squared error.
"""
evaluated: int = 0
correct: int = 0
score: float = 0.0
perfect: int = 0
squared_error: float = 0.0
for review in evaluation_dataset_func():
resulting_category = self.use(review.text)
log.debug("Evaluation step: expected %d, received %d, review was %s", review.category, resulting_category, review.text[:80])
log.debug("Evaluation step: %d for %s", resulting_category, review)
evaluated += 1
try:
correct += 1 if resulting_category == review.category else 0
score += 1 - (abs(resulting_category - review.category) / 4)
perfect += 1 if resulting_category == review.rating else 0
squared_error += (resulting_category - review.rating) ** 2
except ValueError:
log.warning("Model execution on %s resulted in a NaN value: %s", review, resulting_category)
return EvaluationResults(correct=correct, evaluated=evaluated, score=score)
return EvaluationResults(perfect=perfect, evaluated=evaluated, mse=squared_error / evaluated)
@dataclasses.dataclass
@ -66,15 +65,26 @@ class EvaluationResults:
Container for the results of a dataset evaluation.
"""
correct: int
evaluated: int
score: float
"""
The number of reviews that were evaluated.
"""
perfect: int
"""
The number of reviews for which the model returned the correct rating.
"""
mse: float
"""
Mean squared error between the predicted and the expected ratings.
"""
def __repr__(self):
return f"<EvaluationResults: {self!s}>"
def __str__(self):
return f"{self.evaluated} evaluated, {self.correct} correct, {self.correct / self.evaluated:.2%} accuracy, {self.score:.2f} score, {self.score / self.evaluated:.2%} scoreaccuracy"
return f"Evaluation results:\t{self.evaluated}\tevaluated\t{self.perfect}\tperfect\t{self.perfect / self.evaluated:.2%}\taccuracy\t{self.mse / self.evaluated:.2%}\tmean squared error"
class AlreadyTrainedError(Exception):

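To make the new metrics concrete, here is a minimal sketch of what `evaluate` now accumulates; the ratings below are made up for illustration:

# Made-up expected/predicted ratings, for illustration only.
expected = [5.0, 4.0, 1.0, 3.0]
predicted = [5.0, 3.0, 2.0, 3.0]

perfect = sum(1 for p, e in zip(predicted, expected) if p == e)
mse = sum((p - e) ** 2 for p, e in zip(predicted, expected)) / len(expected)

print(f"{len(expected)} evaluated, {perfect} perfect, mse={mse:.2f}")
# 4 evaluated, 2 perfect, mse=0.50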
View file

@ -0,0 +1,21 @@
from .base import BaseSentimentAnalyzer
from ..database.cache import CachedDatasetFunc
class ThreeCheat(BaseSentimentAnalyzer):
"""
A sentiment analyzer that always predicts a 3.0* rating.
Why? To test the scoring!
"""
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
pass
def use(self, text: str) -> float:
return 3.0
__all__ = (
"ThreeCheat",
)

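As a sanity check on the metrics, a constant predictor such as ThreeCheat has an easily computed mean squared error; a quick sketch, assuming a perfectly balanced evaluation set (one review per star rating), which no real sample guarantees:

# Expected MSE of a constant 3.0 prediction on a balanced 1..5 star set.
ratings = [1.0, 2.0, 3.0, 4.0, 5.0]
mse = sum((3.0 - rating) ** 2 for rating in ratings) / len(ratings)
print(mse)  # 2.0 -- any trained model should comfortably beat this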
View file

@ -6,7 +6,7 @@ import logging
import typing as t
import itertools
from ..database import Text, Category, Review, CachedDatasetFunc
from ..database import TextReview, CachedDatasetFunc, TokenizedReview
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
from ..log import count_passage
from ..tokenizer import BaseTokenizer
@ -23,31 +23,17 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
"""
def __init__(self, *, tokenizer: BaseTokenizer) -> None:
if not tokenizer.supports_plain():
raise TypeError("Tokenizer does not support NLTK")
super().__init__(tokenizer=tokenizer)
self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
self.trained: bool = False
self.tokenizer: BaseTokenizer = tokenizer
def __repr__(self):
return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
"""
Convert the `Text` of a `DataTuple` to a `TokenBag`.
"""
count_passage(log, "tokenize_datatuple", 100)
return self.tokenizer.tokenize_and_split_plain(datatuple.text), datatuple.category
def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
def _add_feature_unigrams(self, dataset: t.Iterator[TokenizedReview]) -> None:
"""
Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model.
"""
# Ignore the category and only access the tokens
tokenbags = map(lambda d: d[0], dataset)
tokenbags = map(lambda r: r.tokens, dataset)
# Get all words in the documents
all_words = self.model.all_words(tokenbags, labeled=False)
# Create unigram `contains(*)` features from the previously gathered words
@ -55,59 +41,48 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
# Add the feature extractor to the model
self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
def _add_feature_extractors(self, dataset: t.Iterator[tuple[TokenBag, Category]]):
def _add_feature_extractors(self, dataset: t.Iterator[TextReview]):
"""
Register new feature extractors on the `.model`.
"""
# Tokenize the reviews
dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
# Add the unigrams feature
self._add_feature_unigrams(dataset)
def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
def __extract_features(self, review: TextReview) -> tuple[Features, float]:
"""
Convert a `TextReview` into a (Features, rating) tuple.
Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
"""
count_passage(log, "extract_features", 100)
return self.model.extract_features(data[0]), data[1]
review: TokenizedReview = self.tokenizer.tokenize_review(review)
return self.model.extract_features(review.tokens), review.rating
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
# Forbid retraining the model
if self.trained:
raise AlreadyTrainedError()
# Get a generator
dataset: t.Generator[Review] = training_dataset_func()
# Tokenize the dataset
dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, dataset)
# Cleanly duplicate the dataset iterator
# Reduce average memory footprint, but not maximum
dataset_1, dataset_2 = itertools.tee(dataset, 2)
dataset_1: t.Iterator[tuple[TokenBag, Category]]
dataset_2: t.Iterator[tuple[TokenBag, Category]]
# Add the feature extractors to the model
self._add_feature_extractors(dataset_1)
del dataset_1 # Delete exhausted iterator
self._add_feature_extractors(training_dataset_func())
# Extract features from the dataset
dataset_2: t.Iterator[tuple[Features, Category]] = map(self.__extract_features, dataset_2)
featureset: t.Iterator[tuple[Features, float]] = map(self.__extract_features, training_dataset_func())
# Train the classifier with the extracted features and category
self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
# Toggle the trained flag
self.trained = True
def use(self, text: Text) -> Category:
def use(self, text: str) -> float:
# Require the model to be trained
if not self.trained:
raise NotTrainedError()
# Tokenize the input
tokens = self.tokenizer.tokenize_and_split_plain(text)
tokens = self.tokenizer.tokenize(text)
# Run the classification method
return self.model.classify(instance=tokens)

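Outside the project's classes, the NLTK pipeline used above boils down to a few library calls; a minimal sketch with two made-up, pre-tokenized reviews (the real code streams reviews from the cached dataset instead):

import nltk.classify
import nltk.sentiment
import nltk.sentiment.util

# Two made-up (tokens, rating) pairs standing in for tokenized reviews.
tokenized = [(["good", "film"], 5.0), (["bad", "film"], 1.0)]

analyzer = nltk.sentiment.SentimentAnalyzer()
all_words = analyzer.all_words([tokens for tokens, _ in tokenized], labeled=False)
unigrams = analyzer.unigram_word_feats(all_words)
analyzer.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)

featureset = [(analyzer.extract_features(tokens), rating) for tokens, rating in tokenized]
analyzer.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
print(analyzer.classifier.classify(analyzer.extract_features(["good"])))  # 5.0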
View file

@ -5,7 +5,7 @@ import numpy
import tensorflow
import logging
from ..database import Text, Category, CachedDatasetFunc, Review
from ..database import CachedDatasetFunc, TextReview, TokenizedReview
from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS
from ..tokenizer import BaseTokenizer
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError
@ -19,31 +19,7 @@ else:
log.debug("Tensorflow successfully found GPU acceleration!")
ConversionFunc = t.Callable[[Review], tensorflow.Tensor | tuple]
def build_dataset(dataset_func: CachedDatasetFunc, conversion_func: ConversionFunc, output_signature: tensorflow.TensorSpec | tuple) -> tensorflow.data.Dataset:
"""
Convert a `CachedDatasetFunc` to a `tensorflow.data.Dataset`.
"""
def dataset_generator():
for review in dataset_func():
yield conversion_func(review)
log.debug("Creating dataset...")
dataset = tensorflow.data.Dataset.from_generator(
dataset_generator,
output_signature=output_signature,
)
log.debug("Caching dataset...")
dataset = dataset.cache()
log.debug("Configuring dataset prefetch...")
dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)
return dataset
ConversionFunc = t.Callable[[TextReview], tensorflow.Tensor | tuple]
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
@ -52,31 +28,15 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
"""
def __init__(self, *, tokenizer: BaseTokenizer):
if not tokenizer.supports_tensorflow():
raise TypeError("Tokenizer does not support Tensorflow")
super().__init__(tokenizer=tokenizer)
self.trained: bool = False
self.failed: bool = False
self.tokenizer: BaseTokenizer = tokenizer
self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_text_vectorization_layer()
self.string_lookup_layer: tensorflow.keras.layers.StringLookup = tensorflow.keras.layers.StringLookup(max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__)
self.model: tensorflow.keras.Sequential = self._build_model()
self.history: tensorflow.keras.callbacks.History | None = None
def _build_text_vectorization_layer(self) -> tensorflow.keras.layers.TextVectorization:
"""
Create a `tensorflow`-compatible `TextVectorization` layer.
"""
log.debug("Creating TextVectorization layer...")
layer = tensorflow.keras.layers.TextVectorization(
standardize=self.tokenizer.tokenize_tensorflow_and_expand_dims,
max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__
)
log.debug("Created TextVectorization layer: %s", layer)
return layer
@abc.abstractmethod
def _build_model(self) -> tensorflow.keras.Sequential:
"""
@ -84,33 +44,44 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
"""
raise NotImplementedError()
@abc.abstractmethod
def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
"""
Create a `tensorflow.data.Dataset` from the given `CachedDatasetFunc`.
"""
raise NotImplementedError()
def dataset_generator():
for review in dataset_func():
review: TextReview
review: TokenizedReview = self.tokenizer.tokenize_review(review)
tokens: tensorflow.Tensor = self._tokens_to_tensor(review.tokens)
rating: tensorflow.Tensor = self._rating_to_input(review.rating)
yield tokens, rating
log.debug("Creating dataset...")
dataset = tensorflow.data.Dataset.from_generator(
dataset_generator,
output_signature=(
tensorflow.TensorSpec(shape=(1, None,), dtype=tensorflow.string, name="tokens"),
self._ratingtensor_shape(),
),
)
log.debug("Caching dataset...")
dataset = dataset.cache()
log.debug("Configuring dataset prefetch...")
dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)
return dataset
def _adapt_textvectorization(self, dataset: tensorflow.data.Dataset) -> None:
"""
Adapt the `.text_vectorization_layer` to the given dataset.
"""
log.debug("Preparing dataset to adapt %s...", self.text_vectorization_layer)
log.debug("Preparing dataset to adapt %s...", self.string_lookup_layer)
dataset = dataset.map(lambda text, category: text)
log.debug("Adapting %s...", self.text_vectorization_layer)
self.text_vectorization_layer.adapt(dataset)
def _vectorize_dataset(self, dataset: tensorflow.data.Dataset) -> tensorflow.data.Dataset:
"""
Apply the `.text_vectorization_layer` to the text in the dataset.
"""
def vectorize_entry(text, category):
return self.text_vectorization_layer(text), category
log.debug("Vectorizing dataset: %s", dataset)
dataset = dataset.map(vectorize_entry)
log.debug("Vectorized dataset: %s", dataset)
return dataset
log.debug("Adapting %s...", self.string_lookup_layer)
self.string_lookup_layer.adapt(dataset)
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
if self.failed:
@ -120,13 +91,17 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
log.error("Tried to train an already trained model.")
raise AlreadyTrainedError("Cannot re-train an already trained model.")
log.debug("Building training dataset...")
training_set = self._build_dataset(training_dataset_func)
log.debug("Building validation dataset...")
validation_set = self._build_dataset(validation_dataset_func)
self._adapt_textvectorization(training_set)
log.debug("Building vocabulary...")
vocabulary = training_set.map(lambda tokens, rating: tokens)
training_set = self._vectorize_dataset(training_set)
validation_set = self._vectorize_dataset(validation_set)
log.debug("Adapting lookup layer to the vocabulary...")
self.string_lookup_layer.adapt(vocabulary)
log.info("Training: %s", self.model)
self.history: tensorflow.keras.callbacks.History | None = self.model.fit(
@ -146,25 +121,50 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
log.info("Model %s training succeeded!", self.model)
self.trained = True
@abc.abstractmethod
def _translate_prediction(self, a: numpy.array) -> Category:
@staticmethod
def _tokens_to_tensor(tokens: t.Iterator[str]) -> tensorflow.Tensor:
"""
Convert the results of `tensorflow.keras.Sequential.predict` into a `.Category`.
Convert an iterator of tokens to a `tensorflow.Tensor`.
"""
tensor = tensorflow.convert_to_tensor(
[list(tokens)],
dtype=tensorflow.string,
name="tokens"
)
return tensor
def use(self, text: str) -> float:
if self.failed:
raise NotTrainedError("Cannot use a failed model.")
if not self.trained:
raise NotTrainedError("Cannot use a non-trained model.")
tokens = self.tokenizer.tokenize(text)
tokens = self._tokens_to_tensor(tokens)
prediction = self.model.predict(tokens, verbose=False)
prediction = self._prediction_to_rating(prediction)
return prediction
@abc.abstractmethod
def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
"""
Convert a review rating to a `tensorflow.Tensor`.
"""
raise NotImplementedError()
def use(self, text: Text) -> Category:
if self.failed:
log.error("Tried to use a failed model.")
raise NotTrainedError("Cannot use a failed model.")
if not self.trained:
log.error("Tried to use a non-trained model.")
raise NotTrainedError("Cannot use a non-trained model.")
@abc.abstractmethod
def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
"""
Returns the `TensorSpec` of the rating tensor produced by `._rating_to_input`, used as part of the dataset's output signature.
"""
raise NotImplementedError()
vector = self.text_vectorization_layer(text)
prediction = self.model.predict(vector, verbose=False)
return self._translate_prediction(prediction)
@abc.abstractmethod
def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
"""
Convert the results of `tensorflow.keras.Sequential.predict` into a review rating.
"""
raise NotImplementedError()
class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
@ -172,19 +172,10 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
A `tensorflow`-based sentiment analyzer that considers each star rating as a separate category.
"""
def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
return build_dataset(
dataset_func=dataset_func,
conversion_func=Review.to_tensor_tuple_category,
output_signature=(
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="category_one_hot"),
),
)
def _build_model(self) -> tensorflow.keras.Sequential:
log.debug("Creating sequential categorizer model...")
model = tensorflow.keras.Sequential([
self.string_lookup_layer,
tensorflow.keras.layers.Embedding(
input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
@ -209,15 +200,35 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
log.debug("Compiled model: %s", model)
return model
def _translate_prediction(self, a: numpy.array) -> Category:
max_i = None
max_p = None
for i, p in enumerate(iter(a[0])):
if max_p is None or p > max_p:
max_i = i
max_p = p
result = float(max_i) + 1.0
return float(round(result))
def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
tensor = tensorflow.convert_to_tensor(
[[
1.0 if rating == 1.0 else 0.0,
1.0 if rating == 2.0 else 0.0,
1.0 if rating == 3.0 else 0.0,
1.0 if rating == 4.0 else 0.0,
1.0 if rating == 5.0 else 0.0,
]],
dtype=tensorflow.float32,
name="rating_one_hot"
)
return tensor
def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
spec = tensorflow.TensorSpec(shape=(1, 5), dtype=tensorflow.float32, name="rating_one_hot")
return spec
def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
best_prediction = None
best_prediction_index = None
for index, prediction in enumerate(iter(prediction[0])):
if best_prediction is None or prediction > best_prediction:
best_prediction = prediction
best_prediction_index = index
result = float(best_prediction_index) + 1.0
return result
class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
@ -225,19 +236,10 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
A `tensorflow`-based sentiment analyzer that uses the floating point value rating to get as close as possible to the correct category.
"""
def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
return build_dataset(
dataset_func=dataset_func,
conversion_func=Review.to_tensor_tuple_normvalue,
output_signature=(
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="category"),
),
)
def _build_model(self) -> tensorflow.keras.Sequential:
log.debug("Creating sequential categorizer model...")
model = tensorflow.keras.Sequential([
self.string_lookup_layer,
tensorflow.keras.layers.Embedding(
input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
@ -245,7 +247,9 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
tensorflow.keras.layers.Dropout(0.25),
tensorflow.keras.layers.GlobalAveragePooling1D(),
tensorflow.keras.layers.Dropout(0.25),
tensorflow.keras.layers.Dense(1, activation="sigmoid"),
tensorflow.keras.layers.Dense(8),
tensorflow.keras.layers.Dropout(0.25),
tensorflow.keras.layers.Dense(1, activation=tensorflow.keras.activations.sigmoid),
])
log.debug("Compiling model: %s", model)
@ -257,11 +261,23 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
log.debug("Compiled model: %s", model)
return model
def _translate_prediction(self, a: numpy.array) -> Category:
a: float = a[0, 0]
a = a * 2 + 1
a = float(round(a))
return a
def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
normalized_rating = (rating - 1) / 4
tensor = tensorflow.convert_to_tensor(
[normalized_rating],
dtype=tensorflow.float32,
name="rating_value"
)
return tensor
def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
spec = tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="rating_value")
return spec
def _prediction_to_rating(self, prediction: numpy.array) -> float:
rating: float = prediction[0, 0]
rating = 1.0 if rating < 0.5 else 5.0
return rating
__all__ = (

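The two rating conversions introduced above are easy to check in isolation; a small sketch with made-up prediction values (numpy only, no model involved):

import numpy

# Category analyzer: one-hot rating in, argmax + 1 out (see _prediction_to_rating).
one_hot_prediction = numpy.array([[0.1, 0.2, 0.05, 0.6, 0.05]])
category_rating = float(one_hot_prediction[0].argmax()) + 1.0
print(category_rating)  # 4.0

# Polar analyzer: stars normalized to 0..1 on the way in, thresholded on the way out.
normalized = (5.0 - 1) / 4                 # as in _rating_to_input
polar_rating = 1.0 if 0.73 < 0.5 else 5.0  # as in _prediction_to_rating
print(normalized, polar_rating)  # 1.0 5.0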
View file

@ -4,15 +4,15 @@ import shutil
import pathlib
import pickle
from .datatypes import Review
from .datatypes import TextReview
log = logging.getLogger(__name__)
CachedDatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]]
CachedDatasetFunc = t.Callable[[], t.Generator[TextReview, t.Any, None]]
def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
def store_cache(reviews: t.Iterator[TextReview], path: str | pathlib.Path) -> None:
"""
Store the contents of the given `Review` iterator to different files in a directory at the given path.
"""
@ -54,7 +54,7 @@ def load_cache(path: str | pathlib.Path) -> CachedDatasetFunc:
log.debug("Loading pickle file: %s", document_path)
with open(document_path, "rb") as file:
result: Review = pickle.load(file)
result: TextReview = pickle.load(file)
yield result
return data_cache_loader

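The cache is just one pickle file per review inside a directory; a throwaway sketch of the round trip, with a plain dict standing in for a TextReview and a temporary path and file name chosen here purely for illustration:

import pathlib
import pickle
import tempfile

# One pickle file per review, as in store_cache / load_cache.
path = pathlib.Path(tempfile.mkdtemp())
review = {"rating": 5.0, "text": "Loved it."}  # stand-in for a TextReview

with open(path / "0.pickle", "wb") as file:
    pickle.dump(review, file)

with open(path / "0.pickle", "rb") as file:
    print(pickle.load(file))  # {'rating': 5.0, 'text': 'Loved it.'}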
View file

@ -1,75 +1,80 @@
import tensorflow
import abc
import typing as t
from .collections import MongoReview
import logging
log = logging.getLogger(__name__)
Text = str
Category = float
class Review(metaclass=abc.ABCMeta):
"""
Base class for method common to both review types.
"""
def __init__(self, *, rating: float):
self.rating: float = rating
"""
The star rating of the review, from ``1.0`` to ``5.0``.
"""
class Review:
class TextReview(Review):
"""
Optimized container for a review with the text still intact.
Uses `__slots__` for better performance.
"""
__slots__ = (
"text",
"category",
"rating",
)
def __init__(self, text: Text, category: Category):
def __init__(self, *, rating: float, text: str):
super().__init__(rating=rating)
self.text: str = text
self.category: float = category
"""
The contents of the review.
"""
@classmethod
def from_mongoreview(cls, review: MongoReview):
def from_mongoreview(cls, review: MongoReview) -> "TextReview":
"""
Create a new `.TextReview` object from a `MongoReview` `dict`.
"""
return cls(
text=review["reviewText"],
category=review["overall"],
rating=review["overall"],
)
def __repr__(self):
return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
return f"<{self.__class__.__qualname__}: ({self.rating}*) {self.text[:80]}>"
def __getitem__(self, item):
if item == 0 or item == "text":
return self.text
elif item == 1 or item == "category":
return self.category
else:
raise KeyError(item)
def normvalue(self) -> float:
return (self.category - 1) / 2
class TokenizedReview(Review):
"""
Optimized container for a review with a tokenized text.
def to_tensor_text(self) -> tensorflow.Tensor:
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
Uses `__slots__` for better performance.
"""
def to_tensor_normvalue(self) -> tensorflow.Tensor:
return tensorflow.convert_to_tensor([self.normvalue()], dtype=tensorflow.float32)
__slots__ = (
"tokens",
"rating",
)
def to_tensor_tuple_normvalue(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
return (
self.to_tensor_text(),
self.to_tensor_normvalue(),
)
def __init__(self, *, rating: float, tokens: t.Iterator[str]):
super().__init__(rating=rating)
def to_tensor_category(self) -> tensorflow.Tensor:
return tensorflow.convert_to_tensor([[
1.0 if self.category == 1.0 else 0.0,
1.0 if self.category == 2.0 else 0.0,
1.0 if self.category == 3.0 else 0.0,
1.0 if self.category == 4.0 else 0.0,
1.0 if self.category == 5.0 else 0.0,
]], dtype=tensorflow.float32)
self.tokens: list[str] = list(tokens)
"""
List of all tokens in the review text.
"""
def to_tensor_tuple_category(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
return (
self.to_tensor_text(),
self.to_tensor_category(),
)
def __repr__(self):
return f"<{self.__class__.__qualname__}: ({self.rating}*) [{len(self.tokens)} tokens]>"
__all__ = (
"Text",
"Category",
"Review",
"TextReview",
"TokenizedReview",
)

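The mapping from a raw MongoDB document to a TextReview only touches two fields; a sketch with a made-up document:

# Made-up MongoReview document; only these two fields are read by from_mongoreview.
mongo_review = {"reviewText": "Works as advertised.", "overall": 4.0}

text = mongo_review["reviewText"]
rating = mongo_review["overall"]
print(f"<TextReview: ({rating}*) {text[:80]}>")  # mirrors TextReview.__repr__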
View file

@ -4,15 +4,15 @@ import typing as t
from ..config import WORKING_SET_SIZE
from .collections import MongoReview
from .datatypes import Review
from .datatypes import TextReview
log = logging.getLogger(__name__)
SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[Review]]
SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[TextReview]]
def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
"""
Get ``amount`` random reviews from the ``reviews`` collection.
"""
@ -23,12 +23,12 @@ def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.
{"$sample": {"size": amount}},
])
cursor = map(Review.from_mongoreview, cursor)
cursor = map(TextReview.from_mongoreview, cursor)
return cursor
def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[TextReview]:
"""
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
"""
@ -43,7 +43,7 @@ def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating:
return cursor
def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
category_amount = amount // 2
log.debug("Getting a sample of %d polar reviews...", category_amount * 2)
@ -68,12 +68,12 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
}}
])
cursor = map(Review.from_mongoreview, cursor)
cursor = map(TextReview.from_mongoreview, cursor)
return cursor
def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
category_amount = amount // 5
log.debug("Getting a sample of %d varied reviews...", category_amount * 5)
@ -123,7 +123,7 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
}}
])
cursor = map(Review.from_mongoreview, cursor)
cursor = map(TextReview.from_mongoreview, cursor)
return cursor

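All of the sampling functions lean on MongoDB's $sample stage; a sketch of the core pipeline with pymongo, assuming a local MongoDB instance, with the database and collection names used here purely as placeholders:

import pymongo

# Placeholder connection details -- adjust to the actual deployment.
client = pymongo.MongoClient("mongodb://localhost:27017")
collection = client["reviews"]["reviews"]

# Core of sample_reviews: draw a few random documents.
cursor = collection.aggregate([
    {"$sample": {"size": 3}},
])
for document in cursor:
    print(document["overall"], document["reviewText"][:60])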
View file

@ -5,7 +5,7 @@ import logging
import pymongo
from .config import TRAINING_SET_SIZE, VALIDATION_SET_SIZE, EVALUATION_SET_SIZE
from .database import SampleFunc, CachedDatasetFunc, mongo_client_from_config, reviews_collection, store_cache, load_cache, delete_cache
from .database import SampleFunc, CachedDatasetFunc, store_cache, load_cache, delete_cache
log = logging.getLogger(__name__)

View file

@ -1,51 +1,26 @@
import tensorflow
import typing as t
import abc
from ..database.datatypes import TextReview, TokenizedReview
class BaseTokenizer:
class BaseTokenizer(metaclass=abc.ABCMeta):
"""
The base for all tokenizers in this project.
"""
def __repr__(self):
return f"{self.__class__.__qualname__}()"
return f"<{self.__class__.__qualname__}>"
@staticmethod
def __not_implemented(f):
f.__notimplemented__ = True
return f
def supports_plain(self) -> bool:
return not getattr(self.tokenize_plain, "__notimplemented__", False)
def supports_tensorflow(self) -> bool:
return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)
@__not_implemented
def tokenize_plain(self, text: str) -> str:
@abc.abstractmethod
def tokenize(self, text: str) -> t.Iterator[str]:
"""
Convert a text `str` into an iterator of token `str`s.
"""
raise NotImplementedError()
def tokenize_and_split_plain(self, text: str) -> list[str]:
def tokenize_review(self, review: TextReview) -> TokenizedReview:
"""
Run `.tokenize_plain`, then split the result using `str.split`.
Apply `.tokenize` to the text of a `TextReview`, converting it into a `TokenizedReview`.
"""
text = self.tokenize_plain(text)
text = text.split()
return text
@__not_implemented
def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
"""
Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
"""
raise NotImplementedError()
def tokenize_tensorflow_and_expand_dims(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
"""
Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
"""
text = self.tokenize_tensorflow(text)
text = tensorflow.expand_dims(text, -1, name="tokens")
return text
tokens = self.tokenize(review.text)
return TokenizedReview(rating=review.rating, tokens=tokens)

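Under the new API a tokenizer only has to provide `tokenize`; everything else, including `tokenize_review`, comes from the base class. A minimal sketch with a hypothetical standalone tokenizer (not part of the project):

import typing as t

# Hypothetical tokenizer mirroring the new single-method contract.
class CommaTokenizer:
    def tokenize(self, text: str) -> t.Iterator[str]:
        return iter(token.strip() for token in text.split(","))

print(list(CommaTokenizer().tokenize("great sound, poor battery")))
# ['great sound', 'poor battery']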
View file

@ -1,10 +1,15 @@
import abc
import tokenizers
import typing as t
from .base import BaseTokenizer
class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
"""
Abstract tokenizer to implement any tokenizer based on HuggingFace `tokenizers.Tokenizer`.
"""
def __init__(self):
super().__init__()
self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()
@ -12,11 +17,15 @@ class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
raise NotImplementedError()
def tokenize_plain(self, text: str) -> str:
return " ".join(self.hug.encode(text).tokens)
def tokenize(self, text: str) -> t.Iterator[str]:
return self.hug.encode(text).tokens
class HuggingBertTokenizer(HuggingTokenizer):
"""
Tokenizer based on the `bert-base-cased <https://huggingface.co/bert-base-cased>`_ tokenizer.
"""
def _build_hugging_tokenizer(self):
return tokenizers.Tokenizer.from_pretrained("bert-base-cased")

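For reference, the underlying HuggingFace call can be tried on its own; this downloads the bert-base-cased tokenizer from the Hub on first run:

import tokenizers

hug = tokenizers.Tokenizer.from_pretrained("bert-base-cased")
encoding = hug.encode("Sentiment analysis of Amazon reviews")
print(encoding.tokens)  # WordPiece tokens (may include special tokens such as [CLS]/[SEP])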
View file

@ -1,17 +1,14 @@
import tensorflow
import typing as t
from .base import BaseTokenizer
class LowercaseTokenizer(BaseTokenizer):
"""
Tokenizer which converts the words to lowercase before splitting them via spaces.
Tokenizer which converts the words to lowercase before splitting them with `str.split`.
"""
def tokenize_plain(self, text: str) -> str:
def tokenize(self, text: str) -> t.Iterator[str]:
text = text.lower()
return text
def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
text = tensorflow.strings.lower(text)
return text
tokens = text.split()
return tokens

View file

@ -9,7 +9,7 @@ class NLTKWordTokenizer(BaseTokenizer):
Tokenizer based on `nltk.word_tokenize`.
"""
def tokenize_plain(self, text: str) -> str:
def tokenize(self, text: str) -> t.Iterator[str]:
tokens = nltk.word_tokenize(text)
nltk.sentiment.util.mark_negation(tokens, shallow=True)
return tokens

View file

@ -1,15 +1,13 @@
import tensorflow
import typing as t
from .base import BaseTokenizer
class PlainTokenizer(BaseTokenizer):
"""
Tokenizer which just splits the text into tokens by separating them at whitespaces.
Tokenizer which just splits the text into tokens, separating them at whitespace with `str.split`.
"""
def tokenize_plain(self, text: str) -> str:
return text
def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
return text
def tokenize(self, text: str) -> t.Iterator[str]:
tokens = text.split()
return tokens

View file

@ -1,4 +1,3 @@
import tensorflow
import re
import html.entities
import typing as t
@ -11,7 +10,7 @@ class PottsTokenizer(BaseTokenizer):
"""
Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.
This module is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
This class is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
"""
# noinspection RegExpRepeatedSpace
@ -76,7 +75,7 @@ class PottsTokenizer(BaseTokenizer):
amp = "&amp;"
@classmethod
def __html2string(cls, s: str) -> str:
def html_entities_to_chr(cls, s: str) -> str:
"""
Internal method that replaces all the HTML entities in s with their corresponding characters.
"""
@ -102,24 +101,41 @@ class PottsTokenizer(BaseTokenizer):
s = s.replace(cls.amp, " and ")
return s
def tokenize_plain(self, text: str) -> str:
@classmethod
def lower_but_preserve_emoticons(cls, word):
"""
Internal method which lowercases the word if it does not match `.emoticon_re`.
"""
if cls.emoticon_re.search(word):
return word
else:
return word.lower()
def tokenize(self, text: str) -> t.Iterator[str]:
# Fix HTML character entities
s = self.__html2string(text)
text = self.html_entities_to_chr(text)
# Tokenize
words = self.words_re.findall(s)
tokens = self.words_re.findall(text)
# Possibly alter the case, but avoid changing emoticons like :D into :d
words = list(map(lambda x: x if self.emoticon_re.search(x) else x.lower(), words))
# Re-join words
result = " ".join(words)
tokens = map(self.lower_but_preserve_emoticons, tokens)
# Return the result
return result
return tokens
class PottsTokenizerWithNegation(PottsTokenizer):
def tokenize_plain(self, text: str) -> str:
words = super().tokenize_plain(text).split()
"""
Version of `.PottsTokenizer` which after tokenizing applies `nltk.sentiment.util.mark_negation`.
"""
def tokenize(self, text: str) -> t.Iterator[str]:
# Apply the base tokenization
words = super().tokenize(text)
# Convert the iterator to a list (sigh)
words = list(words)
# Use nltk to mark negation
nltk.sentiment.util.mark_negation(words, shallow=True)
return " ".join(words)
# Return the result
return words
__all__ = (