Mirror of https://github.com/Steffo99/unimore-bda-6.git (synced 2024-11-21 23:44:19 +00:00)

Commit 4e8aa68db3 ("CODE IS DONE"), parent ae2cf563e6
17 changed files with 331 additions and 304 deletions

@@ -5,14 +5,14 @@
 <option name="PARENT_ENVS" value="true" />
 <envs>
 <env name="CONFIRM_OVERWRITE" value="False" />
-<env name="EVALUATION_SET_SIZE" value="100" />
+<env name="EVALUATION_SET_SIZE" value="4000" />
 <env name="NLTK_DATA" value="./data/nltk" />
 <env name="PYTHONUNBUFFERED" value="1" />
 <env name="TENSORFLOW_EMBEDDING_SIZE" value="64" />
 <env name="TENSORFLOW_MAX_FEATURES" value="1000000" />
 <env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
-<env name="TRAINING_SET_SIZE" value="2000" />
-<env name="VALIDATION_SET_SIZE" value="25" />
+<env name="TRAINING_SET_SIZE" value="4000" />
+<env name="VALIDATION_SET_SIZE" value="100" />
 <env name="WORKING_SET_SIZE" value="100000" />
 <env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
 </envs>
@@ -6,7 +6,7 @@ install_general_log_handlers()

 from .config import config
 from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
-from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
+from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer, ThreeCheat
 from .analysis.base import TrainingFailedError
 from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
 from .gathering import Caches
@@ -32,25 +32,29 @@ def main():

 reviews = reviews_collection(db)

-for sample_func in [sample_reviews_varied, sample_reviews_polar]:
+for sample_func in [
+sample_reviews_polar,
+sample_reviews_varied,
+]:

 slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
 slog.debug("Selected sample_func: %s", sample_func.__name__)

 for SentimentAnalyzer in [
+# ThreeCheat,
 TensorflowPolarSentimentAnalyzer,
 TensorflowCategorySentimentAnalyzer,
-# NLTKSentimentAnalyzer,
+NLTKSentimentAnalyzer,
 ]:

 slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
 slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)

 for Tokenizer in [
-PottsTokenizerWithNegation,
 PottsTokenizer,
-HuggingBertTokenizer,
 PlainTokenizer,
+HuggingBertTokenizer,
+PottsTokenizerWithNegation,
 LowercaseTokenizer,
 NLTKWordTokenizer,
 ]:
@@ -5,3 +5,4 @@ This module contains all implemented types of sentiment analyzers.
 from .base import *
 from .nltk_sentiment import *
 from .tf_text import *
+from .cheating import *
@@ -4,7 +4,7 @@ import abc
 import logging
 import dataclasses

-from ..database import Text, Category, CachedDatasetFunc
+from ..database import CachedDatasetFunc, TextReview, TokenizedReview
 from ..tokenizer import BaseTokenizer

 log = logging.getLogger(__name__)
@@ -15,12 +15,11 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
 Abstract base class for sentiment analyzers implemented in this project.
 """

-# noinspection PyUnusedLocal
 def __init__(self, *, tokenizer: BaseTokenizer):
-pass
+self.tokenizer: BaseTokenizer = tokenizer

 def __repr__(self):
-return f"<{self.__class__.__qualname__}>"
+return f"<{self.__class__.__qualname__} with {self.tokenizer} tokenizer>"

 @abc.abstractmethod
 def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
@@ -30,34 +29,34 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
 raise NotImplementedError()

 @abc.abstractmethod
-def use(self, text: Text) -> Category:
+def use(self, text: str) -> float:
 """
-Run the model on the given input.
+Run the model on the given input, and return the predicted rating.
 """
 raise NotImplementedError()

 def evaluate(self, evaluation_dataset_func: CachedDatasetFunc) -> EvaluationResults:
 """
 Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.

-Returns a tuple with the number of correct results and the number of evaluated results.
 """

 evaluated: int = 0
-correct: int = 0
-score: float = 0.0
+perfect: int = 0

+squared_error: float = 0.0

 for review in evaluation_dataset_func():
 resulting_category = self.use(review.text)
-log.debug("Evaluation step: expected %d, received %d, review was %s", review.category, resulting_category, review.text[:80])
+log.debug("Evaluation step: %d for %s", resulting_category, review)
 evaluated += 1
 try:
-correct += 1 if resulting_category == review.category else 0
-score += 1 - (abs(resulting_category - review.category) / 4)
+perfect += 1 if resulting_category == review.rating else 0
+squared_error += (resulting_category - review.rating) ** 2
 except ValueError:
 log.warning("Model execution on %s resulted in a NaN value: %s", review, resulting_category)

-return EvaluationResults(correct=correct, evaluated=evaluated, score=score)
+return EvaluationResults(perfect=perfect, evaluated=evaluated, mse=squared_error / evaluated)


 @dataclasses.dataclass
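For orientation (not part of the commit; the values below are invented), the quantities the reworked evaluate() accumulates can be reproduced in isolation: the count of exact matches and the mean of the squared rating errors:

    # Illustrative sketch of the metric accumulation performed by the new evaluate().
    pairs = [(5.0, 5.0), (4.0, 5.0), (1.0, 2.0)]  # (predicted, expected) ratings
    evaluated = len(pairs)
    perfect = sum(1 for predicted, expected in pairs if predicted == expected)
    mse = sum((predicted - expected) ** 2 for predicted, expected in pairs) / evaluated
    print(evaluated, perfect, mse)  # 3 1 0.666...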
@@ -66,15 +65,26 @@ class EvaluationResults:
 Container for the results of a dataset evaluation.
 """

-correct: int
 evaluated: int
-score: float
+"""
+The number of reviews that were evaluated.
+"""
+
+perfect: int
+"""
+The number of reviews for which the model returned the correct rating.
+"""
+
+mse: float
+"""
+Mean squared error
+"""

 def __repr__(self):
 return f"<EvaluationResults: {self!s}>"

 def __str__(self):
-return f"{self.evaluated} evaluated, {self.correct} correct, {self.correct / self.evaluated:.2%} accuracy, {self.score:.2f} score, {self.score / self.evaluated:.2%} scoreaccuracy"
+return f"Evaluation results:\t{self.evaluated}\tevaluated\t{self.perfect}\tperfect\t{self.perfect / self.evaluated:.2%}\taccuracy\t{self.mse / self.evaluated:.2%}\tmean squared error"


 class AlreadyTrainedError(Exception):
unimore_bda_6/analysis/cheating.py (new file, 21 lines)
@@ -0,0 +1,21 @@
+from .base import BaseSentimentAnalyzer
+from ..database.cache import CachedDatasetFunc
+
+
+class ThreeCheat(BaseSentimentAnalyzer):
+"""
+A sentiment analyzer that always predicts a 3.0* rating.
+
+Why? To test the scoring!
+"""
+
+def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
+pass
+
+def use(self, text: str) -> float:
+return 3.0
+
+
+__all__ = (
+"ThreeCheat",
+)
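As a rough sanity check on the new scoring (an assumption about the evaluation set, not something stated in the commit), a constant 3.0 prediction on a set balanced across the five star ratings yields 20% accuracy and a mean squared error of 2.0:

    # Hypothetical balanced evaluation set: one review per star rating.
    expected = [1.0, 2.0, 3.0, 4.0, 5.0]
    predicted = [3.0] * len(expected)  # what ThreeCheat.use() always returns
    mse = sum((p - e) ** 2 for p, e in zip(predicted, expected)) / len(expected)
    accuracy = sum(p == e for p, e in zip(predicted, expected)) / len(expected)
    print(mse, accuracy)  # 2.0 0.2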
@@ -6,7 +6,7 @@ import logging
 import typing as t
 import itertools

-from ..database import Text, Category, Review, CachedDatasetFunc
+from ..database import TextReview, CachedDatasetFunc, TokenizedReview
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 from ..log import count_passage
 from ..tokenizer import BaseTokenizer
@@ -23,31 +23,17 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
 """

 def __init__(self, *, tokenizer: BaseTokenizer) -> None:
-if not tokenizer.supports_plain():
-raise TypeError("Tokenizer does not support NLTK")
-
 super().__init__(tokenizer=tokenizer)

 self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
 self.trained: bool = False
-self.tokenizer: BaseTokenizer = tokenizer

-def __repr__(self):
-return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
-
-def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
-"""
-Convert the `Text` of a `DataTuple` to a `TokenBag`.
-"""
-count_passage(log, "tokenize_datatuple", 100)
-return self.tokenizer.tokenize_and_split_plain(datatuple.text), datatuple.category
-
-def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
+def _add_feature_unigrams(self, dataset: t.Iterator[TokenizedReview]) -> None:
 """
 Register the `nltk.sentiment.util.extract_unigram_feats` feature extrator on the model.
 """
 # Ignore the category and only access the tokens
-tokenbags = map(lambda d: d[0], dataset)
+tokenbags = map(lambda r: r.rating, dataset)
 # Get all words in the documents
 all_words = self.model.all_words(tokenbags, labeled=False)
 # Create unigram `contains(*)` features from the previously gathered words
@@ -55,59 +41,48 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
 # Add the feature extractor to the model
 self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)

-def _add_feature_extractors(self, dataset: t.Iterator[tuple[TokenBag, Category]]):
+def _add_feature_extractors(self, dataset: t.Iterator[TextReview]):
 """
 Register new feature extractors on the `.model`.
 """
+# Tokenize the reviews
+dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
 # Add the unigrams feature
 self._add_feature_unigrams(dataset)

-def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
+def __extract_features(self, review: TextReview) -> tuple[Features, float]:
 """
 Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.

 Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
 """
-count_passage(log, "extract_features", 100)
-return self.model.extract_features(data[0]), data[1]
+review: TokenizedReview = self.tokenizer.tokenize_review(review)
+return self.model.extract_features(review.tokens), review.rating

 def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
 # Forbid retraining the model
 if self.trained:
 raise AlreadyTrainedError()

-# Get a generator
-dataset: t.Generator[Review] = training_dataset_func()
-
-# Tokenize the dataset
-dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, dataset)
-
-# Cleanly duplicate the dataset iterator
-# Reduce average memory footprint, but not maximum
-dataset_1, dataset_2 = itertools.tee(dataset, 2)
-dataset_1: t.Iterator[tuple[TokenBag, Category]]
-dataset_2: t.Iterator[tuple[TokenBag, Category]]
-
 # Add the feature extractors to the model
-self._add_feature_extractors(dataset_1)
-del dataset_1  # Delete exausted iterator
+self._add_feature_extractors(training_dataset_func())

 # Extract features from the dataset
-dataset_2: t.Iterator[tuple[Features, Category]] = map(self.__extract_features, dataset_2)
+featureset: t.Iterator[tuple[Features, float]] = map(self.__extract_features, training_dataset_func())

 # Train the classifier with the extracted features and category
-self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
+self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)

 # Toggle the trained flag
 self.trained = True

-def use(self, text: Text) -> Category:
+def use(self, text: str) -> float:
 # Require the model to be trained
 if not self.trained:
 raise NotTrainedError()

 # Tokenize the input
-tokens = self.tokenizer.tokenize_and_split_plain(text)
+tokens = self.tokenizer.tokenize(text)

 # Run the classification method
 return self.model.classify(instance=tokens)
@@ -5,7 +5,7 @@ import numpy
 import tensorflow
 import logging

-from ..database import Text, Category, CachedDatasetFunc, Review
+from ..database import CachedDatasetFunc, TextReview, TokenizedReview
 from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS
 from ..tokenizer import BaseTokenizer
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError
@@ -19,22 +19,51 @@ else:
 log.debug("Tensorflow successfully found GPU acceleration!")


-ConversionFunc = t.Callable[[Review], tensorflow.Tensor | tuple]
+ConversionFunc = t.Callable[[TextReview], tensorflow.Tensor | tuple]


-def build_dataset(dataset_func: CachedDatasetFunc, conversion_func: ConversionFunc, output_signature: tensorflow.TensorSpec | tuple) -> tensorflow.data.Dataset:
+class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
 """
-Convert a `CachedDatasetFunc` to a `tensorflow.data.Dataset`.
+Base class for a sentiment analyzer using `tensorflow`.
+"""
+
+def __init__(self, *, tokenizer: BaseTokenizer):
+super().__init__(tokenizer=tokenizer)
+
+self.trained: bool = False
+self.failed: bool = False
+
+self.string_lookup_layer: tensorflow.keras.layers.StringLookup = tensorflow.keras.layers.StringLookup(max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__)
+self.model: tensorflow.keras.Sequential = self._build_model()
+self.history: tensorflow.keras.callbacks.History | None = None
+
+@abc.abstractmethod
+def _build_model(self) -> tensorflow.keras.Sequential:
+"""
+Create the `tensorflow.keras.Sequential` model that should be executed by this sentiment analyzer.
+"""
+raise NotImplementedError()
+
+def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
+"""
+Create a `tensorflow.data.Dataset` from the given `CachedDatasetFunc`.
 """

 def dataset_generator():
 for review in dataset_func():
-yield conversion_func(review)
+review: TextReview
+review: TokenizedReview = self.tokenizer.tokenize_review(review)
+tokens: tensorflow.Tensor = self._tokens_to_tensor(review.tokens)
+rating: tensorflow.Tensor = self._rating_to_input(review.rating)
+yield tokens, rating

 log.debug("Creating dataset...")
 dataset = tensorflow.data.Dataset.from_generator(
 dataset_generator,
-output_signature=output_signature,
+output_signature=(
+tensorflow.TensorSpec(shape=(1, None,), dtype=tensorflow.string, name="tokens"),
+self._ratingtensor_shape(),
+),
 )

 log.debug("Caching dataset...")
@@ -45,72 +74,14 @@ def build_dataset(dataset_func: CachedDatasetFunc, conversion_func: ConversionFu

 return dataset


-class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
-"""
-Base class for a sentiment analyzer using `tensorflow`.
-"""
-
-def __init__(self, *, tokenizer: BaseTokenizer):
-if not tokenizer.supports_tensorflow():
-raise TypeError("Tokenizer does not support Tensorflow")
-
-super().__init__(tokenizer=tokenizer)
-
-self.trained: bool = False
-self.failed: bool = False
-
-self.tokenizer: BaseTokenizer = tokenizer
-self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_text_vectorization_layer()
-self.model: tensorflow.keras.Sequential = self._build_model()
-self.history: tensorflow.keras.callbacks.History | None = None
-
-def _build_text_vectorization_layer(self) -> tensorflow.keras.layers.TextVectorization:
-"""
-Create a `tensorflow`-compatible `TextVectorization` layer.
-"""
-log.debug("Creating TextVectorization layer...")
-layer = tensorflow.keras.layers.TextVectorization(
-standardize=self.tokenizer.tokenize_tensorflow_and_expand_dims,
-max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__
-)
-log.debug("Created TextVectorization layer: %s", layer)
-return layer
-
-@abc.abstractmethod
-def _build_model(self) -> tensorflow.keras.Sequential:
-"""
-Create the `tensorflow.keras.Sequential` model that should be executed by this sentiment analyzer.
-"""
-raise NotImplementedError()
-
-@abc.abstractmethod
-def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
-"""
-Create a `tensorflow.data.Dataset` from the given `CachedDatasetFunc`.
-"""
-raise NotImplementedError()
-
 def _adapt_textvectorization(self, dataset: tensorflow.data.Dataset) -> None:
 """
 Adapt the `.text_vectorization_layer` to the given dataset.
 """
-log.debug("Preparing dataset to adapt %s...", self.text_vectorization_layer)
+log.debug("Preparing dataset to adapt %s...", self.string_lookup_layer)
 dataset = dataset.map(lambda text, category: text)
-log.debug("Adapting %s...", self.text_vectorization_layer)
-self.text_vectorization_layer.adapt(dataset)
+log.debug("Adapting %s...", self.string_lookup_layer)
+self.string_lookup_layer.adapt(dataset)

-def _vectorize_dataset(self, dataset: tensorflow.data.Dataset) -> tensorflow.data.Dataset:
-"""
-Apply the `.text_vectorization_layer` to the text in the dataset.
-"""
-def vectorize_entry(text, category):
-return self.text_vectorization_layer(text), category
-
-log.debug("Vectorizing dataset: %s", dataset)
-dataset = dataset.map(vectorize_entry)
-log.debug("Vectorized dataset: %s", dataset)
-return dataset
-
 def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
 if self.failed:
@@ -120,13 +91,17 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
 log.error("Tried to train an already trained model.")
 raise AlreadyTrainedError("Cannot re-train an already trained model.")

+log.debug("Building training dataset...")
 training_set = self._build_dataset(training_dataset_func)

+log.debug("Building validation dataset...")
 validation_set = self._build_dataset(validation_dataset_func)

-self._adapt_textvectorization(training_set)
+log.debug("Building vocabulary...")
+vocabulary = training_set.map(lambda tokens, rating: tokens)

-training_set = self._vectorize_dataset(training_set)
-validation_set = self._vectorize_dataset(validation_set)
+log.debug("Adapting lookup layer to the vocabulary...")
+self.string_lookup_layer.adapt(vocabulary)

 log.info("Training: %s", self.model)
 self.history: tensorflow.keras.callbacks.History | None = self.model.fit(
@@ -146,25 +121,50 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
 log.info("Model %s training succeeded!", self.model)
 self.trained = True

-@abc.abstractmethod
-def _translate_prediction(self, a: numpy.array) -> Category:
-"""
-Convert the results of `tensorflow.keras.Sequential.predict` into a `.Category`.
-"""
-raise NotImplementedError()
-
-def use(self, text: Text) -> Category:
-if self.failed:
-log.error("Tried to use a failed model.")
-raise NotTrainedError("Cannot use a failed model.")
-if not self.trained:
-log.error("Tried to use a non-trained model.")
-raise NotTrainedError("Cannot use a non-trained model.")
-
-vector = self.text_vectorization_layer(text)
-prediction = self.model.predict(vector, verbose=False)
-
-return self._translate_prediction(prediction)
+@staticmethod
+def _tokens_to_tensor(tokens: t.Iterator[str]) -> tensorflow.Tensor:
+"""
+Convert an iterator of tokens to a `tensorflow.Tensor`.
+"""
+tensor = tensorflow.convert_to_tensor(
+[list(tokens)],
+dtype=tensorflow.string,
+name="tokens"
+)
+return tensor
+
+def use(self, text: str) -> float:
+if self.failed:
+raise NotTrainedError("Cannot use a failed model.")
+if not self.trained:
+raise NotTrainedError("Cannot use a non-trained model.")
+
+tokens = self.tokenizer.tokenize(text)
+tokens = self._tokens_to_tensor(tokens)
+prediction = self.model.predict(tokens, verbose=False)
+prediction = self._prediction_to_rating(prediction)
+return prediction
+
+@abc.abstractmethod
+def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
+"""
+Convert a review rating to a `tensorflow.Tensor`.
+"""
+raise NotImplementedError()
+
+@abc.abstractmethod
+def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
+"""
+Returns the shape of the tensor output by `._rating_to_tensor` and accepted as input by `._tensor_to_rating`.
+"""
+raise NotImplementedError()
+
+@abc.abstractmethod
+def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
+"""
+Convert the results of `tensorflow.keras.Sequential.predict` into a review rating.
+"""
+raise NotImplementedError()


 class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
@@ -172,19 +172,10 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
 A `tensorflow`-based sentiment analyzer that considers each star rating as a separate category.
 """

-def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
-return build_dataset(
-dataset_func=dataset_func,
-conversion_func=Review.to_tensor_tuple_category,
-output_signature=(
-tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
-tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="category_one_hot"),
-),
-)
-
 def _build_model(self) -> tensorflow.keras.Sequential:
 log.debug("Creating sequential categorizer model...")
 model = tensorflow.keras.Sequential([
+self.string_lookup_layer,
 tensorflow.keras.layers.Embedding(
 input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
 output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
@@ -209,15 +200,35 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
 log.debug("Compiled model: %s", model)
 return model

-def _translate_prediction(self, a: numpy.array) -> Category:
-max_i = None
-max_p = None
-for i, p in enumerate(iter(a[0])):
-if max_p is None or p > max_p:
-max_i = i
-max_p = p
-result = float(max_i) + 1.0
-return float(round(result))
+def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
+tensor = tensorflow.convert_to_tensor(
+[[
+1.0 if rating == 1.0 else 0.0,
+1.0 if rating == 2.0 else 0.0,
+1.0 if rating == 3.0 else 0.0,
+1.0 if rating == 4.0 else 0.0,
+1.0 if rating == 5.0 else 0.0,
+]],
+dtype=tensorflow.float32,
+name="rating_one_hot"
+)
+return tensor
+
+def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
+spec = tensorflow.TensorSpec(shape=(1, 5), dtype=tensorflow.float32, name="rating_one_hot")
+return spec
+
+def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
+best_prediction = None
+best_prediction_index = None
+
+for index, prediction in enumerate(iter(prediction[0])):
+if best_prediction is None or prediction > best_prediction:
+best_prediction = prediction
+best_prediction_index = index
+
+result = float(best_prediction_index) + 1.0
+return result


 class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
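A quick illustration (outside the diff, values invented) of the round trip the category analyzer now performs between ratings and one-hot rows: a 4.0* rating encodes to [0, 0, 0, 1, 0] and decodes back as the argmax index plus one:

    # Illustrative one-hot round trip for a 4.0* rating.
    rating = 4.0
    one_hot = [1.0 if rating == float(star) else 0.0 for star in range(1, 6)]
    print(one_hot)  # [0.0, 0.0, 0.0, 1.0, 0.0]
    decoded = float(max(range(5), key=lambda i: one_hot[i])) + 1.0
    print(decoded)  # 4.0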
@@ -225,19 +236,10 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
 A `tensorflow`-based sentiment analyzer that uses the floating point value rating to get as close as possible to the correct category.
 """

-def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
-return build_dataset(
-dataset_func=dataset_func,
-conversion_func=Review.to_tensor_tuple_normvalue,
-output_signature=(
-tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
-tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="category"),
-),
-)
-
 def _build_model(self) -> tensorflow.keras.Sequential:
 log.debug("Creating sequential categorizer model...")
 model = tensorflow.keras.Sequential([
+self.string_lookup_layer,
 tensorflow.keras.layers.Embedding(
 input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
 output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
@@ -245,7 +247,9 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
 tensorflow.keras.layers.Dropout(0.25),
 tensorflow.keras.layers.GlobalAveragePooling1D(),
 tensorflow.keras.layers.Dropout(0.25),
-tensorflow.keras.layers.Dense(1, activation="sigmoid"),
+tensorflow.keras.layers.Dense(8),
+tensorflow.keras.layers.Dropout(0.25),
+tensorflow.keras.layers.Dense(1, activation=tensorflow.keras.activations.sigmoid),
 ])

 log.debug("Compiling model: %s", model)
@@ -257,11 +261,23 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
 log.debug("Compiled model: %s", model)
 return model

-def _translate_prediction(self, a: numpy.array) -> Category:
-a: float = a[0, 0]
-a = a * 2 + 1
-a = float(round(a))
-return a
+def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
+normalized_rating = (rating - 1) / 4
+tensor = tensorflow.convert_to_tensor(
+[normalized_rating],
+dtype=tensorflow.float32,
+name="rating_value"
+)
+return tensor
+
+def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
+spec = tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="rating_value")
+return spec
+
+def _prediction_to_rating(self, prediction: numpy.array) -> float:
+rating: float = prediction[0, 0]
+rating = 1.0 if rating < 0.5 else 5.0
+return rating


 __all__ = (
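The polar analyzer's new encoding can be sketched the same way (illustrative only, not part of the commit): ratings are squashed to [0, 1] with (rating - 1) / 4, and _prediction_to_rating collapses the sigmoid output to either 1.0 or 5.0 at the 0.5 threshold:

    # Illustrative encoding/decoding matching the polar analyzer after this commit.
    def rating_to_input(rating: float) -> float:
        return (rating - 1) / 4  # 1.0 -> 0.0, 3.0 -> 0.5, 5.0 -> 1.0

    def prediction_to_rating(prediction: float) -> float:
        return 1.0 if prediction < 0.5 else 5.0

    print(rating_to_input(5.0), prediction_to_rating(0.87))  # 1.0 5.0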
@@ -4,15 +4,15 @@ import shutil
 import pathlib
 import pickle

-from .datatypes import Review
+from .datatypes import TextReview

 log = logging.getLogger(__name__)


-CachedDatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]]
+CachedDatasetFunc = t.Callable[[], t.Generator[TextReview, t.Any, None]]


-def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
+def store_cache(reviews: t.Iterator[TextReview], path: str | pathlib.Path) -> None:
 """
 Store the contents of the given `Review` iterator to different files in a directory at the given path.
 """
@@ -54,7 +54,7 @@ def load_cache(path: str | pathlib.Path) -> CachedDatasetFunc:

 log.debug("Loading pickle file: %s", document_path)
 with open(document_path, "rb") as file:
-result: Review = pickle.load(file)
+result: TextReview = pickle.load(file)
 yield result

 return data_cache_loader
@@ -1,75 +1,80 @@
-import tensorflow
+import abc
+import typing as t

 from .collections import MongoReview
-import logging
-
-log = logging.getLogger(__name__)
-
-
-Text = str
-Category = float
+
+
+class Review(metaclass=abc.ABCMeta):
+"""
+Base class for method common to both review types.
+"""
+
+def __init__(self, *, rating: float):
+self.rating: float = rating
+"""
+The star rating of the review, from ``1.0`` to ``5.0``.
+"""


-class Review:
+class TextReview(Review):
+"""
+Optimized container for a review with the text still intact.
+
+Uses `__slots__` for better performance.
+"""
+
 __slots__ = (
 "text",
-"category",
+"rating",
 )

-def __init__(self, text: Text, category: Category):
+def __init__(self, *, rating: float, text: str):
+super().__init__(rating=rating)
+
 self.text: str = text
-self.category: float = category
+"""
+The contents of the review.
+"""

 @classmethod
-def from_mongoreview(cls, review: MongoReview):
+def from_mongoreview(cls, review: MongoReview) -> "TextReview":
+"""
+Create a new `.Review` object from a `MongoReview` `dict`.
+"""
 return cls(
 text=review["reviewText"],
-category=review["overall"],
+rating=review["overall"],
 )

 def __repr__(self):
-return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
+return f"<{self.__class__.__qualname__}: ({self.rating}*) {self.text[:80]}>"

-def __getitem__(self, item):
-if item == 0 or item == "text":
-return self.text
-elif item == 1 or item == "category":
-return self.category
-else:
-raise KeyError(item)
-
-def normvalue(self) -> float:
-return (self.category - 1) / 2
-
-def to_tensor_text(self) -> tensorflow.Tensor:
-return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
-
-def to_tensor_normvalue(self) -> tensorflow.Tensor:
-return tensorflow.convert_to_tensor([self.normvalue()], dtype=tensorflow.float32)
-
-def to_tensor_tuple_normvalue(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
-return (
-self.to_tensor_text(),
-self.to_tensor_normvalue(),
-)
-
-def to_tensor_category(self) -> tensorflow.Tensor:
-return tensorflow.convert_to_tensor([[
-1.0 if self.category == 1.0 else 0.0,
-1.0 if self.category == 2.0 else 0.0,
-1.0 if self.category == 3.0 else 0.0,
-1.0 if self.category == 4.0 else 0.0,
-1.0 if self.category == 5.0 else 0.0,
-]], dtype=tensorflow.float32)
-
-def to_tensor_tuple_category(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
-return (
-self.to_tensor_text(),
-self.to_tensor_category(),
-)
+
+class TokenizedReview(Review):
+"""
+Optimized container for a review with a tokenized text.
+
+Uses `__slots__` for better performance.
+"""
+
+__slots__ = (
+"tokens",
+"rating",
+)
+
+def __init__(self, *, rating: float, tokens: t.Iterator[str]):
+super().__init__(rating=rating)
+
+self.tokens: list[str] = list(tokens)
+"""
+List of all tokens in the review text.
+"""
+
+def __repr__(self):
+return f"<{self.__class__.__qualname__}: ({self.rating}*) [{len(self.tokens)} tokens]>"


 __all__ = (
-"Text",
-"Category",
-"Review",
+"TextReview",
+"TokenizedReview",
 )
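A small usage sketch of the two new containers (the import path is assumed from the repository layout; the review text is made up):

    # Hypothetical usage of the new review containers.
    from unimore_bda_6.database.datatypes import TextReview, TokenizedReview

    review = TextReview(rating=5.0, text="Great product, would buy again!")
    print(review)      # <TextReview: (5.0*) Great product, would buy again!>
    tokenized = TokenizedReview(rating=review.rating, tokens=iter(review.text.split()))
    print(tokenized)   # <TokenizedReview: (5.0*) [5 tokens]>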
@@ -4,15 +4,15 @@ import typing as t

 from ..config import WORKING_SET_SIZE
 from .collections import MongoReview
-from .datatypes import Review
+from .datatypes import TextReview

 log = logging.getLogger(__name__)


-SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[Review]]
+SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[TextReview]]


-def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
 """
 Get ``amount`` random reviews from the ``reviews`` collection.
 """
@@ -23,12 +23,12 @@ def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.
 {"$sample": {"size": amount}},
 ])

-cursor = map(Review.from_mongoreview, cursor)
+cursor = map(TextReview.from_mongoreview, cursor)

 return cursor


-def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
+def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[TextReview]:
 """
 Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
 """
@@ -43,7 +43,7 @@ def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating:
 return cursor


-def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
 category_amount = amount // 2

 log.debug("Getting a sample of %d polar reviews...", category_amount * 2)
@@ -68,12 +68,12 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
 }}
 ])

-cursor = map(Review.from_mongoreview, cursor)
+cursor = map(TextReview.from_mongoreview, cursor)

 return cursor


-def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
 category_amount = amount // 5

 log.debug("Getting a sample of %d varied reviews...", category_amount * 5)
@@ -123,7 +123,7 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
 }}
 ])

-cursor = map(Review.from_mongoreview, cursor)
+cursor = map(TextReview.from_mongoreview, cursor)

 return cursor
@@ -5,7 +5,7 @@ import logging
 import pymongo

 from .config import TRAINING_SET_SIZE, VALIDATION_SET_SIZE, EVALUATION_SET_SIZE
-from .database import SampleFunc, CachedDatasetFunc, mongo_client_from_config, reviews_collection, store_cache, load_cache, delete_cache
+from .database import SampleFunc, CachedDatasetFunc, store_cache, load_cache, delete_cache

 log = logging.getLogger(__name__)
@@ -1,51 +1,26 @@
-import tensorflow
+import typing as t
+import abc
+from ..database.datatypes import TextReview, TokenizedReview


-class BaseTokenizer:
+class BaseTokenizer(metaclass=abc.ABCMeta):
 """
 The base for all tokenizers in this project.
 """

 def __repr__(self):
-return f"{self.__class__.__qualname__}()"
+return f"<{self.__class__.__qualname__}>"

-@staticmethod
-def __not_implemented(f):
-f.__notimplemented__ = True
-return f
-
-def supports_plain(self) -> bool:
-return not getattr(self.tokenize_plain, "__notimplemented__", False)
-
-def supports_tensorflow(self) -> bool:
-return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)
-
-@__not_implemented
-def tokenize_plain(self, text: str) -> str:
+@abc.abstractmethod
+def tokenize(self, text: str) -> t.Iterator[str]:
 """
 Convert a text `str` into another `str` containing a series of whitespace-separated tokens.
 """
 raise NotImplementedError()

-def tokenize_and_split_plain(self, text: str) -> list[str]:
+def tokenize_review(self, review: TextReview) -> TokenizedReview:
 """
-Run `.tokenize_plain`, then split the result using `str.split`.
+Apply `.tokenize` to the text of a `TextReview`, converting it in a `TokenizedReview`.
 """
-text = self.tokenize_plain(text)
-text = text.split()
-return text
-
-@__not_implemented
-def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
-"""
-Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
-"""
-raise NotImplementedError()
-
-def tokenize_tensorflow_and_expand_dims(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
-"""
-Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
-"""
-text = self.tokenize_tensorflow(text)
-text = tensorflow.expand_dims(text, -1, name="tokens")
-return text
+tokens = self.tokenize(review.text)
+return TokenizedReview(rating=review.rating, tokens=tokens)
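A minimal sketch of the new single tokenizer entry point (import paths and sample text are assumptions, not part of the diff): tokenize yields tokens, and tokenize_review lifts that over a TextReview:

    # Illustrative: PlainTokenizer splits on whitespace, as defined later in this commit.
    from unimore_bda_6.tokenizer import PlainTokenizer
    from unimore_bda_6.database.datatypes import TextReview

    tokenizer = PlainTokenizer()
    print(list(tokenizer.tokenize("this worked fine")))  # ['this', 'worked', 'fine']
    tokenized = tokenizer.tokenize_review(TextReview(rating=4.0, text="this worked fine"))
    print(tokenized.rating, tokenized.tokens)  # 4.0 ['this', 'worked', 'fine']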
@@ -1,10 +1,15 @@
 import abc
 import tokenizers
+import typing as t

 from .base import BaseTokenizer


 class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
+"""
+Abstract tokenizer to implement any tokenizer based on HuggingFace `tokenizers.Tokenizer`.
+"""
+
 def __init__(self):
 super().__init__()
 self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()
@@ -12,11 +17,15 @@ class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
 def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
 raise NotImplementedError()

-def tokenize_plain(self, text: str) -> str:
-return " ".join(self.hug.encode(text).tokens)
+def tokenize(self, text: str) -> t.Iterator[str]:
+return self.hug.encode(text).tokens


 class HuggingBertTokenizer(HuggingTokenizer):
+"""
+Tokenizer based on the `bert-base-cased <https://huggingface.co/bert-base-cased>`_ tokenizer.
+"""
+
 def _build_hugging_tokenizer(self):
 return tokenizers.Tokenizer.from_pretrained("bert-base-cased")
@@ -1,17 +1,14 @@
-import tensorflow
+import typing as t

 from .base import BaseTokenizer


 class LowercaseTokenizer(BaseTokenizer):
 """
-Tokenizer which converts the words to lowercase before splitting them via spaces.
+Tokenizer which converts the words to lowercase before splitting them with `str.split`.
 """

-def tokenize_plain(self, text: str) -> str:
+def tokenize(self, text: str) -> t.Iterator[str]:
 text = text.lower()
-return text
-
-def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
-text = tensorflow.strings.lower(text)
-return text
+tokens = text.split()
+return tokens
@@ -9,7 +9,7 @@ class NLTKWordTokenizer(BaseTokenizer):
 Tokenizer based on `nltk.word_tokenize`.
 """

-def tokenize_plain(self, text: str) -> str:
+def tokenize(self, text: str) -> str:
 tokens = nltk.word_tokenize(text)
 nltk.sentiment.util.mark_negation(tokens, shallow=True)
 return " ".join(tokens)
@@ -1,15 +1,13 @@
-import tensorflow
+import typing as t

 from .base import BaseTokenizer


 class PlainTokenizer(BaseTokenizer):
 """
-Tokenizer which just splits the text into tokens by separating them at whitespaces.
+Tokenizer which just splits the text into tokens by separating them at whitespaces with `str.split`.
 """

-def tokenize_plain(self, text: str) -> str:
-return text
-
-def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
-return text
+def tokenize(self, text: str) -> t.Iterator[str]:
+tokens = text.split()
+return tokens
@@ -1,4 +1,3 @@
-import tensorflow
 import re
 import html.entities
 import typing as t
@@ -11,7 +10,7 @@ class PottsTokenizer(BaseTokenizer):
 """
 Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.

-This module is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
+This class is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
 """

 # noinspection RegExpRepeatedSpace
@@ -76,7 +75,7 @@ class PottsTokenizer(BaseTokenizer):
 amp = "&"

 @classmethod
-def __html2string(cls, s: str) -> str:
+def html_entities_to_chr(cls, s: str) -> str:
 """
 Internal metod that seeks to replace all the HTML entities in s with their corresponding characters.
 """
@@ -102,24 +101,41 @@ class PottsTokenizer(BaseTokenizer):
 s = s.replace(cls.amp, " and ")
 return s

-def tokenize_plain(self, text: str) -> str:
+@classmethod
+def lower_but_preserve_emoticons(cls, word):
+"""
+Internal method which lowercases the word if it does not match `.emoticon_re`.
+"""
+if cls.emoticon_re.search(word):
+return word
+else:
+return word.lower()
+
+def tokenize(self, text: str) -> t.Iterator[str]:
 # Fix HTML character entitites
-s = self.__html2string(text)
+text = self.html_entities_to_chr(text)
 # Tokenize
-words = self.words_re.findall(s)
+tokens = self.words_re.findall(text)
 # Possible alter the case, but avoid changing emoticons like :D into :d:
-words = list(map(lambda x: x if self.emoticon_re.search(x) else x.lower(), words))
+tokens = map(self.lower_but_preserve_emoticons, tokens)
-# Re-join words
-result = " ".join(words)
 # Return the result
-return result
+return tokens


 class PottsTokenizerWithNegation(PottsTokenizer):
-def tokenize_plain(self, text: str) -> str:
-words = super().tokenize_plain(text).split()
+"""
+Version of `.PottsTokenizer` which after tokenizing applies `nltk.sentiment.util.mark_negation`.
+"""
+
+def tokenize(self, text: str) -> str:
+# Apply the base tokenization
+words = super().tokenize(text)
+# Convert to a list (sigh) the iterator
+words = list(words)
+# Use nltk to mark negation
 nltk.sentiment.util.mark_negation(words, shallow=True)
-return " ".join(words)
+# Return the result
+return words


 __all__ = (