
CODE IS DONE

This commit is contained in:
Steffo 2023-02-12 05:11:58 +01:00
parent ae2cf563e6
commit 4e8aa68db3
Signed by: steffo
GPG key ID: 2A24051445686895
17 changed files with 331 additions and 304 deletions

View file

@ -5,14 +5,14 @@
<option name="PARENT_ENVS" value="true" /> <option name="PARENT_ENVS" value="true" />
<envs> <envs>
<env name="CONFIRM_OVERWRITE" value="False" /> <env name="CONFIRM_OVERWRITE" value="False" />
<env name="EVALUATION_SET_SIZE" value="100" /> <env name="EVALUATION_SET_SIZE" value="4000" />
<env name="NLTK_DATA" value="./data/nltk" /> <env name="NLTK_DATA" value="./data/nltk" />
<env name="PYTHONUNBUFFERED" value="1" /> <env name="PYTHONUNBUFFERED" value="1" />
<env name="TENSORFLOW_EMBEDDING_SIZE" value="64" /> <env name="TENSORFLOW_EMBEDDING_SIZE" value="64" />
<env name="TENSORFLOW_MAX_FEATURES" value="1000000" /> <env name="TENSORFLOW_MAX_FEATURES" value="1000000" />
<env name="TF_CPP_MIN_LOG_LEVEL" value="2" /> <env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
<env name="TRAINING_SET_SIZE" value="2000" /> <env name="TRAINING_SET_SIZE" value="4000" />
<env name="VALIDATION_SET_SIZE" value="25" /> <env name="VALIDATION_SET_SIZE" value="100" />
<env name="WORKING_SET_SIZE" value="100000" /> <env name="WORKING_SET_SIZE" value="100000" />
<env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" /> <env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
</envs> </envs>
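A hedged sketch of how run-configuration variables like these could be read on the Python side; the helper name and defaults below are illustrative, not the project's actual config module:

import os

def env_int(name: str, default: int) -> int:
    # Hypothetical helper: read an integer setting from the environment, falling back to a default.
    raw = os.environ.get(name)
    return int(raw) if raw is not None else default

TRAINING_SET_SIZE = env_int("TRAINING_SET_SIZE", 4000)
VALIDATION_SET_SIZE = env_int("VALIDATION_SET_SIZE", 100)
EVALUATION_SET_SIZE = env_int("EVALUATION_SET_SIZE", 4000)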

View file

@ -6,7 +6,7 @@ install_general_log_handlers()
from .config import config from .config import config
from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer, ThreeCheat
from .analysis.base import TrainingFailedError from .analysis.base import TrainingFailedError
from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
from .gathering import Caches from .gathering import Caches
@ -32,25 +32,29 @@ def main():
reviews = reviews_collection(db) reviews = reviews_collection(db)
for sample_func in [sample_reviews_varied, sample_reviews_polar]: for sample_func in [
sample_reviews_polar,
sample_reviews_varied,
]:
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}") slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
slog.debug("Selected sample_func: %s", sample_func.__name__) slog.debug("Selected sample_func: %s", sample_func.__name__)
for SentimentAnalyzer in [ for SentimentAnalyzer in [
# ThreeCheat,
TensorflowPolarSentimentAnalyzer, TensorflowPolarSentimentAnalyzer,
TensorflowCategorySentimentAnalyzer, TensorflowCategorySentimentAnalyzer,
# NLTKSentimentAnalyzer, NLTKSentimentAnalyzer,
]: ]:
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}") slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__) slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
for Tokenizer in [ for Tokenizer in [
PottsTokenizerWithNegation,
PottsTokenizer, PottsTokenizer,
HuggingBertTokenizer,
PlainTokenizer, PlainTokenizer,
HuggingBertTokenizer,
PottsTokenizerWithNegation,
LowercaseTokenizer, LowercaseTokenizer,
NLTKWordTokenizer, NLTKWordTokenizer,
]: ]:
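The three nested loops above walk the full (sampler, analyzer, tokenizer) grid; an equivalent sketch of the same grid using itertools.product, assuming the imports at the top of this module, would be:

import itertools
import logging

from .database import sample_reviews_polar, sample_reviews_varied
from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer

# Sketch only: the real main() keeps the three explicit for loops.
for sample_func, SentimentAnalyzer, Tokenizer in itertools.product(
    [sample_reviews_polar, sample_reviews_varied],
    [TensorflowPolarSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, NLTKSentimentAnalyzer],
    [PottsTokenizer, PlainTokenizer, HuggingBertTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer, NLTKWordTokenizer],
):
    slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
    slog.debug("Selected combination")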

View file

@ -5,3 +5,4 @@ This module contains all implemented types of sentiment analyzers.
from .base import * from .base import *
from .nltk_sentiment import * from .nltk_sentiment import *
from .tf_text import * from .tf_text import *
from .cheating import *

View file

@ -4,7 +4,7 @@ import abc
import logging import logging
import dataclasses import dataclasses
from ..database import Text, Category, CachedDatasetFunc from ..database import CachedDatasetFunc, TextReview, TokenizedReview
from ..tokenizer import BaseTokenizer from ..tokenizer import BaseTokenizer
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -15,12 +15,11 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
Abstract base class for sentiment analyzers implemented in this project. Abstract base class for sentiment analyzers implemented in this project.
""" """
# noinspection PyUnusedLocal
def __init__(self, *, tokenizer: BaseTokenizer): def __init__(self, *, tokenizer: BaseTokenizer):
pass self.tokenizer: BaseTokenizer = tokenizer
def __repr__(self): def __repr__(self):
return f"<{self.__class__.__qualname__}>" return f"<{self.__class__.__qualname__} with {self.tokenizer} tokenizer>"
@abc.abstractmethod @abc.abstractmethod
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None: def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
@ -30,34 +29,34 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
raise NotImplementedError() raise NotImplementedError()
@abc.abstractmethod @abc.abstractmethod
def use(self, text: Text) -> Category: def use(self, text: str) -> float:
""" """
Run the model on the given input. Run the model on the given input, and return the predicted rating.
""" """
raise NotImplementedError() raise NotImplementedError()
def evaluate(self, evaluation_dataset_func: CachedDatasetFunc) -> EvaluationResults: def evaluate(self, evaluation_dataset_func: CachedDatasetFunc) -> EvaluationResults:
""" """
Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category. Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
Returns a tuple with the number of correct results and the number of evaluated results.
""" """
evaluated: int = 0 evaluated: int = 0
correct: int = 0
score: float = 0.0 perfect: int = 0
squared_error: float = 0.0
for review in evaluation_dataset_func(): for review in evaluation_dataset_func():
resulting_category = self.use(review.text) resulting_category = self.use(review.text)
log.debug("Evaluation step: expected %d, received %d, review was %s", review.category, resulting_category, review.text[:80]) log.debug("Evaluation step: %d for %s", resulting_category, review)
evaluated += 1 evaluated += 1
try: try:
correct += 1 if resulting_category == review.category else 0 perfect += 1 if resulting_category == review.rating else 0
score += 1 - (abs(resulting_category - review.category) / 4) squared_error += (resulting_category - review.rating) ** 2
except ValueError: except ValueError:
log.warning("Model execution on %s resulted in a NaN value: %s", review, resulting_category) log.warning("Model execution on %s resulted in a NaN value: %s", review, resulting_category)
return EvaluationResults(correct=correct, evaluated=evaluated, score=score) return EvaluationResults(perfect=perfect, evaluated=evaluated, mse=squared_error / evaluated)
@dataclasses.dataclass @dataclasses.dataclass
@ -66,15 +65,26 @@ class EvaluationResults:
Container for the results of a dataset evaluation. Container for the results of a dataset evaluation.
""" """
correct: int
evaluated: int evaluated: int
score: float """
The number of reviews that were evaluated.
"""
perfect: int
"""
The number of reviews for which the model returned the correct rating.
"""
mse: float
"""
Mean squared error
"""
def __repr__(self): def __repr__(self):
return f"<EvaluationResults: {self!s}>" return f"<EvaluationResults: {self!s}>"
def __str__(self): def __str__(self):
return f"{self.evaluated} evaluated, {self.correct} correct, {self.correct / self.evaluated:.2%} accuracy, {self.score:.2f} score, {self.score / self.evaluated:.2%} scoreaccuracy" return f"Evaluation results:\t{self.evaluated}\tevaluated\t{self.perfect}\tperfect\t{self.perfect / self.evaluated:.2%}\taccuracy\t{self.mse / self.evaluated:.2%}\tmean squared error"
class AlreadyTrainedError(Exception): class AlreadyTrainedError(Exception):
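For clarity, a minimal standalone sketch of the bookkeeping the new evaluate() performs (exact-match accuracy plus mean squared error), assuming predicted and expected ratings are plain floats:

def evaluate_pairs(pairs: list[tuple[float, float]]) -> tuple[int, int, float]:
    """Return (evaluated, perfect, mse) for (predicted, expected) rating pairs."""
    evaluated = 0
    perfect = 0
    squared_error = 0.0
    for predicted, expected in pairs:
        evaluated += 1
        perfect += 1 if predicted == expected else 0
        squared_error += (predicted - expected) ** 2
    return evaluated, perfect, squared_error / evaluated

# Example: two exact hits and one miss by two stars.
print(evaluate_pairs([(5.0, 5.0), (1.0, 1.0), (3.0, 5.0)]))  # (3, 2, 1.333...)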

View file

@ -0,0 +1,21 @@
from .base import BaseSentimentAnalyzer
from ..database.cache import CachedDatasetFunc
class ThreeCheat(BaseSentimentAnalyzer):
"""
A sentiment analyzer that always predicts a 3.0* rating.
Why? To test the scoring!
"""
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
pass
def use(self, text: str) -> float:
return 3.0
__all__ = (
"ThreeCheat",
)
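As a sanity check on what ThreeCheat should score: on a balanced sample with equal numbers of 1- to 5-star reviews, always answering 3.0 gives 20% accuracy and a mean squared error of 2.0.

ratings = [1.0, 2.0, 3.0, 4.0, 5.0]
perfect = sum(1 for rating in ratings if rating == 3.0)
mse = sum((3.0 - rating) ** 2 for rating in ratings) / len(ratings)
print(perfect / len(ratings), mse)  # 0.2 2.0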

View file

@ -6,7 +6,7 @@ import logging
import typing as t import typing as t
import itertools import itertools
from ..database import Text, Category, Review, CachedDatasetFunc from ..database import TextReview, CachedDatasetFunc, TokenizedReview
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
from ..log import count_passage from ..log import count_passage
from ..tokenizer import BaseTokenizer from ..tokenizer import BaseTokenizer
@ -23,31 +23,17 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
""" """
def __init__(self, *, tokenizer: BaseTokenizer) -> None: def __init__(self, *, tokenizer: BaseTokenizer) -> None:
if not tokenizer.supports_plain():
raise TypeError("Tokenizer does not support NLTK")
super().__init__(tokenizer=tokenizer) super().__init__(tokenizer=tokenizer)
self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer() self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
self.trained: bool = False self.trained: bool = False
self.tokenizer: BaseTokenizer = tokenizer
def __repr__(self): def _add_feature_unigrams(self, dataset: t.Iterator[TokenizedReview]) -> None:
return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
"""
Convert the `Text` of a `DataTuple` to a `TokenBag`.
"""
count_passage(log, "tokenize_datatuple", 100)
return self.tokenizer.tokenize_and_split_plain(datatuple.text), datatuple.category
def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
""" """
Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model. Register the `nltk.sentiment.util.extract_unigram_feats` feature extractor on the model.
""" """
# Ignore the category and only access the tokens # Ignore the category and only access the tokens
tokenbags = map(lambda d: d[0], dataset) tokenbags = map(lambda r: r.tokens, dataset)
# Get all words in the documents # Get all words in the documents
all_words = self.model.all_words(tokenbags, labeled=False) all_words = self.model.all_words(tokenbags, labeled=False)
# Create unigram `contains(*)` features from the previously gathered words # Create unigram `contains(*)` features from the previously gathered words
@ -55,59 +41,48 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
# Add the feature extractor to the model # Add the feature extractor to the model
self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams) self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
def _add_feature_extractors(self, dataset: t.Iterator[tuple[TokenBag, Category]]): def _add_feature_extractors(self, dataset: t.Iterator[TextReview]):
""" """
Register new feature extractors on the `.model`. Register new feature extractors on the `.model`.
""" """
# Tokenize the reviews
dataset: t.Iterator[TokenizedReview] = map(self.tokenizer.tokenize_review, dataset)
# Add the unigrams feature # Add the unigrams feature
self._add_feature_unigrams(dataset) self._add_feature_unigrams(dataset)
def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]: def __extract_features(self, review: TextReview) -> tuple[Features, float]:
""" """
Convert a (TokenBag, Category) tuple to a (Features, Category) tuple. Convert a `TextReview` into a (Features, rating) tuple.
Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators. Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
""" """
count_passage(log, "extract_features", 100) review: TokenizedReview = self.tokenizer.tokenize_review(review)
return self.model.extract_features(data[0]), data[1] return self.model.extract_features(review.tokens), review.rating
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None: def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
# Forbid retraining the model # Forbid retraining the model
if self.trained: if self.trained:
raise AlreadyTrainedError() raise AlreadyTrainedError()
# Get a generator
dataset: t.Generator[Review] = training_dataset_func()
# Tokenize the dataset
dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, dataset)
# Cleanly duplicate the dataset iterator
# Reduce average memory footprint, but not maximum
dataset_1, dataset_2 = itertools.tee(dataset, 2)
dataset_1: t.Iterator[tuple[TokenBag, Category]]
dataset_2: t.Iterator[tuple[TokenBag, Category]]
# Add the feature extractors to the model # Add the feature extractors to the model
self._add_feature_extractors(dataset_1) self._add_feature_extractors(training_dataset_func())
del dataset_1 # Delete exausted iterator
# Extract features from the dataset # Extract features from the dataset
dataset_2: t.Iterator[tuple[Features, Category]] = map(self.__extract_features, dataset_2) featureset: t.Iterator[tuple[Features, float]] = map(self.__extract_features, training_dataset_func())
# Train the classifier with the extracted features and category # Train the classifier with the extracted features and category
self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2) self.model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
# Toggle the trained flag # Toggle the trained flag
self.trained = True self.trained = True
def use(self, text: Text) -> Category: def use(self, text: str) -> float:
# Require the model to be trained # Require the model to be trained
if not self.trained: if not self.trained:
raise NotTrainedError() raise NotTrainedError()
# Tokenize the input # Tokenize the input
tokens = self.tokenizer.tokenize_and_split_plain(text) tokens = self.tokenizer.tokenize(text)
# Run the classification method # Run the classification method
return self.model.classify(instance=tokens) return self.model.classify(instance=tokens)
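For context, a hedged end-to-end sketch of the nltk.sentiment.SentimentAnalyzer flow the class above wraps (unigram contains(...) features plus a Naive Bayes classifier), with a toy dataset standing in for the cached reviews:

import nltk
import nltk.sentiment
import nltk.sentiment.util
import nltk.classify

# Toy (tokens, rating) pairs standing in for tokenized reviews.
training = [
    (["great", "book", "loved", "it"], 5.0),
    (["terrible", "waste", "of", "time"], 1.0),
]

model = nltk.sentiment.SentimentAnalyzer()
# Gather all words, build contains(word) unigram features, register the extractor.
all_words = model.all_words([tokens for tokens, _ in training], labeled=False)
unigrams = model.unigram_word_feats(all_words)
model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
# Extract features for each review and train a Naive Bayes classifier on them.
featureset = [(model.extract_features(tokens), rating) for tokens, rating in training]
model.classifier = nltk.classify.NaiveBayesClassifier.train(featureset)
# Classify new tokenized text.
print(model.classify(instance=["loved", "this", "great", "book"]))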

View file

@ -5,7 +5,7 @@ import numpy
import tensorflow import tensorflow
import logging import logging
from ..database import Text, Category, CachedDatasetFunc, Review from ..database import CachedDatasetFunc, TextReview, TokenizedReview
from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS from ..config import TENSORFLOW_EMBEDDING_SIZE, TENSORFLOW_MAX_FEATURES, TENSORFLOW_EPOCHS
from ..tokenizer import BaseTokenizer from ..tokenizer import BaseTokenizer
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError, TrainingFailedError
@ -19,31 +19,7 @@ else:
log.debug("Tensorflow successfully found GPU acceleration!") log.debug("Tensorflow successfully found GPU acceleration!")
ConversionFunc = t.Callable[[Review], tensorflow.Tensor | tuple] ConversionFunc = t.Callable[[TextReview], tensorflow.Tensor | tuple]
def build_dataset(dataset_func: CachedDatasetFunc, conversion_func: ConversionFunc, output_signature: tensorflow.TensorSpec | tuple) -> tensorflow.data.Dataset:
"""
Convert a `CachedDatasetFunc` to a `tensorflow.data.Dataset`.
"""
def dataset_generator():
for review in dataset_func():
yield conversion_func(review)
log.debug("Creating dataset...")
dataset = tensorflow.data.Dataset.from_generator(
dataset_generator,
output_signature=output_signature,
)
log.debug("Caching dataset...")
dataset = dataset.cache()
log.debug("Configuring dataset prefetch...")
dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)
return dataset
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta): class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
@ -52,31 +28,15 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
""" """
def __init__(self, *, tokenizer: BaseTokenizer): def __init__(self, *, tokenizer: BaseTokenizer):
if not tokenizer.supports_tensorflow():
raise TypeError("Tokenizer does not support Tensorflow")
super().__init__(tokenizer=tokenizer) super().__init__(tokenizer=tokenizer)
self.trained: bool = False self.trained: bool = False
self.failed: bool = False self.failed: bool = False
self.tokenizer: BaseTokenizer = tokenizer self.string_lookup_layer: tensorflow.keras.layers.StringLookup = tensorflow.keras.layers.StringLookup(max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__)
self.text_vectorization_layer: tensorflow.keras.layers.TextVectorization = self._build_text_vectorization_layer()
self.model: tensorflow.keras.Sequential = self._build_model() self.model: tensorflow.keras.Sequential = self._build_model()
self.history: tensorflow.keras.callbacks.History | None = None self.history: tensorflow.keras.callbacks.History | None = None
def _build_text_vectorization_layer(self) -> tensorflow.keras.layers.TextVectorization:
"""
Create a `tensorflow`-compatible `TextVectorization` layer.
"""
log.debug("Creating TextVectorization layer...")
layer = tensorflow.keras.layers.TextVectorization(
standardize=self.tokenizer.tokenize_tensorflow_and_expand_dims,
max_tokens=TENSORFLOW_MAX_FEATURES.__wrapped__
)
log.debug("Created TextVectorization layer: %s", layer)
return layer
@abc.abstractmethod @abc.abstractmethod
def _build_model(self) -> tensorflow.keras.Sequential: def _build_model(self) -> tensorflow.keras.Sequential:
""" """
@ -84,33 +44,44 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
""" """
raise NotImplementedError() raise NotImplementedError()
@abc.abstractmethod
def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset: def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
""" """
Create a `tensorflow.data.Dataset` from the given `CachedDatasetFunc`. Create a `tensorflow.data.Dataset` from the given `CachedDatasetFunc`.
""" """
raise NotImplementedError()
def dataset_generator():
for review in dataset_func():
review: TextReview
review: TokenizedReview = self.tokenizer.tokenize_review(review)
tokens: tensorflow.Tensor = self._tokens_to_tensor(review.tokens)
rating: tensorflow.Tensor = self._rating_to_input(review.rating)
yield tokens, rating
log.debug("Creating dataset...")
dataset = tensorflow.data.Dataset.from_generator(
dataset_generator,
output_signature=(
tensorflow.TensorSpec(shape=(1, None,), dtype=tensorflow.string, name="tokens"),
self._ratingtensor_shape(),
),
)
log.debug("Caching dataset...")
dataset = dataset.cache()
log.debug("Configuring dataset prefetch...")
dataset = dataset.prefetch(buffer_size=tensorflow.data.AUTOTUNE)
return dataset
def _adapt_textvectorization(self, dataset: tensorflow.data.Dataset) -> None: def _adapt_textvectorization(self, dataset: tensorflow.data.Dataset) -> None:
""" """
Adapt the `.text_vectorization_layer` to the given dataset. Adapt the `.text_vectorization_layer` to the given dataset.
""" """
log.debug("Preparing dataset to adapt %s...", self.text_vectorization_layer) log.debug("Preparing dataset to adapt %s...", self.string_lookup_layer)
dataset = dataset.map(lambda text, category: text) dataset = dataset.map(lambda text, category: text)
log.debug("Adapting %s...", self.text_vectorization_layer) log.debug("Adapting %s...", self.string_lookup_layer)
self.text_vectorization_layer.adapt(dataset) self.string_lookup_layer.adapt(dataset)
def _vectorize_dataset(self, dataset: tensorflow.data.Dataset) -> tensorflow.data.Dataset:
"""
Apply the `.text_vectorization_layer` to the text in the dataset.
"""
def vectorize_entry(text, category):
return self.text_vectorization_layer(text), category
log.debug("Vectorizing dataset: %s", dataset)
dataset = dataset.map(vectorize_entry)
log.debug("Vectorized dataset: %s", dataset)
return dataset
def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None: def train(self, training_dataset_func: CachedDatasetFunc, validation_dataset_func: CachedDatasetFunc) -> None:
if self.failed: if self.failed:
@ -120,13 +91,17 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
log.error("Tried to train an already trained model.") log.error("Tried to train an already trained model.")
raise AlreadyTrainedError("Cannot re-train an already trained model.") raise AlreadyTrainedError("Cannot re-train an already trained model.")
log.debug("Building training dataset...")
training_set = self._build_dataset(training_dataset_func) training_set = self._build_dataset(training_dataset_func)
log.debug("Building validation dataset...")
validation_set = self._build_dataset(validation_dataset_func) validation_set = self._build_dataset(validation_dataset_func)
self._adapt_textvectorization(training_set) log.debug("Building vocabulary...")
vocabulary = training_set.map(lambda tokens, rating: tokens)
training_set = self._vectorize_dataset(training_set) log.debug("Adapting lookup layer to the vocabulary...")
validation_set = self._vectorize_dataset(validation_set) self.string_lookup_layer.adapt(vocabulary)
log.info("Training: %s", self.model) log.info("Training: %s", self.model)
self.history: tensorflow.keras.callbacks.History | None = self.model.fit( self.history: tensorflow.keras.callbacks.History | None = self.model.fit(
@ -146,25 +121,50 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer, metaclass=abc.ABCMeta):
log.info("Model %s training succeeded!", self.model) log.info("Model %s training succeeded!", self.model)
self.trained = True self.trained = True
@abc.abstractmethod @staticmethod
def _translate_prediction(self, a: numpy.array) -> Category: def _tokens_to_tensor(tokens: t.Iterator[str]) -> tensorflow.Tensor:
""" """
Convert the results of `tensorflow.keras.Sequential.predict` into a `.Category`. Convert an iterator of tokens to a `tensorflow.Tensor`.
"""
tensor = tensorflow.convert_to_tensor(
[list(tokens)],
dtype=tensorflow.string,
name="tokens"
)
return tensor
def use(self, text: str) -> float:
if self.failed:
raise NotTrainedError("Cannot use a failed model.")
if not self.trained:
raise NotTrainedError("Cannot use a non-trained model.")
tokens = self.tokenizer.tokenize(text)
tokens = self._tokens_to_tensor(tokens)
prediction = self.model.predict(tokens, verbose=False)
prediction = self._prediction_to_rating(prediction)
return prediction
@abc.abstractmethod
def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
"""
Convert a review rating to a `tensorflow.Tensor`.
""" """
raise NotImplementedError() raise NotImplementedError()
def use(self, text: Text) -> Category: @abc.abstractmethod
if self.failed: def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
log.error("Tried to use a failed model.") """
raise NotTrainedError("Cannot use a failed model.") Returns the shape of the tensor produced by `._rating_to_input` and accepted as input by `._prediction_to_rating`.
if not self.trained: """
log.error("Tried to use a non-trained model.") raise NotImplementedError()
raise NotTrainedError("Cannot use a non-trained model.")
vector = self.text_vectorization_layer(text) @abc.abstractmethod
prediction = self.model.predict(vector, verbose=False) def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
"""
return self._translate_prediction(prediction) Convert the results of `tensorflow.keras.Sequential.predict` into a review rating.
"""
raise NotImplementedError()
class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer): class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
@ -172,19 +172,10 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
A `tensorflow`-based sentiment analyzer that considers each star rating as a separate category. A `tensorflow`-based sentiment analyzer that considers each star rating as a separate category.
""" """
def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
return build_dataset(
dataset_func=dataset_func,
conversion_func=Review.to_tensor_tuple_category,
output_signature=(
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
tensorflow.TensorSpec(shape=(1, 5,), dtype=tensorflow.float32, name="category_one_hot"),
),
)
def _build_model(self) -> tensorflow.keras.Sequential: def _build_model(self) -> tensorflow.keras.Sequential:
log.debug("Creating sequential categorizer model...") log.debug("Creating sequential categorizer model...")
model = tensorflow.keras.Sequential([ model = tensorflow.keras.Sequential([
self.string_lookup_layer,
tensorflow.keras.layers.Embedding( tensorflow.keras.layers.Embedding(
input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1, input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__, output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
@ -209,15 +200,35 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
log.debug("Compiled model: %s", model) log.debug("Compiled model: %s", model)
return model return model
def _translate_prediction(self, a: numpy.array) -> Category: def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
max_i = None tensor = tensorflow.convert_to_tensor(
max_p = None [[
for i, p in enumerate(iter(a[0])): 1.0 if rating == 1.0 else 0.0,
if max_p is None or p > max_p: 1.0 if rating == 2.0 else 0.0,
max_i = i 1.0 if rating == 3.0 else 0.0,
max_p = p 1.0 if rating == 4.0 else 0.0,
result = float(max_i) + 1.0 1.0 if rating == 5.0 else 0.0,
return float(round(result)) ]],
dtype=tensorflow.float32,
name="rating_one_hot"
)
return tensor
def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
spec = tensorflow.TensorSpec(shape=(1, 5), dtype=tensorflow.float32, name="rating_one_hot")
return spec
def _prediction_to_rating(self, prediction: tensorflow.Tensor) -> float:
best_prediction = None
best_prediction_index = None
for index, prediction in enumerate(iter(prediction[0])):
if best_prediction is None or prediction > best_prediction:
best_prediction = prediction
best_prediction_index = index
result = float(best_prediction_index) + 1.0
return result
class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer): class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
@ -225,19 +236,10 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
A `tensorflow`-based sentiment analyzer that uses the floating point value rating to get as close as possible to the correct category. A `tensorflow`-based sentiment analyzer that uses the floating point value rating to get as close as possible to the correct category.
""" """
def _build_dataset(self, dataset_func: CachedDatasetFunc) -> tensorflow.data.Dataset:
return build_dataset(
dataset_func=dataset_func,
conversion_func=Review.to_tensor_tuple_normvalue,
output_signature=(
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="category"),
),
)
def _build_model(self) -> tensorflow.keras.Sequential: def _build_model(self) -> tensorflow.keras.Sequential:
log.debug("Creating sequential categorizer model...") log.debug("Creating sequential categorizer model...")
model = tensorflow.keras.Sequential([ model = tensorflow.keras.Sequential([
self.string_lookup_layer,
tensorflow.keras.layers.Embedding( tensorflow.keras.layers.Embedding(
input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1, input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__, output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
@ -245,7 +247,9 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
tensorflow.keras.layers.Dropout(0.25), tensorflow.keras.layers.Dropout(0.25),
tensorflow.keras.layers.GlobalAveragePooling1D(), tensorflow.keras.layers.GlobalAveragePooling1D(),
tensorflow.keras.layers.Dropout(0.25), tensorflow.keras.layers.Dropout(0.25),
tensorflow.keras.layers.Dense(1, activation="sigmoid"), tensorflow.keras.layers.Dense(8),
tensorflow.keras.layers.Dropout(0.25),
tensorflow.keras.layers.Dense(1, activation=tensorflow.keras.activations.sigmoid),
]) ])
log.debug("Compiling model: %s", model) log.debug("Compiling model: %s", model)
@ -257,11 +261,23 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
log.debug("Compiled model: %s", model) log.debug("Compiled model: %s", model)
return model return model
def _translate_prediction(self, a: numpy.array) -> Category: def _rating_to_input(self, rating: float) -> tensorflow.Tensor:
a: float = a[0, 0] normalized_rating = (rating - 1) / 4
a = a * 2 + 1 tensor = tensorflow.convert_to_tensor(
a = float(round(a)) [normalized_rating],
return a dtype=tensorflow.float32,
name="rating_value"
)
return tensor
def _ratingtensor_shape(self) -> tensorflow.TensorSpec:
spec = tensorflow.TensorSpec(shape=(1,), dtype=tensorflow.float32, name="rating_value")
return spec
def _prediction_to_rating(self, prediction: numpy.array) -> float:
rating: float = prediction[0, 0]
rating = 1.0 if rating < 0.5 else 5.0
return rating
__all__ = ( __all__ = (
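Pulling the pieces of this file together, a hedged, condensed sketch of the new pipeline: token/one-hot pairs produced by a generator, a StringLookup vocabulary adapted from the token tensors, and a small categoriser whose argmax is mapped back to a star rating. The toy data and layer sizes are illustrative, not the project's configured values.

import tensorflow as tf

MAX_FEATURES = 1000      # stand-in for TENSORFLOW_MAX_FEATURES
EMBEDDING_SIZE = 64      # stand-in for TENSORFLOW_EMBEDDING_SIZE

def dataset_generator():
    # Each element: a (1, None) string tensor of tokens and a (1, 5) one-hot rating.
    toy_samples = [(["great", "book"], 5.0), (["terrible", "waste"], 1.0)]
    for tokens, rating in toy_samples:
        one_hot = [[1.0 if rating == float(star) else 0.0 for star in range(1, 6)]]
        yield (
            tf.convert_to_tensor([tokens], dtype=tf.string, name="tokens"),
            tf.convert_to_tensor(one_hot, dtype=tf.float32, name="rating_one_hot"),
        )

dataset = tf.data.Dataset.from_generator(
    dataset_generator,
    output_signature=(
        tf.TensorSpec(shape=(1, None), dtype=tf.string, name="tokens"),
        tf.TensorSpec(shape=(1, 5), dtype=tf.float32, name="rating_one_hot"),
    ),
).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

# Build the vocabulary from the token tensors only, as the new train() does.
string_lookup = tf.keras.layers.StringLookup(max_tokens=MAX_FEATURES)
string_lookup.adapt(dataset.map(lambda tokens, rating: tokens))

model = tf.keras.Sequential([
    string_lookup,
    tf.keras.layers.Embedding(input_dim=MAX_FEATURES + 1, output_dim=EMBEDDING_SIZE),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(5, activation="softmax"),
])
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(dataset, epochs=1, verbose=0)

# A softmax prediction is turned back into a star rating by taking argmax + 1.
prediction = model.predict(tf.convert_to_tensor([["great", "book"]], dtype=tf.string), verbose=0)
print(float(prediction[0].argmax()) + 1.0)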

View file

@ -4,15 +4,15 @@ import shutil
import pathlib import pathlib
import pickle import pickle
from .datatypes import Review from .datatypes import TextReview
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
CachedDatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]] CachedDatasetFunc = t.Callable[[], t.Generator[TextReview, t.Any, None]]
def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None: def store_cache(reviews: t.Iterator[TextReview], path: str | pathlib.Path) -> None:
""" """
Store the contents of the given `Review` iterator to different files in a directory at the given path. Store the contents of the given `Review` iterator to different files in a directory at the given path.
""" """
@ -54,7 +54,7 @@ def load_cache(path: str | pathlib.Path) -> CachedDatasetFunc:
log.debug("Loading pickle file: %s", document_path) log.debug("Loading pickle file: %s", document_path)
with open(document_path, "rb") as file: with open(document_path, "rb") as file:
result: Review = pickle.load(file) result: TextReview = pickle.load(file)
yield result yield result
return data_cache_loader return data_cache_loader
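A hedged, self-contained sketch of the pickle-per-review cache pattern that store_cache and load_cache implement (file naming is illustrative):

import pathlib
import pickle
import typing as t

def store_cache_sketch(items: t.Iterator[object], path: pathlib.Path) -> None:
    # One pickle file per item, numbered sequentially.
    path.mkdir(parents=True, exist_ok=True)
    for index, item in enumerate(items):
        with open(path / f"{index}.pickle", "wb") as file:
            pickle.dump(item, file)

def load_cache_sketch(path: pathlib.Path) -> t.Callable[[], t.Generator[object, t.Any, None]]:
    # Return a generator function, matching the CachedDatasetFunc signature.
    def loader():
        for document_path in sorted(path.glob("*.pickle")):
            with open(document_path, "rb") as file:
                yield pickle.load(file)
    return loader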

View file

@ -1,75 +1,80 @@
import tensorflow import abc
import typing as t
from .collections import MongoReview from .collections import MongoReview
import logging
log = logging.getLogger(__name__)
Text = str class Review(metaclass=abc.ABCMeta):
Category = float """
Base class for method common to both review types.
"""
def __init__(self, *, rating: float):
self.rating: float = rating
"""
The star rating of the review, from ``1.0`` to ``5.0``.
"""
class Review: class TextReview(Review):
"""
Optimized container for a review with the text still intact.
Uses `__slots__` for better performance.
"""
__slots__ = ( __slots__ = (
"text", "text",
"category", "rating",
) )
def __init__(self, text: Text, category: Category): def __init__(self, *, rating: float, text: str):
super().__init__(rating=rating)
self.text: str = text self.text: str = text
self.category: float = category """
The contents of the review.
"""
@classmethod @classmethod
def from_mongoreview(cls, review: MongoReview): def from_mongoreview(cls, review: MongoReview) -> "TextReview":
"""
Create a new `.Review` object from a `MongoReview` `dict`.
"""
return cls( return cls(
text=review["reviewText"], text=review["reviewText"],
category=review["overall"], rating=review["overall"],
) )
def __repr__(self): def __repr__(self):
return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>" return f"<{self.__class__.__qualname__}: ({self.rating}*) {self.text[:80]}>"
def __getitem__(self, item):
if item == 0 or item == "text":
return self.text
elif item == 1 or item == "category":
return self.category
else:
raise KeyError(item)
def normvalue(self) -> float: class TokenizedReview(Review):
return (self.category - 1) / 2 """
Optimized container for a review with a tokenized text.
def to_tensor_text(self) -> tensorflow.Tensor: Uses `__slots__` for better performance.
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string) """
def to_tensor_normvalue(self) -> tensorflow.Tensor: __slots__ = (
return tensorflow.convert_to_tensor([self.normvalue()], dtype=tensorflow.float32) "tokens",
"rating",
)
def to_tensor_tuple_normvalue(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]: def __init__(self, *, rating: float, tokens: t.Iterator[str]):
return ( super().__init__(rating=rating)
self.to_tensor_text(),
self.to_tensor_normvalue(),
)
def to_tensor_category(self) -> tensorflow.Tensor: self.tokens: list[str] = list(tokens)
return tensorflow.convert_to_tensor([[ """
1.0 if self.category == 1.0 else 0.0, List of all tokens in the review text.
1.0 if self.category == 2.0 else 0.0, """
1.0 if self.category == 3.0 else 0.0,
1.0 if self.category == 4.0 else 0.0,
1.0 if self.category == 5.0 else 0.0,
]], dtype=tensorflow.float32)
def to_tensor_tuple_category(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]: def __repr__(self):
return ( return f"<{self.__class__.__qualname__}: ({self.rating}*) [{len(self.tokens)} tokens]>"
self.to_tensor_text(),
self.to_tensor_category(),
)
__all__ = ( __all__ = (
"Text", "TextReview",
"Category", "TokenizedReview",
"Review",
) )
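One caveat about the __slots__ optimisation mentioned in the docstrings above: it only removes the per-instance __dict__ if every class in the hierarchy declares __slots__, and the Review base class here does not. A minimal illustration of the effect:

class Base:                      # no __slots__ here...
    def __init__(self, *, rating: float):
        self.rating = rating

class WithSlots(Base):
    __slots__ = ("text", "rating")
    def __init__(self, *, rating: float, text: str):
        super().__init__(rating=rating)
        self.text = text

r = WithSlots(rating=5.0, text="great")
print(hasattr(r, "__dict__"))    # True: the base class still provides a per-instance dict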

View file

@ -4,15 +4,15 @@ import typing as t
from ..config import WORKING_SET_SIZE from ..config import WORKING_SET_SIZE
from .collections import MongoReview from .collections import MongoReview
from .datatypes import Review from .datatypes import TextReview
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[Review]] SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[TextReview]]
def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
""" """
Get ``amount`` random reviews from the ``reviews`` collection. Get ``amount`` random reviews from the ``reviews`` collection.
""" """
@ -23,12 +23,12 @@ def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.
{"$sample": {"size": amount}}, {"$sample": {"size": amount}},
]) ])
cursor = map(Review.from_mongoreview, cursor) cursor = map(TextReview.from_mongoreview, cursor)
return cursor return cursor
def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]: def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[TextReview]:
""" """
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection. Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
""" """
@ -43,7 +43,7 @@ def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating:
return cursor return cursor
def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
category_amount = amount // 2 category_amount = amount // 2
log.debug("Getting a sample of %d polar reviews...", category_amount * 2) log.debug("Getting a sample of %d polar reviews...", category_amount * 2)
@ -68,12 +68,12 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
}} }}
]) ])
cursor = map(Review.from_mongoreview, cursor) cursor = map(TextReview.from_mongoreview, cursor)
return cursor return cursor
def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]: def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
category_amount = amount // 5 category_amount = amount // 5
log.debug("Getting a sample of %d varied reviews...", category_amount * 5) log.debug("Getting a sample of %d varied reviews...", category_amount * 5)
@ -123,7 +123,7 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
}} }}
]) ])
cursor = map(Review.from_mongoreview, cursor) cursor = map(TextReview.from_mongoreview, cursor)
return cursor return cursor
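A hedged sketch of the underlying MongoDB aggregation these helpers run through pymongo; the field names match the diff, while the connection URI and database/collection names are placeholders:

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")   # placeholder URI
reviews = client["reviews"]["reviews"]                      # placeholder db/collection names

# Some random reviews with a given star rating, as in sample_reviews_by_rating.
cursor = reviews.aggregate([
    {"$match": {"overall": 5.0}},
    {"$sample": {"size": 10}},
])
for document in cursor:
    print(document["overall"], document["reviewText"][:80])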

View file

@ -5,7 +5,7 @@ import logging
import pymongo import pymongo
from .config import TRAINING_SET_SIZE, VALIDATION_SET_SIZE, EVALUATION_SET_SIZE from .config import TRAINING_SET_SIZE, VALIDATION_SET_SIZE, EVALUATION_SET_SIZE
from .database import SampleFunc, CachedDatasetFunc, mongo_client_from_config, reviews_collection, store_cache, load_cache, delete_cache from .database import SampleFunc, CachedDatasetFunc, store_cache, load_cache, delete_cache
log = logging.getLogger(__name__) log = logging.getLogger(__name__)

View file

@ -1,51 +1,26 @@
import tensorflow import typing as t
import abc
from ..database.datatypes import TextReview, TokenizedReview
class BaseTokenizer: class BaseTokenizer(metaclass=abc.ABCMeta):
""" """
The base for all tokenizers in this project. The base for all tokenizers in this project.
""" """
def __repr__(self): def __repr__(self):
return f"{self.__class__.__qualname__}()" return f"<{self.__class__.__qualname__}>"
@staticmethod @abc.abstractmethod
def __not_implemented(f): def tokenize(self, text: str) -> t.Iterator[str]:
f.__notimplemented__ = True
return f
def supports_plain(self) -> bool:
return not getattr(self.tokenize_plain, "__notimplemented__", False)
def supports_tensorflow(self) -> bool:
return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)
@__not_implemented
def tokenize_plain(self, text: str) -> str:
""" """
Convert a text `str` into another `str` containing a series of whitespace-separated tokens. Convert a text `str` into an iterator of token `str`s.
""" """
raise NotImplementedError() raise NotImplementedError()
def tokenize_and_split_plain(self, text: str) -> list[str]: def tokenize_review(self, review: TextReview) -> TokenizedReview:
""" """
Run `.tokenize_plain`, then split the result using `str.split`. Apply `.tokenize` to the text of a `TextReview`, converting it into a `TokenizedReview`.
""" """
text = self.tokenize_plain(text) tokens = self.tokenize(review.text)
text = text.split() return TokenizedReview(rating=review.rating, tokens=tokens)
return text
@__not_implemented
def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
"""
Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
"""
raise NotImplementedError()
def tokenize_tensorflow_and_expand_dims(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
"""
Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
"""
text = self.tokenize_tensorflow(text)
text = tensorflow.expand_dims(text, -1, name="tokens")
return text
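To make the new contract concrete, a hedged sketch of the smallest possible tokenizer under this interface, using stand-in classes rather than the project's own:

import abc
import typing as t

class ReviewStub:
    # Hypothetical stand-in for TextReview: just a rating and a text.
    def __init__(self, rating: float, text: str):
        self.rating, self.text = rating, text

class BaseTokenizerSketch(metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def tokenize(self, text: str) -> t.Iterator[str]:
        raise NotImplementedError()

    def tokenize_review(self, review: ReviewStub) -> tuple[float, list[str]]:
        # The real method returns a TokenizedReview; a (rating, tokens) tuple stands in here.
        return review.rating, list(self.tokenize(review.text))

class WhitespaceTokenizer(BaseTokenizerSketch):
    def tokenize(self, text: str) -> t.Iterator[str]:
        return iter(text.split())

print(WhitespaceTokenizer().tokenize_review(ReviewStub(5.0, "This was a great book")))
# (5.0, ['This', 'was', 'a', 'great', 'book'])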

View file

@ -1,10 +1,15 @@
import abc import abc
import tokenizers import tokenizers
import typing as t
from .base import BaseTokenizer from .base import BaseTokenizer
class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta): class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
"""
Abstract tokenizer to implement any tokenizer based on HuggingFace `tokenizers.Tokenizer`.
"""
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer() self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()
@ -12,11 +17,15 @@ class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer: def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
raise NotImplementedError() raise NotImplementedError()
def tokenize_plain(self, text: str) -> str: def tokenize(self, text: str) -> t.Iterator[str]:
return " ".join(self.hug.encode(text).tokens) return self.hug.encode(text).tokens
class HuggingBertTokenizer(HuggingTokenizer): class HuggingBertTokenizer(HuggingTokenizer):
"""
Tokenizer based on the `bert-base-cased <https://huggingface.co/bert-base-cased>`_ tokenizer.
"""
def _build_hugging_tokenizer(self): def _build_hugging_tokenizer(self):
return tokenizers.Tokenizer.from_pretrained("bert-base-cased") return tokenizers.Tokenizer.from_pretrained("bert-base-cased")
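A hedged usage sketch of the HuggingFace tokenizers API this class relies on (requires the tokenizers package and network access the first time the pretrained vocabulary is fetched):

import tokenizers

# Downloads the pretrained WordPiece vocabulary for bert-base-cased.
hug = tokenizers.Tokenizer.from_pretrained("bert-base-cased")
encoding = hug.encode("I loved this book!")
print(encoding.tokens)   # e.g. ['[CLS]', 'I', 'loved', 'this', 'book', '!', '[SEP]']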

View file

@ -1,17 +1,14 @@
import tensorflow import typing as t
from .base import BaseTokenizer from .base import BaseTokenizer
class LowercaseTokenizer(BaseTokenizer): class LowercaseTokenizer(BaseTokenizer):
""" """
Tokenizer which converts the words to lowercase before splitting them via spaces. Tokenizer which converts the words to lowercase before splitting them with `str.split`.
""" """
def tokenize_plain(self, text: str) -> str: def tokenize(self, text: str) -> t.Iterator[str]:
text = text.lower() text = text.lower()
return text tokens = text.split()
return tokens
def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
text = tensorflow.strings.lower(text)
return text

View file

@ -9,7 +9,7 @@ class NLTKWordTokenizer(BaseTokenizer):
Tokenizer based on `nltk.word_tokenize`. Tokenizer based on `nltk.word_tokenize`.
""" """
def tokenize_plain(self, text: str) -> str: def tokenize(self, text: str) -> str:
tokens = nltk.word_tokenize(text) tokens = nltk.word_tokenize(text)
nltk.sentiment.util.mark_negation(tokens, shallow=True) nltk.sentiment.util.mark_negation(tokens, shallow=True)
return " ".join(tokens) return " ".join(tokens)

View file

@ -1,15 +1,13 @@
import tensorflow import typing as t
from .base import BaseTokenizer from .base import BaseTokenizer
class PlainTokenizer(BaseTokenizer): class PlainTokenizer(BaseTokenizer):
""" """
Tokenizer which just splits the text into tokens by separating them at whitespaces. Tokenizer which just splits the text into tokens by separating them at whitespaces with `str.split`.
""" """
def tokenize_plain(self, text: str) -> str: def tokenize(self, text: str) -> t.Iterator[str]:
return text tokens = text.split()
return tokens
def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
return text

View file

@ -1,4 +1,3 @@
import tensorflow
import re import re
import html.entities import html.entities
import typing as t import typing as t
@ -11,7 +10,7 @@ class PottsTokenizer(BaseTokenizer):
""" """
Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011. Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.
This module is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ . This class is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
""" """
# noinspection RegExpRepeatedSpace # noinspection RegExpRepeatedSpace
@ -76,7 +75,7 @@ class PottsTokenizer(BaseTokenizer):
amp = "&amp;" amp = "&amp;"
@classmethod @classmethod
def __html2string(cls, s: str) -> str: def html_entities_to_chr(cls, s: str) -> str:
""" """
Internal method that seeks to replace all the HTML entities in s with their corresponding characters. Internal method that seeks to replace all the HTML entities in s with their corresponding characters.
""" """
@ -102,24 +101,41 @@ class PottsTokenizer(BaseTokenizer):
s = s.replace(cls.amp, " and ") s = s.replace(cls.amp, " and ")
return s return s
def tokenize_plain(self, text: str) -> str: @classmethod
def lower_but_preserve_emoticons(cls, word):
"""
Internal method which lowercases the word if it does not match `.emoticon_re`.
"""
if cls.emoticon_re.search(word):
return word
else:
return word.lower()
def tokenize(self, text: str) -> t.Iterator[str]:
# Fix HTML character entities # Fix HTML character entities
s = self.__html2string(text) text = self.html_entities_to_chr(text)
# Tokenize # Tokenize
words = self.words_re.findall(s) tokens = self.words_re.findall(text)
# Possibly alter the case, but avoid changing emoticons like :D into :d # Possibly alter the case, but avoid changing emoticons like :D into :d
words = list(map(lambda x: x if self.emoticon_re.search(x) else x.lower(), words)) tokens = map(self.lower_but_preserve_emoticons, tokens)
# Re-join words
result = " ".join(words)
# Return the result # Return the result
return result return tokens
class PottsTokenizerWithNegation(PottsTokenizer): class PottsTokenizerWithNegation(PottsTokenizer):
def tokenize_plain(self, text: str) -> str: """
words = super().tokenize_plain(text).split() Version of `.PottsTokenizer` which after tokenizing applies `nltk.sentiment.util.mark_negation`.
"""
def tokenize(self, text: str) -> t.Iterator[str]:
# Apply the base tokenization
words = super().tokenize(text)
# Convert the iterator to a list, since mark_negation mutates it in place
words = list(words)
# Use nltk to mark negation
nltk.sentiment.util.mark_negation(words, shallow=True) nltk.sentiment.util.mark_negation(words, shallow=True)
return " ".join(words) # Return the result
return words
__all__ = ( __all__ = (