Mirror of https://github.com/Steffo99/unimore-bda-6.git (synced 2024-11-22 07:54:19 +00:00)
Commit 3abba24ca2 (parent dcfc4fbc3b)

Made good progress
How does text vectorization in tensorflow work?

13 changed files with 286 additions and 158 deletions
.gitignore (vendored, 2 changes)

@@ -10,6 +10,8 @@
 data/raw/
 data/db/
 data/nltk/
+data/training/
+data/evaluation/

 ##################
 # Python ignores #

Run configuration XML (.idea; file path not shown in this capture)

@@ -4,6 +4,7 @@
     <option name="INTERPRETER_OPTIONS" value="" />
     <option name="PARENT_ENVS" value="true" />
     <envs>
+      <env name="CONFIRM_OVERWRITE" value="False" />
       <env name="DATA_SET_SIZE" value="750" />
       <env name="NLTK_DATA" value="./data/nltk" />
       <env name="PYTHONUNBUFFERED" value="1" />

IntelliJ module file (.iml; file path not shown in this capture)

@@ -8,6 +8,8 @@
       <excludeFolder url="file://$MODULE_DIR$/data/raw" />
       <excludeFolder url="file://$MODULE_DIR$/data/nltk" />
       <excludeFolder url="file://$MODULE_DIR$/.venv" />
+      <excludeFolder url="file://$MODULE_DIR$/data/evaluation" />
+      <excludeFolder url="file://$MODULE_DIR$/data/training" />
     </content>
     <orderEntry type="jdk" jdkName="Poetry (unimore-bda-6)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />

unimore_bda_6/__main__.py

@@ -2,7 +2,7 @@ import logging
 import tensorflow

 from .config import config, DATA_SET_SIZE
-from .database import mongo_reviews_collection_from_config, sample_reviews_polar, sample_reviews_varied
+from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied, store_cache, load_cache
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
 from .analysis.tf_text import TensorflowSentimentAnalyzer
 from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer

@@ -18,35 +18,45 @@ def main():
         log.debug("Tensorflow successfully found GPU acceleration!")

     for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
-        # Tensorflow-based
-        for Tokenizer in [
-            LowercaseTokenizer
-        ]:
-            tokenizer = Tokenizer()
-            model = TensorflowSentimentAnalyzer()
-
-            with mongo_reviews_collection_from_config() as collection:
-                ...
-
-        # NLTK-based
-        for Tokenizer in [
-            NLTKWordTokenizer,
-            PottsTokenizer,
-            PottsTokenizerWithNegation,
-            LowercaseTokenizer,
-        ]:
-            tokenizer = Tokenizer()
-            model = NLTKSentimentAnalyzer(tokenizer=tokenizer)
-
-            with mongo_reviews_collection_from_config() as collection:
-                reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
-                reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
-
-                log.info("Training model %s", model)
-                model.train(reviews_training)
-                log.info("Evaluating model %s", model)
-                correct, evaluated = model.evaluate(reviews_evaluation)
-                log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
+        for SentimentAnalyzer in [TensorflowSentimentAnalyzer, NLTKSentimentAnalyzer]:
+            for Tokenizer in [
+                # NLTKWordTokenizer,
+                # PottsTokenizer,
+                # PottsTokenizerWithNegation,
+                LowercaseTokenizer,
+            ]:
+                tokenizer = Tokenizer()
+                model = SentimentAnalyzer(tokenizer=tokenizer)
+
+                with mongo_client_from_config() as db:
+                    log.debug("Finding the reviews MongoDB collection...")
+                    collection = reviews_collection(db)
+
+                    try:
+                        training_cache = load_cache("./data/training")
+                        evaluation_cache = load_cache("./data/evaluation")
+                    except FileNotFoundError:
+                        log.debug("Gathering datasets...")
+                        reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+                        reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
+
+                        log.debug("Caching datasets...")
+                        store_cache(reviews_training, "./data/training")
+                        store_cache(reviews_evaluation, "./data/evaluation")
+                        del reviews_training
+                        del reviews_evaluation
+
+                        training_cache = load_cache("./data/training")
+                        evaluation_cache = load_cache("./data/evaluation")
+                        log.debug("Caches stored and loaded successfully!")
+                    else:
+                        log.debug("Caches loaded successfully!")
+
+                    log.info("Training model: %s", model)
+                    model.train(training_cache)
+                    log.info("Evaluating model: %s", model)
+                    evaluation_results = model.evaluate(evaluation_cache)
+                    log.info("%s", evaluation_results)

                 # try:
                 #     print("Manual testing for %s" % model)

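The rewritten loop above tries to reuse an on-disk cache of the sampled reviews and only queries MongoDB when that cache is missing. Below is a minimal standalone sketch of the same try/except/else pattern, reusing the commit's store_cache and load_cache helpers; the path and the make_reviews callable are illustrative, not part of the commit:

    import logging
    import typing as t

    from unimore_bda_6.database import Review, store_cache, load_cache

    log = logging.getLogger(__name__)


    def cached_dataset(make_reviews: t.Callable[[], t.Iterator[Review]], path: str):
        """Return a dataset function, building the pickle cache at `path` on the first run."""
        try:
            # Reuse the cache if it is already on disk...
            dataset_func = load_cache(path)
        except FileNotFoundError:
            # ...otherwise build it once from the database and reload it.
            store_cache(make_reviews(), path)
            dataset_func = load_cache(path)
            log.debug("Cache stored and loaded: %s", path)
        else:
            log.debug("Cache loaded: %s", path)
        return dataset_func

model.train(dataset_func) can then iterate the cached reviews as many times as it needs, since every call to the returned function starts a fresh generator.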
unimore_bda_6/analysis/nltk_sentiment.py

@@ -6,7 +6,7 @@ import logging
 import typing as t
 import itertools

-from ..database import Text, Category, Review
+from ..database import Text, Category, Review, DatasetFunc
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 from ..log import count_passage
 from ..tokenizer import BaseTokenizer

@@ -31,7 +31,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
     def __repr__(self):
         return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"

-    def __tokenize_datatuple(self, datatuple: Review) -> tuple[TokenBag, Category]:
+    def __tokenize_review(self, datatuple: Review) -> tuple[TokenBag, Category]:
         """
         Convert the `Text` of a `DataTuple` to a `TokenBag`.
         """

@@ -67,13 +67,16 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
         count_passage(log, "extract_features", 100)
         return self.model.extract_features(data[0]), data[1]

-    def train(self, dataset: t.Iterator[Review]) -> None:
+    def train(self, dataset_func: DatasetFunc) -> None:
         # Forbid retraining the model
         if self.trained:
             raise AlreadyTrainedError()

+        # Get a generator
+        dataset: t.Generator[Review] = dataset_func()
+
         # Tokenize the dataset
-        dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_datatuple, dataset)
+        dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_review, dataset)

         # Cleanly duplicate the dataset iterator
         # Reduce average memory footprint, but not maximum

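train now receives a DatasetFunc, a zero-argument callable that returns a fresh Review generator, instead of a one-shot iterator, so the analyzer can start a new pass over the data whenever it needs one. The trailing comment about cleanly duplicating the iterator hints at itertools.tee; here is a sketch of both ideas under that assumption, with illustrative names only:

    import itertools
    import typing as t

    TokenBag = list[str]
    Category = float


    def train_sketch(dataset_func: t.Callable[[], t.Iterator[tuple[TokenBag, Category]]]) -> None:
        # Every call to dataset_func() yields a brand-new generator,
        # so nothing has to be kept in memory between passes.
        dataset = dataset_func()

        # itertools.tee() duplicates a single iterator into independent copies:
        # one to build the feature extractor, one to fit the classifier.
        dataset_for_features, dataset_for_fitting = itertools.tee(dataset, 2)

        vocabulary: set[str] = set()
        for tokens, _category in dataset_for_features:
            vocabulary.update(tokens)

        training_data = list(dataset_for_fitting)
        print(len(vocabulary), len(training_data))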
unimore_bda_6/analysis/tf_text.py

@@ -2,48 +2,52 @@ import tensorflow
 import itertools
 import typing as t

-from ..database import Text, Category, Review
+from ..database import Text, Category, Review, DatasetFunc
 from ..tokenizer import BaseTokenizer
 from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError


 class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
-    def __init__(self):
+    def __init__(self, *, tokenizer: BaseTokenizer):
         super().__init__()
         self.trained = False
-        self.text_vectorization_layer = None
         self.neural_network: tensorflow.keras.Sequential | None = None
+        self.tokenizer: BaseTokenizer = tokenizer  # TODO

-    @classmethod
-    def __bda_dataset_to_tf_dataset(cls, dataset_func: t.Callable[[], t.Iterator[Review]]) -> tensorflow.data.Dataset:
-        """
-        Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
-        """
-        return tensorflow.data.Dataset.from_generator(
-            dataset_func,
-            output_signature=(
-                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
-                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
-            )
-        )

     MAX_FEATURES = 20000
     EMBEDDING_DIM = 16
     EPOCHS = 10

-    def train(self, training_set: t.Iterator[Review]) -> None:
+    def train(self, dataset_func: DatasetFunc) -> None:
         if self.trained:
             raise AlreadyTrainedError()

-        training_set = self.__bda_dataset_to_tf_dataset(training_set)
+        def dataset_func_with_tensor_text():
+            for review in dataset_func():
+                yield review.to_tensor_text()
+
+        text_set = tensorflow.data.Dataset.from_generator(
+            dataset_func_with_tensor_text,
+            output_signature=tensorflow.TensorSpec(shape=(), dtype=tensorflow.string)
+        )

-        self.text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
+        text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
             max_tokens=self.MAX_FEATURES,
             standardize=self.tokenizer.tokenize_tensorflow,
         )
-        self.text_vectorization_layer.adapt(map(lambda t: t[0], training_set))
+        text_vectorization_layer.adapt(text_set)

-        training_set = training_set.map(self.text_vectorization_layer)
+        def dataset_func_with_tensor_tuple():
+            for review in dataset_func():
+                yield review.to_tensor_tuple()
+
+        training_set = tensorflow.data.Dataset.from_generator(
+            dataset_func_with_tensor_tuple,
+            output_signature=(
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string, name="text"),
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.float32, name="category"),
+            )
+        )

         # I have no idea of what I'm doing here
         self.neural_network = tensorflow.keras.Sequential([

@@ -59,6 +63,8 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
             metrics=tensorflow.metrics.BinaryAccuracy(threshold=0.0)
         )

+        training_set = training_set.map(text_vectorization_layer)
+
         self.neural_network.fit(
             training_set,
             epochs=self.EPOCHS,

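This hunk is where the question in the commit message gets answered. tensorflow.keras.layers.TextVectorization works in two phases: adapt() is first run over a dataset of raw strings to build the vocabulary (capped at max_tokens, preprocessed by the standardize callable), and the fitted layer is then mapped over the (text, category) dataset so every string reaches the network as a sequence of integer token ids. A self-contained sketch of that pattern, using toy data instead of the project's reviews:

    import tensorflow

    # Tiny stand-ins for the review texts and their categories.
    texts = ["great product", "terrible quality", "great value"]
    labels = [1.0, 0.0, 1.0]

    text_set = tensorflow.data.Dataset.from_tensor_slices(texts)
    training_set = tensorflow.data.Dataset.from_tensor_slices((texts, labels))

    # Phase 1: adapt() scans the raw strings once and builds the vocabulary.
    text_vectorization_layer = tensorflow.keras.layers.TextVectorization(max_tokens=1000)
    text_vectorization_layer.adapt(text_set.batch(8))

    # Phase 2: map the fitted layer over the (text, label) pairs,
    # replacing each string with its integer token ids.
    def vectorize(text, label):
        text = tensorflow.expand_dims(text, -1)   # shape () -> (1,)
        return text_vectorization_layer(text), label

    vectorized_set = training_set.map(vectorize)

    for tokens, label in vectorized_set.take(1):
        print(tokens.numpy(), label.numpy())

In the commit itself the standardize hook is delegated to the project's tokenizer (tokenize_tensorflow), which is what the last file of this diff adjusts.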
unimore_bda_6/database/__init__.py (new file, 5 lines)

@@ -0,0 +1,5 @@
+from .cache import *
+from .collections import *
+from .connection import *
+from .datatypes import *
+from .queries import *

unimore_bda_6/database/cache.py (new file, 66 lines)

@@ -0,0 +1,66 @@
+import typing as t
+import logging
+import shutil
+import pathlib
+import pickle
+
+from .datatypes import Review
+
+log = logging.getLogger(__name__)
+
+
+DatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]]
+
+
+def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
+    """
+    Store the contents of the given `Review` iterator to different files in a directory at the given path.
+    """
+    path = pathlib.Path(path)
+
+    if path.exists():
+        raise FileExistsError("Specified cache path already exists.")
+
+    # Create the temporary directory
+    log.debug("Creating cache directory: %s", path)
+    path.mkdir(parents=True)
+
+    # Write the documents to path/{index}.pickle
+    for index, document in enumerate(reviews):
+        document_path = path.joinpath(f"{index}.pickle")
+
+        log.debug("Storing pickle file: %s", document_path)
+        with open(document_path, "wb") as file:
+            pickle.dump(document, file)
+
+
+def load_cache(path: str | pathlib.Path) -> DatasetFunc:
+    """
+    Load the contents of a directory
+    """
+    path = pathlib.Path(path)
+
+    if not path.exists():
+        log.error("Specified cache directory does not exist: %s", path)
+        raise FileNotFoundError("The specified path does not exist.")
+
+    def data_cache_loader():
+        document_paths = path.iterdir()
+        for document_path in document_paths:
+            document_path = pathlib.Path(document_path)
+            if not str(document_path).endswith(".pickle"):
+                log.debug("Ignoring non-pickle file: %s", document_path)
+
+            log.debug("Loading pickle file: %s", document_path)
+            with open(document_path, "rb") as file:
+                result: Review = pickle.load(file)
+                yield result
+
+    return data_cache_loader
+
+
+__all__ = (
+    "DatasetFunc",
+    "store_cache",
+    "load_cache",
+)

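load_cache deliberately returns a function rather than the data: every call re-reads the pickles and yields a fresh generator, which is what the DatasetFunc alias promises. A short usage sketch with an illustrative path:

    from unimore_bda_6.database.cache import store_cache, load_cache
    from unimore_bda_6.database.datatypes import Review

    # store_cache writes one pickle per review; the target directory must not exist yet.
    store_cache(iter([Review(text="Great!", category=5.0)]), "./data/example")

    # load_cache returns a DatasetFunc: each call starts a new pass over the pickles.
    dataset_func = load_cache("./data/example")
    first_pass = list(dataset_func())
    second_pass = list(dataset_func())   # independent generator, same reviews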
unimore_bda_6/database/collections.py (new file, 41 lines)

@@ -0,0 +1,41 @@
+import contextlib
+import pymongo.collection
+import typing as t
+import bson
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class MongoReview(t.TypedDict):
+    """
+    A review as it is stored on MongoDB.
+
+    .. warning:: Do not instantiate: this is only for type hints!
+    """
+    _id: bson.ObjectId
+    reviewerID: str
+    asin: str
+    reviewerName: str
+    helpful: tuple[int, int]
+    reviewText: str
+    overall: float
+    summary: str
+    unixReviewTime: int
+    reviewTime: str
+
+
+def reviews_collection(db: pymongo.MongoClient) -> pymongo.collection.Collection[MongoReview]:
+    """
+    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
+    """
+    log.debug("Accessing the reviews collection...")
+    collection = db.reviews.reviews
+    log.debug("Collection accessed successfully: %s", collection)
+    return collection
+
+
+__all__ = (
+    "MongoReview",
+    "reviews_collection",
+)

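MongoReview is a typing.TypedDict: it only describes the shape of the plain dictionaries PyMongo returns, so static checkers know the keys and their types while nothing new exists at runtime. A trimmed-down sketch of what the annotation buys (only two of the fields, with an illustrative document):

    import typing as t


    class MongoReviewSketch(t.TypedDict):
        reviewText: str
        overall: float


    def summarize(review: MongoReviewSketch) -> str:
        # Type checkers verify the keys; at runtime this is just a dict.
        return f"[{review['overall']}] {review['reviewText']}"


    print(summarize({"reviewText": "Great!", "overall": 5.0}))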
unimore_bda_6/database/connection.py (new file, 32 lines)

@@ -0,0 +1,32 @@
+import pymongo
+import contextlib
+import typing as t
+import logging
+
+from ..config import MONGO_HOST, MONGO_PORT
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
+    """
+    Create a new MongoDB client and yield it.
+    """
+    log.debug("Opening connection to MongoDB...")
+    client: pymongo.MongoClient = pymongo.MongoClient(
+        host=MONGO_HOST.__wrapped__,
+        port=MONGO_PORT.__wrapped__,
+    )
+    log.info("Opened connection to MongoDB!")
+
+    yield client
+
+    log.info("Closing connection to MongoDB...")
+    client.close()
+    log.debug("Closed connection to MongoDB!")
+
+
+__all__ = (
+    "mongo_client_from_config",
+)

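Because mongo_client_from_config is a generator-based context manager, the client is opened on entry and closed on exit of the with block. A minimal usage sketch; the database and collection names follow the commit, the count query is illustrative:

    from unimore_bda_6.database.connection import mongo_client_from_config
    from unimore_bda_6.database.collections import reviews_collection

    with mongo_client_from_config() as client:
        collection = reviews_collection(client)   # the "reviews" collection of the "reviews" database
        print(collection.estimated_document_count())
    # The connection has been closed here.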
unimore_bda_6/database/datatypes.py (new file, 49 lines)

@@ -0,0 +1,49 @@
+import tensorflow
+from .collections import MongoReview
+
+
+Text = str
+Category = float
+
+
+class Review:
+    def __init__(self, text: Text, category: Category):
+        self.text: str = text
+        self.category: float = category
+
+    @classmethod
+    def from_mongoreview(cls, review: MongoReview):
+        return cls(
+            text=review["reviewText"],
+            category=review["overall"],
+        )
+
+    def __repr__(self):
+        return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
+
+    def __getitem__(self, item):
+        if item == 0 or item == "text":
+            return self.text
+        elif item == 1 or item == "category":
+            return self.category
+        else:
+            raise KeyError(item)
+
+    def to_tensor_text(self) -> tensorflow.Tensor:
+        return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
+
+    def to_tensor_category(self) -> tensorflow.Tensor:
+        return tensorflow.convert_to_tensor(self.category / 5.0, dtype=tensorflow.float32)
+
+    def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
+        return (
+            self.to_tensor_text(),
+            self.to_tensor_category(),
+        )
+
+
+__all__ = (
+    "Text",
+    "Category",
+    "Review",
+)

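Compared to the old to_tensor_tuple in database.py (removed in the next hunk, where the category was cast to a string tensor), the category now becomes a float32 tensor and the 1-to-5 star overall rating is rescaled by dividing by 5. A tiny sketch of the conversion:

    from unimore_bda_6.database.datatypes import Review

    review = Review(text="Great product!", category=5.0)

    text_tensor, category_tensor = review.to_tensor_tuple()
    print(text_tensor)       # tf.string scalar: b'Great product!'
    print(category_tensor)   # tf.float32 scalar: 1.0  (5.0 / 5.0)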
unimore_bda_6/database/queries.py (previously unimore_bda_6/database.py; the connection, collection, and datatype code moved into the new modules above)

@@ -1,101 +1,14 @@
-import typing as t
-import pymongo
-import pymongo.collection
-import contextlib
-import bson
 import logging
-import tensorflow
+import pymongo
+import typing as t

-from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE
+from ..config import WORKING_SET_SIZE
+from .collections import MongoReview
+from .datatypes import Review

 log = logging.getLogger(__name__)
-
-
-class MongoReview(t.TypedDict):
-    """
-    A review as it is stored on MongoDB.
-
-    .. warning:: Do not instantiate: this is only for type hints!
-    """
-    _id: bson.ObjectId
-    reviewerID: str
-    asin: str
-    reviewerName: str
-    helpful: tuple[int, int]
-    reviewText: str
-    overall: float
-    summary: str
-    unixReviewTime: int
-    reviewTime: str
-
-
-Text = str
-Category = float
-
-
-class Review:
-    def __init__(self, text: Text, category: Category):
-        self.text: Text = text
-        self.category: Category = category
-
-    @classmethod
-    def from_mongoreview(cls, review: MongoReview):
-        return cls(
-            text=review["reviewText"],
-            category=review["overall"],
-        )
-
-    def __repr__(self):
-        return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
-
-    def __getitem__(self, item):
-        if item == 0 or item == "text":
-            return self.text
-        elif item == 1 or item == "category":
-            return self.category
-        else:
-            raise KeyError(item)
-
-    def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
-        return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string), tensorflow.convert_to_tensor(self.category, dtype=tensorflow.string)
-
-
-@contextlib.contextmanager
-def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
-    """
-    Create a new MongoDB client and yield it.
-    """
-    log.debug("Opening connection to MongoDB...")
-    client: pymongo.MongoClient = pymongo.MongoClient(
-        host=MONGO_HOST.__wrapped__,
-        port=MONGO_PORT.__wrapped__,
-    )
-    log.info("Opened connection to MongoDB!")
-
-    yield client
-
-    log.info("Closing connection to MongoDB...")
-    client.close()
-    log.debug("Closed connection to MongoDB!")
-
-
-@contextlib.contextmanager
-def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[MongoReview]:
-    """
-    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
-    """
-    with mongo_client_from_config() as db:
-        log.debug("Accessing the reviews collection...")
-        collection = db.reviews.reviews
-        log.debug("Collection accessed successfully: %s", collection)
-        yield collection
-
-
-class DatasetFunc(t.Protocol):
-    def __call__(self) -> t.Iterator[Review]:
-        pass


 def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
     """
     Get ``amount`` random reviews from the ``reviews`` collection.

@@ -108,6 +21,7 @@ def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.
     ])
+
     cursor = map(Review.from_mongoreview, cursor)

     return cursor

@@ -123,7 +37,6 @@ def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating:
         {"$sample": {"size": amount}},
     ])

-    cursor = map(Review.from_mongoreview, cursor)
     return cursor

@@ -145,6 +58,7 @@ def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int)
     ])

     cursor = map(Review.from_mongoreview, cursor)
+
     return cursor

@@ -191,16 +105,11 @@ def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int
     ])

     cursor = map(Review.from_mongoreview, cursor)

     return cursor


 __all__ = (
-    "Text",
-    "Category",
-    "Review",
-    "DatasetFunc",
-    "mongo_client_from_config",
-    "mongo_reviews_collection_from_config",
     "sample_reviews",
     "sample_reviews_by_rating",
     "sample_reviews_polar",

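All of the sampling helpers follow the same shape: an aggregation pipeline ending in a $sample stage, with the resulting cursor wrapped by Review.from_mongoreview. A condensed sketch of that pattern; the pipeline shown is the simplest case, and the polar/varied variants presumably add rating filters before sampling:

    import typing as t

    import pymongo.collection

    from unimore_bda_6.database.datatypes import Review


    def sample_reviews_sketch(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
        # $sample picks `amount` random documents from the collection.
        cursor = collection.aggregate([
            {"$sample": {"size": amount}},
        ])
        # Wrap each raw MongoDB document into the in-memory Review datatype.
        return map(Review.from_mongoreview, cursor)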
unimore_bda_6/tokenizer (LowercaseTokenizer; exact file path not shown in this capture)

@@ -8,4 +8,6 @@ class LowercaseTokenizer(BaseTokenizer):
         return text.lower().split()

     def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
-        return tensorflow.strings.lower(text)
+        text = tensorflow.strings.lower(text)
+        text = tensorflow.expand_dims(text, -1, name="tokens")
+        return text

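tokenize_tensorflow is the callable passed to TextVectorization as standardize in tf_text.py above; it now lowercases the text and then appends a trailing dimension of size 1, presumably to match the rank the vectorization layer expects for its inputs. What the reshape does on its own is easy to verify:

    import tensorflow

    text = tensorflow.constant("Great Product")               # shape ()
    text = tensorflow.strings.lower(text)                      # b'great product', still shape ()
    text = tensorflow.expand_dims(text, -1, name="tokens")     # shape (1,)
    print(text.shape, text.numpy())                            # (1,) [b'great product']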