mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-21 23:44:19 +00:00
Getting closer...
This commit is contained in:
parent
02f10e6ae4
commit
dcfc4fbc3b
6 changed files with 170 additions and 126 deletions
|
@ -7,6 +7,7 @@
|
||||||
<excludeFolder url="file://$MODULE_DIR$/data/db" />
|
<excludeFolder url="file://$MODULE_DIR$/data/db" />
|
||||||
<excludeFolder url="file://$MODULE_DIR$/data/raw" />
|
<excludeFolder url="file://$MODULE_DIR$/data/raw" />
|
||||||
<excludeFolder url="file://$MODULE_DIR$/data/nltk" />
|
<excludeFolder url="file://$MODULE_DIR$/data/nltk" />
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||||
</content>
|
</content>
|
||||||
<orderEntry type="jdk" jdkName="Poetry (unimore-bda-6)" jdkType="Python SDK" />
|
<orderEntry type="jdk" jdkName="Poetry (unimore-bda-6)" jdkType="Python SDK" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
|
|
@ -2,7 +2,7 @@ import logging
|
||||||
import tensorflow
|
import tensorflow
|
||||||
|
|
||||||
from .config import config, DATA_SET_SIZE
|
from .config import config, DATA_SET_SIZE
|
||||||
from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
|
from .database import mongo_reviews_collection_from_config, sample_reviews_polar, sample_reviews_varied
|
||||||
from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
|
from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
|
||||||
from .analysis.tf_text import TensorflowSentimentAnalyzer
|
from .analysis.tf_text import TensorflowSentimentAnalyzer
|
||||||
from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
|
from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
|
||||||
|
@ -17,37 +17,44 @@ def main():
|
||||||
else:
|
else:
|
||||||
log.debug("Tensorflow successfully found GPU acceleration!")
|
log.debug("Tensorflow successfully found GPU acceleration!")
|
||||||
|
|
||||||
for dataset_func in [polar_dataset, varied_dataset]:
|
for dataset_func in [sample_reviews_polar, sample_reviews_varied]:
|
||||||
for SentimentAnalyzer in [
|
# Tensorflow-based
|
||||||
NLTKSentimentAnalyzer,
|
for Tokenizer in [
|
||||||
# TensorflowSentimentAnalyzer,
|
LowercaseTokenizer
|
||||||
]:
|
]:
|
||||||
for Tokenizer in [
|
tokenizer = Tokenizer()
|
||||||
NLTKWordTokenizer,
|
model = TensorflowSentimentAnalyzer()
|
||||||
PottsTokenizer,
|
|
||||||
PottsTokenizerWithNegation,
|
|
||||||
LowercaseTokenizer,
|
|
||||||
]:
|
|
||||||
tokenizer = Tokenizer()
|
|
||||||
model = SentimentAnalyzer(tokenizer=tokenizer)
|
|
||||||
|
|
||||||
with mongo_reviews_collection_from_config() as reviews:
|
with mongo_reviews_collection_from_config() as collection:
|
||||||
reviews_training = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
...
|
||||||
reviews_evaluation = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
|
||||||
|
|
||||||
log.info("Training model %s", model)
|
# NLTK-based
|
||||||
model.train(reviews_training)
|
for Tokenizer in [
|
||||||
log.info("Evaluating model %s", model)
|
NLTKWordTokenizer,
|
||||||
correct, evaluated = model.evaluate(reviews_evaluation)
|
PottsTokenizer,
|
||||||
log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
|
PottsTokenizerWithNegation,
|
||||||
|
LowercaseTokenizer,
|
||||||
|
]:
|
||||||
|
tokenizer = Tokenizer()
|
||||||
|
model = NLTKSentimentAnalyzer(tokenizer=tokenizer)
|
||||||
|
|
||||||
# try:
|
with mongo_reviews_collection_from_config() as collection:
|
||||||
# print("Manual testing for %s" % model)
|
reviews_training = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
|
||||||
# print("Input an empty string to continue to the next model.")
|
reviews_evaluation = dataset_func(collection=collection, amount=DATA_SET_SIZE.__wrapped__)
|
||||||
# while inp := input():
|
|
||||||
# print(model.use(inp))
|
log.info("Training model %s", model)
|
||||||
# except KeyboardInterrupt:
|
model.train(reviews_training)
|
||||||
# pass
|
log.info("Evaluating model %s", model)
|
||||||
|
correct, evaluated = model.evaluate(reviews_evaluation)
|
||||||
|
log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# print("Manual testing for %s" % model)
|
||||||
|
# print("Input an empty string to continue to the next model.")
|
||||||
|
# while inp := input():
|
||||||
|
# print(model.use(inp))
|
||||||
|
# except KeyboardInterrupt:
|
||||||
|
# pass
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -1,47 +1,55 @@
|
||||||
import abc
|
import abc
|
||||||
import logging
|
import logging
|
||||||
|
import typing as t
|
||||||
|
import dataclasses
|
||||||
|
|
||||||
from ..database import DataSet, Text, Category
|
from ..database import Text, Category, Review, DatasetFunc
|
||||||
from ..tokenizer import BaseTokenizer
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclasses.dataclass
class EvaluationResults:
    """
    Container for the results of a model evaluation.

    The textual representations include the accuracy as a percentage with two decimals; when nothing has been evaluated
    the accuracy is undefined and is rendered as ``nan`` instead of raising `ZeroDivisionError`.
    """

    # Number of evaluated items whose category was predicted correctly.
    correct: int
    # Total number of evaluated items.
    evaluated: int

    def _accuracy_percent(self) -> float:
        """
        Return the accuracy as a percentage, or ``nan`` if no items have been evaluated.
        """
        if not self.evaluated:
            # Avoid dividing by zero: a representation method should never raise.
            return float("nan")
        return self.correct / self.evaluated * 100

    def __repr__(self):
        return f"<EvaluationResults: {self.correct}/{self.evaluated}, {self._accuracy_percent():.2f}>"

    def __str__(self):
        return f"{self.correct} / {self.evaluated} - {self._accuracy_percent():.2f} %"
|
||||||
|
|
||||||
|
|
||||||
class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
|
class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
|
||||||
"""
|
"""
|
||||||
Abstract base class for sentiment analyzers implemented in this project.
|
Abstract base class for sentiment analyzers implemented in this project.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, *, tokenizer: BaseTokenizer):
|
|
||||||
self.tokenizer: BaseTokenizer = tokenizer
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
|
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def train(self, training_set: DataSet) -> None:
|
def train(self, dataset_func: DatasetFunc) -> None:
|
||||||
"""
|
"""
|
||||||
Train the analyzer with the given training dataset.
|
Train the analyzer with the given training dataset.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def evaluate(self, test_set: DataSet) -> tuple[int, int]:
|
def evaluate(self, dataset_func: DatasetFunc) -> EvaluationResults:
|
||||||
"""
|
"""
|
||||||
Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
|
Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
|
||||||
|
|
||||||
Returns a tuple with the number of correct results and the number of evaluated results.
|
Returns an `EvaluationResults` object holding the number of correct results and the number of evaluated results.
|
||||||
"""
|
"""
|
||||||
evaluated: int = 0
|
|
||||||
correct: int = 0
|
|
||||||
|
|
||||||
for text, expected_category in test_set:
|
evaluated: int = 0
|
||||||
resulting_category = self.use(text)
|
correct: int = 0
|
||||||
|
|
||||||
|
for review in dataset_func():
|
||||||
|
resulting_category = self.use(review.text)
|
||||||
evaluated += 1
|
evaluated += 1
|
||||||
correct += 1 if resulting_category == expected_category else 0
|
correct += 1 if resulting_category == review.category else 0
|
||||||
if not evaluated % 100:
|
if not evaluated % 100:
|
||||||
log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
|
log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
|
||||||
|
|
||||||
return correct, evaluated
|
return EvaluationResults(correct=correct, evaluated=evaluated)
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def use(self, text: Text) -> Category:
|
def use(self, text: Text) -> Category:
|
||||||
|
|
|
@ -6,7 +6,7 @@ import logging
|
||||||
import typing as t
|
import typing as t
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
from ..database import Text, Category, DataTuple, DataSet
|
from ..database import Text, Category, Review
|
||||||
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
|
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
|
||||||
from ..log import count_passage
|
from ..log import count_passage
|
||||||
from ..tokenizer import BaseTokenizer
|
from ..tokenizer import BaseTokenizer
|
||||||
|
@ -23,16 +23,20 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, *, tokenizer: BaseTokenizer) -> None:
|
def __init__(self, *, tokenizer: BaseTokenizer) -> None:
|
||||||
super().__init__(tokenizer=tokenizer)
|
super().__init__()
|
||||||
self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
|
self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
|
||||||
self.trained: bool = False
|
self.trained: bool = False
|
||||||
|
self.tokenizer: BaseTokenizer = tokenizer
|
||||||
|
|
||||||
def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]:
|
def __repr__(self):
|
||||||
|
return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
|
||||||
|
|
||||||
|
def __tokenize_datatuple(self, datatuple: Review) -> tuple[TokenBag, Category]:
|
||||||
"""
|
"""
|
||||||
Convert the `Text` of a `DataTuple` to a `TokenBag`.
|
Convert the `Text` of a `Review` to a `TokenBag`.
|
||||||
"""
|
"""
|
||||||
count_passage(log, "tokenize_datatuple", 100)
|
count_passage(log, "tokenize_datatuple", 100)
|
||||||
return self.tokenizer.tokenize_builtins(datatuple[0]), datatuple[1]
|
return self.tokenizer.tokenize_builtins(datatuple.text), datatuple.category
|
||||||
|
|
||||||
def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
|
def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
|
||||||
"""
|
"""
|
||||||
|
@ -63,7 +67,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
count_passage(log, "extract_features", 100)
|
count_passage(log, "extract_features", 100)
|
||||||
return self.model.extract_features(data[0]), data[1]
|
return self.model.extract_features(data[0]), data[1]
|
||||||
|
|
||||||
def train(self, dataset: DataSet) -> None:
|
def train(self, dataset: t.Iterator[Review]) -> None:
|
||||||
# Forbid retraining the model
|
# Forbid retraining the model
|
||||||
if self.trained:
|
if self.trained:
|
||||||
raise AlreadyTrainedError()
|
raise AlreadyTrainedError()
|
||||||
|
|
|
@ -2,42 +2,25 @@ import tensorflow
|
||||||
import itertools
|
import itertools
|
||||||
import typing as t
|
import typing as t
|
||||||
|
|
||||||
from ..database import DataSet, Text, Category
|
from ..database import Text, Category, Review
|
||||||
from ..tokenizer import BaseTokenizer
|
from ..tokenizer import BaseTokenizer
|
||||||
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
|
from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
|
||||||
|
|
||||||
|
|
||||||
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
def __init__(self, *, tokenizer: BaseTokenizer):
|
def __init__(self):
|
||||||
super().__init__(tokenizer=tokenizer)
|
super().__init__()
|
||||||
self.trained = False
|
self.trained = False
|
||||||
self.text_vectorization_layer = None
|
self.text_vectorization_layer = None
|
||||||
self.neural_network: tensorflow.keras.Sequential | None = None
|
self.neural_network: tensorflow.keras.Sequential | None = None
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def __infinite_dataset_generator_factory(dataset: DataSet):
|
|
||||||
"""
|
|
||||||
A generator of infinite copies of dataset.
|
|
||||||
|
|
||||||
.. todo:: Loads the whole dataset in memory. What a waste! Can we perform multiple MongoDB queries instead?
|
|
||||||
"""
|
|
||||||
dataset = map(lambda text, category: (tensorflow.convert_to_tensor(text, dtype=tensorflow.string), tensorflow.convert_to_tensor(category, dtype=tensorflow.string)), dataset)
|
|
||||||
|
|
||||||
def generator():
|
|
||||||
while True:
|
|
||||||
nonlocal dataset
|
|
||||||
dataset, result = itertools.tee(dataset, 2)
|
|
||||||
yield result
|
|
||||||
|
|
||||||
return generator
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def __bda_dataset_to_tf_dataset(cls, dataset: DataSet) -> tensorflow.data.Dataset:
|
def __bda_dataset_to_tf_dataset(cls, dataset_func: t.Callable[[], t.Iterator[Review]]) -> tensorflow.data.Dataset:
|
||||||
"""
|
"""
|
||||||
Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
|
Convert a `unimore_bda_6.database.DatasetFunc` to a "real" `tensorflow.data.Dataset`.
|
||||||
"""
|
"""
|
||||||
return tensorflow.data.Dataset.from_generator(
|
return tensorflow.data.Dataset.from_generator(
|
||||||
cls.__infinite_dataset_generator_factory(dataset),
|
dataset_func,
|
||||||
output_signature=(
|
output_signature=(
|
||||||
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
|
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
|
||||||
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
|
tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
|
||||||
|
@ -48,7 +31,7 @@ class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
|
||||||
EMBEDDING_DIM = 16
|
EMBEDDING_DIM = 16
|
||||||
EPOCHS = 10
|
EPOCHS = 10
|
||||||
|
|
||||||
def train(self, training_set: DataSet) -> None:
|
def train(self, training_set: t.Iterator[Review]) -> None:
|
||||||
if self.trained:
|
if self.trained:
|
||||||
raise AlreadyTrainedError()
|
raise AlreadyTrainedError()
|
||||||
|
|
||||||
|
|
|
@ -4,14 +4,19 @@ import pymongo.collection
|
||||||
import contextlib
|
import contextlib
|
||||||
import bson
|
import bson
|
||||||
import logging
|
import logging
|
||||||
import itertools
|
import tensorflow
|
||||||
|
|
||||||
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE
|
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Review(t.TypedDict):
|
class MongoReview(t.TypedDict):
|
||||||
|
"""
|
||||||
|
A review as it is stored on MongoDB.
|
||||||
|
|
||||||
|
.. warning:: Do not instantiate: this is only for type hints!
|
||||||
|
"""
|
||||||
_id: bson.ObjectId
|
_id: bson.ObjectId
|
||||||
reviewerID: str
|
reviewerID: str
|
||||||
asin: str
|
asin: str
|
||||||
|
@ -28,13 +33,13 @@ Text = str
|
||||||
Category = float
|
Category = float
|
||||||
|
|
||||||
|
|
||||||
class DataTuple:
|
class Review:
|
||||||
def __init__(self, text, category):
|
def __init__(self, text: Text, category: Category):
|
||||||
self.text: Text = text
|
self.text: Text = text
|
||||||
self.category: Category = category
|
self.category: Category = category
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_review(cls, review):
|
def from_mongoreview(cls, review: MongoReview):
|
||||||
return cls(
|
return cls(
|
||||||
text=review["reviewText"],
|
text=review["reviewText"],
|
||||||
category=review["overall"],
|
category=review["overall"],
|
||||||
|
@ -44,15 +49,15 @@ class DataTuple:
|
||||||
return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
|
return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"
|
||||||
|
|
||||||
def __getitem__(self, item):
|
def __getitem__(self, item):
|
||||||
if item == 0:
|
if item == 0 or item == "text":
|
||||||
return self.text
|
return self.text
|
||||||
elif item == 1:
|
elif item == 1 or item == "category":
|
||||||
return self.category
|
return self.category
|
||||||
else:
|
else:
|
||||||
raise KeyError(item)
|
raise KeyError(item)
|
||||||
|
|
||||||
|
def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
|
||||||
DataSet = t.Iterable[DataTuple]
|
return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string), tensorflow.convert_to_tensor(self.category, dtype=tensorflow.string)
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
|
@ -65,7 +70,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
|
||||||
host=MONGO_HOST.__wrapped__,
|
host=MONGO_HOST.__wrapped__,
|
||||||
port=MONGO_PORT.__wrapped__,
|
port=MONGO_PORT.__wrapped__,
|
||||||
)
|
)
|
||||||
log.info("Opened connection to MongoDB at %s!", client.address)
|
log.info("Opened connection to MongoDB!")
|
||||||
|
|
||||||
yield client
|
yield client
|
||||||
|
|
||||||
|
@ -75,7 +80,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Review]:
|
def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[MongoReview]:
|
||||||
"""
|
"""
|
||||||
Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
|
Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
|
||||||
"""
|
"""
|
||||||
|
@ -86,82 +91,118 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
|
||||||
yield collection
|
yield collection
|
||||||
|
|
||||||
|
|
||||||
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
|
class DatasetFunc(t.Protocol):
    """
    Structural type of a callable that takes no arguments and returns an iterator of `Review` objects.
    """

    def __call__(self) -> t.Iterator[Review]:
        ...
|
||||||
|
|
||||||
|
|
||||||
|
def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
|
||||||
"""
|
"""
|
||||||
Get ``amount`` random reviews from the ``reviews`` collection.
|
Get ``amount`` random reviews from the ``reviews`` collection.
|
||||||
"""
|
"""
|
||||||
log.debug("Getting a sample of %d reviews...", amount)
|
log.debug("Getting a sample of %d reviews...", amount)
|
||||||
|
|
||||||
return reviews.aggregate([
|
cursor = collection.aggregate([
|
||||||
{"$limit": WORKING_SET_SIZE.__wrapped__},
|
{"$limit": WORKING_SET_SIZE.__wrapped__},
|
||||||
{"$sample": {"size": amount}},
|
{"$sample": {"size": amount}},
|
||||||
])
|
])
|
||||||
|
|
||||||
|
cursor = map(Review.from_mongoreview, cursor)
|
||||||
|
return cursor
|
||||||
|
|
||||||
def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
|
|
||||||
|
def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
|
||||||
"""
|
"""
|
||||||
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
|
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
|
||||||
"""
|
"""
|
||||||
log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
|
log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
|
||||||
|
|
||||||
return reviews.aggregate([
|
cursor = collection.aggregate([
|
||||||
{"$limit": WORKING_SET_SIZE.__wrapped__},
|
{"$limit": WORKING_SET_SIZE.__wrapped__},
|
||||||
{"$match": {"overall": rating}},
|
{"$match": {"overall": rating}},
|
||||||
{"$sample": {"size": amount}},
|
{"$sample": {"size": amount}},
|
||||||
])
|
])
|
||||||
|
|
||||||
|
cursor = map(Review.from_mongoreview, cursor)
|
||||||
def polar_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
|
return cursor
|
||||||
"""
|
|
||||||
Get a list of the same amount of 1-star and 5-star reviews.
|
|
||||||
"""
|
|
||||||
log.info("Building polar dataset with %d reviews...", amount * 2)
|
|
||||||
|
|
||||||
# Sample the required reviews
|
|
||||||
positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
|
|
||||||
negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
|
|
||||||
|
|
||||||
# Chain the iterators
|
|
||||||
full = itertools.chain(positive, negative)
|
|
||||||
|
|
||||||
# Convert reviews to datatuples
|
|
||||||
full = map(DataTuple.from_review, full)
|
|
||||||
|
|
||||||
return full
|
|
||||||
|
|
||||||
|
|
||||||
def varied_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
|
def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
|
||||||
"""
|
log.debug("Getting a sample of %d polar reviews...", amount * 2)
|
||||||
Get a list of the same amount of reviews for each rating.
|
|
||||||
"""
|
|
||||||
log.info("Building varied dataset with %d reviews...", amount * 5)
|
|
||||||
|
|
||||||
# Sample the required reviews
|
cursor = collection.aggregate([
|
||||||
terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
|
{"$limit": WORKING_SET_SIZE.__wrapped__},
|
||||||
negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount)
|
{"$match": {"overall": 1.0}},
|
||||||
mixed = sample_reviews_by_rating(collection, rating=3.0, amount=amount)
|
{"$sample": {"size": amount}},
|
||||||
positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
|
{"$unionWith": {
|
||||||
great = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
|
"coll": collection.name,
|
||||||
|
"pipeline": [
|
||||||
|
{"$limit": WORKING_SET_SIZE.__wrapped__},
|
||||||
|
{"$match": {"overall": 5.0}},
|
||||||
|
{"$sample": {"size": amount}},
|
||||||
|
],
|
||||||
|
}}
|
||||||
|
])
|
||||||
|
|
||||||
# Chain the iterators
|
cursor = map(Review.from_mongoreview, cursor)
|
||||||
full = itertools.chain(terrible, negative, mixed, positive, great)
|
return cursor
|
||||||
|
|
||||||
# Convert reviews to datatuples
|
|
||||||
full = map(DataTuple.from_review, full)
|
|
||||||
|
|
||||||
return full
|
def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
    """
    Get ``amount`` random reviews for each of the ratings from 1.0 to 5.0 stars from the given collection.

    A single aggregation is run: the sub-pipeline for each rating is chained to the next one through nested
    ``$unionWith`` stages, with the 1.0-star stages outermost and the 5.0-star stages innermost.

    :param collection: The collection to run the aggregation on; its name is also used as ``$unionWith`` target.
    :param amount: The ``$sample`` size requested for each rating.
    :return: An iterator of `Review` objects built from the documents returned by the aggregation.
    """
    log.debug("Getting a sample of %d varied reviews...", amount * 5)

    def rating_stages(rating: float) -> list[dict]:
        # Stages selecting a random sample of reviews having the given rating.
        return [
            {"$limit": WORKING_SET_SIZE.__wrapped__},
            {"$match": {"overall": rating}},
            {"$sample": {"size": amount}},
        ]

    # Build the pipeline from the inside out: start from the innermost rating and wrap it one level at a time,
    # so that the resulting structure is the same as the hand-written five-level nesting.
    pipeline = rating_stages(5.0)
    for rating in (4.0, 3.0, 2.0, 1.0):
        pipeline = [
            *rating_stages(rating),
            {"$unionWith": {
                "coll": collection.name,
                "pipeline": pipeline,
            }},
        ]

    cursor = collection.aggregate(pipeline)

    cursor = map(Review.from_mongoreview, cursor)
    return cursor
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
"Review",
|
|
||||||
"Text",
|
"Text",
|
||||||
"Category",
|
"Category",
|
||||||
"DataTuple",
|
"Review",
|
||||||
"DataSet",
|
"DatasetFunc",
|
||||||
"mongo_client_from_config",
|
"mongo_client_from_config",
|
||||||
"mongo_reviews_collection_from_config",
|
"mongo_reviews_collection_from_config",
|
||||||
"sample_reviews",
|
"sample_reviews",
|
||||||
"sample_reviews_by_rating",
|
"sample_reviews_by_rating",
|
||||||
"polar_dataset",
|
"sample_reviews_polar",
|
||||||
"varied_dataset",
|
"sample_reviews_varied",
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue