2023-02-01 03:20:09 +00:00
|
|
|
import typing as t
|
|
|
|
import pymongo
|
|
|
|
import pymongo.collection
|
|
|
|
import contextlib
|
|
|
|
import bson
|
|
|
|
|
|
|
|
from .config import MONGO_HOST, MONGO_PORT
|
|
|
|
|
|
|
|
|
|
|
|
class Review(t.TypedDict):
    """
    Shape of a single document stored in the ``reviews`` collection.

    NOTE(review): the field names look like the Amazon product review dataset layout; the per-field notes below are
    assumptions drawn from the names and types only and should be confirmed against the actual data.
    """

    # MongoDB document identifier.
    _id: bson.ObjectId

    # Presumably the identifier of the user who wrote the review.
    reviewerID: str

    # Presumably the identifier of the reviewed product.
    asin: str

    # Presumably the display name of the reviewer.
    reviewerName: str

    # Pair of integers; presumably (helpful votes, total votes) -- TODO confirm.
    helpful: tuple[int, int]

    # Presumably the full text of the review.
    reviewText: str

    # Star rating; the sampling functions in this module match it against 1.0 and 5.0.
    overall: float

    # Presumably a short title / summary of the review.
    summary: str

    # Presumably the review timestamp as a UNIX epoch -- TODO confirm the unit.
    unixReviewTime: int

    # Presumably the same timestamp in human-readable form.
    reviewTime: str
|
|
|
|
|
|
|
|
|
|
|
|
@contextlib.contextmanager
def mongo_client_from_config() -> t.Iterator[pymongo.MongoClient]:
    """
    Create a new MongoDB client and yield it.

    The client is built from the resolved values of ``MONGO_HOST`` and ``MONGO_PORT`` and is closed when the ``with``
    block is left, whether it exits normally or because of an exception.

    The function itself is a generator, hence the ``Iterator`` return annotation; the ``contextmanager`` decorator is
    what turns it into a context manager for callers.
    """
    client = pymongo.MongoClient(
        host=MONGO_HOST.__resolved__,
        port=MONGO_PORT.__resolved__,
    )
    try:
        yield client
    finally:
        # Without the ``finally``, an exception raised inside the ``with`` body would propagate out of the ``yield``
        # and skip the close, leaking the client.
        client.close()
|
|
|
|
|
|
|
|
|
|
|
|
@contextlib.contextmanager
def mongo_reviews_collection_from_config() -> t.Iterator[pymongo.collection.Collection[Review]]:
    """
    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.

    The client is obtained from :func:`mongo_client_from_config`, which also owns its cleanup; the yielded collection
    is therefore only meant to be used inside the ``with`` block.

    The function itself is a generator, hence the ``Iterator`` return annotation; the ``contextmanager`` decorator is
    what turns it into a context manager for callers.
    """
    with mongo_client_from_config() as client:
        # First ``reviews`` is the database, second ``reviews`` is the collection inside it.
        yield client.reviews.reviews
|
|
|
|
|
|
|
|
|
|
|
|
def sample_reviews(
    reviews: pymongo.collection.Collection,
    amount: int,
) -> t.Iterable[Review]:
    """
    Pick ``amount`` random documents from the given ``reviews`` collection.

    The selection is expressed as a one-stage aggregation pipeline (``$sample``); whatever ``aggregate`` returns is
    handed back to the caller unchanged.
    """
    random_pick = {"$sample": {"size": amount}}
    pipeline = [random_pick]
    return reviews.aggregate(pipeline)
|
|
|
|
|
|
|
|
|
|
|
|
def sample_reviews_by_rating(
    reviews: pymongo.collection.Collection,
    rating: float,
    amount: int,
) -> t.Iterable[Review]:
    """
    Pick ``amount`` random documents whose ``overall`` field equals ``rating`` from the given ``reviews`` collection.

    The pipeline first narrows the collection down to the requested rating, then samples from what is left; whatever
    ``aggregate`` returns is handed back to the caller unchanged.
    """
    only_this_rating = {"$match": {"overall": rating}}
    random_pick = {"$sample": {"size": amount}}
    pipeline = [only_this_rating, random_pick]
    return reviews.aggregate(pipeline)
|
|
|
|
|
|
|
|
|
|
|
|
def sample_reviews_by_rating_polar(
    reviews: pymongo.collection.Collection,
    amount: int,
) -> t.Iterable[Review]:
    """
    Pick ``amount`` random documents rated either 1.0 or 5.0 from the given ``reviews`` collection.

    Both ratings are matched in a single pool before sampling, so the split between 1-star and 5-star documents in
    the result is not fixed by this function.
    """
    one_star = {"overall": 1.0}
    five_star = {"overall": 5.0}

    # Keep only the two extremes of the rating scale, then sample from that pool.
    only_extremes = {"$match": {"$or": [one_star, five_star]}}
    random_pick = {"$sample": {"size": amount}}

    pipeline = [only_extremes, random_pick]
    return reviews.aggregate(pipeline)
|
|
|
|
|
|
|
|
|
|
|
|
def get_reviews_training_set(
    reviews: pymongo.collection.Collection,
    amount: int,
) -> t.Iterable[Review]:
    """
    Build the training set from the given ``reviews`` collection.

    Requests ``amount // 2`` reviews rated 5.0 and the remaining ``amount - amount // 2`` reviews rated 1.0, then
    returns them as one list with the 5.0-rated reviews first.
    """
    # Floor division: when ``amount`` is odd, the extra review is requested from the negative side.
    half = amount // 2
    remainder = amount - half

    # Issue both sampling queries before consuming either result.
    five_star = sample_reviews_by_rating(reviews, 5.0, half)
    one_star = sample_reviews_by_rating(reviews, 1.0, remainder)

    # No shuffling is needed at this point, so the two samples are simply concatenated.
    training_set = list(five_star)
    training_set.extend(one_star)
    return training_set
|
|
|
|
|
|
|
|
|
|
|
|
def get_reviews_test_set(
    reviews: pymongo.collection.Collection,
    amount: int,
) -> t.Iterable[Review]:
    """
    Build the test set from the given ``reviews`` collection.

    Delegates entirely to :func:`sample_reviews_by_rating_polar`, forwarding ``reviews`` and ``amount`` as they are.
    """
    test_set = sample_reviews_by_rating_polar(reviews, amount)
    return test_set
|