2023-02-01 03:20:09 +00:00
|
|
|
import typing as t
|
|
|
|
import pymongo
|
|
|
|
import pymongo.collection
|
|
|
|
import contextlib
|
|
|
|
import bson
|
2023-02-01 16:46:25 +00:00
|
|
|
import logging
|
2023-02-03 01:10:00 +00:00
|
|
|
import itertools
|
2023-02-04 04:16:54 +00:00
|
|
|
import collections
|
2023-02-01 03:20:09 +00:00
|
|
|
|
2023-02-03 22:27:44 +00:00
|
|
|
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE
|
2023-02-01 16:46:25 +00:00
|
|
|
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
2023-02-01 03:20:09 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Review(t.TypedDict):
    """
    Typed view of a single document of the ``reviews`` collection.

    Field notes marked "presumably" are inferred from the field names only and should be confirmed against the data.
    """

    # MongoDB document identifier
    _id: bson.ObjectId

    reviewerID: str
    asin: str  # presumably the identifier of the reviewed product -- TODO confirm
    reviewerName: str
    helpful: tuple[int, int]  # presumably (helpful votes, total votes) -- TODO confirm

    # Body of the review; used as the ``text`` of a ``DataTuple``
    reviewText: str
    # Star rating; the rating-based sampler of this module filters on this field
    overall: float

    summary: str
    unixReviewTime: int  # presumably seconds since the Unix epoch -- TODO confirm
    reviewTime: str
|
|
|
|
|
|
|
|
|
2023-02-03 22:27:44 +00:00
|
|
|
# The text of a review.
Text = str

# The category (star rating) assigned to a review.
Category = float

# A (text, category) pair, the unit a dataset is made of.
DataTuple = collections.namedtuple("DataTuple", ("text", "category"))

# Any iterable of data tuples.
DataSet = t.Iterable[DataTuple]
|
|
|
|
|
|
|
|
|
2023-02-01 03:20:09 +00:00
|
|
|
@contextlib.contextmanager
def mongo_client_from_config() -> t.Iterator[pymongo.MongoClient]:
    """
    Create a new MongoDB client, yield it, and close it when the ``with`` block is left.

    The host and the port are read from the ``MONGO_HOST`` and ``MONGO_PORT`` configuration values.

    The client is always closed, even if the body of the ``with`` block or the initial address lookup raises.

    :return: Once decorated, a context manager whose ``as`` target is the open :class:`pymongo.MongoClient`.
    """
    log.debug("Opening connection to MongoDB...")
    # ``__wrapped__`` is read to get the plain value out of the configuration objects
    client: pymongo.MongoClient = pymongo.MongoClient(
        host=MONGO_HOST.__wrapped__,
        port=MONGO_PORT.__wrapped__,
    )

    try:
        # Reading ``client.address`` happens inside the ``try`` so that a failure here still closes the client
        log.info("Opened connection to MongoDB at %s!", client.address)
        yield client
    finally:
        # Runs on normal exit as well as when an exception propagates out of the ``with`` body
        log.info("Closing connection to MongoDB...")
        client.close()
        log.debug("Closed connection to MongoDB!")
|
2023-02-01 03:20:09 +00:00
|
|
|
|
|
|
|
|
|
|
|
@contextlib.contextmanager
def mongo_reviews_collection_from_config() -> t.Iterator[pymongo.collection.Collection[Review]]:
    """
    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.

    The underlying client is the one created by :func:`mongo_client_from_config`, and is released when the ``with``
    block is left.

    :return: Once decorated, a context manager whose ``as`` target is the ``reviews.reviews`` collection.
    """
    with mongo_client_from_config() as client:
        log.debug("Accessing the reviews collection...")
        # First attribute is the database name, second is the collection name
        collection = client.reviews.reviews
        log.debug("Collection accessed successfully: %s", collection)
        yield collection
|
|
|
|
|
|
|
|
|
2023-02-03 01:10:00 +00:00
|
|
|
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
    """
    Draw ``amount`` random reviews from the ``reviews`` collection.

    Sampling is restricted to the first ``WORKING_SET_SIZE`` documents produced by the collection.

    :param reviews: The collection to run the aggregation on.
    :param amount: The sample size requested from the ``$sample`` stage.
    :return: The result of the aggregation, to be iterated for the sampled reviews.
    """
    log.debug("Getting a sample of %d reviews...", amount)

    pipeline = [
        # Cap the documents taken into consideration...
        {"$limit": WORKING_SET_SIZE.__wrapped__},
        # ...then pick randomly among them
        {"$sample": {"size": amount}},
    ]
    return reviews.aggregate(pipeline)
|
|
|
|
|
|
|
|
|
2023-02-03 01:10:00 +00:00
|
|
|
def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
    """
    Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.

    The rating filter is applied **after** the working set has been limited to ``WORKING_SET_SIZE`` documents, so only
    matching reviews within that working set can be sampled.

    :param reviews: The collection to run the aggregation on.
    :param rating: The value the ``overall`` field of the review must be equal to.
    :param amount: The sample size requested from the ``$sample`` stage.
    :return: The result of the aggregation, to be iterated for the sampled reviews.
    """
    # ``rating`` is a float: ``%s`` logs it as-is, where ``%d`` would truncate it (and fail on NaN / infinity)
    log.debug("Getting a sample of %d reviews with %s stars...", amount, rating)

    return reviews.aggregate([
        # Cap the documents taken into consideration
        {"$limit": WORKING_SET_SIZE.__wrapped__},
        # Keep only the reviews with the requested rating
        {"$match": {"overall": rating}},
        # Pick randomly among what is left
        {"$sample": {"size": amount}},
    ])
|
|
|
|
|
|
|
|
|
2023-02-04 04:16:54 +00:00
|
|
|
def review_to_datatuple(review: Review) -> DataTuple:
    """
    Convert the given review into a :class:`DataTuple`.

    The ``reviewText`` of the review becomes the ``text`` of the tuple, and its ``overall`` rating becomes the
    ``category``.

    The ratings requested by the dataset builders of this module are:

    * terrible (1.0)
    * negative (2.0)
    * mixed (3.0)
    * positive (4.0)
    * great (5.0)

    No validation is performed here: any other value is passed through unchanged.

    :param review: The review to convert.
    :return: The resulting ``(text, category)`` tuple.
    :raises KeyError: If the review has no ``reviewText`` or no ``overall`` key.
    """
    return DataTuple(text=review["reviewText"], category=review["overall"])
|
2023-02-03 22:27:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
def polar_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
    """
    Build an iterator over data tuples sampled in equal amounts from 5-star and 1-star reviews.

    The 5-star reviews come first, followed by the 1-star ones.

    :param collection: The collection to sample the reviews from.
    :param amount: The sample size requested for **each** of the two ratings.
    :return: A lazy iterator of data tuples; reviews are converted as they are consumed.
    """
    log.info("Building polar dataset with %d reviews...", amount * 2)

    # Both samples are requested right away, the 5-star one first
    five_star = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
    one_star = sample_reviews_by_rating(collection, rating=1.0, amount=amount)

    # Join the two samples and convert each review lazily
    return map(review_to_datatuple, itertools.chain(five_star, one_star))
|
2023-02-01 03:20:09 +00:00
|
|
|
|
|
|
|
|
2023-02-03 22:27:44 +00:00
|
|
|
def varied_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
    """
    Build an iterator over data tuples sampled in equal amounts from reviews of each rating, from 1 to 5 stars.

    Reviews are ordered by rating: terrible (1.0), negative (2.0), mixed (3.0), positive (4.0), and great (5.0).

    :param collection: The collection to sample the reviews from.
    :param amount: The sample size requested for **each** of the five ratings.
    :return: A lazy iterator of data tuples; reviews are converted as they are consumed.
    """
    log.info("Building varied dataset with %d reviews...", amount * 5)

    # A list (not a generator) so that all five samples are requested right away, in ascending rating order
    samples = [
        sample_reviews_by_rating(collection, rating=stars, amount=amount)
        for stars in (1.0, 2.0, 3.0, 4.0, 5.0)
    ]

    # Join the samples and convert each review lazily
    return map(review_to_datatuple, itertools.chain.from_iterable(samples))
|
2023-02-01 16:46:25 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Names exported by ``from <module> import *``.
__all__ = (
    # Types
    "Review",
    "Text",
    "Category",
    "DataTuple",
    "DataSet",
    # Database access
    "mongo_client_from_config",
    "mongo_reviews_collection_from_config",
    # Sampling
    "sample_reviews",
    "sample_reviews_by_rating",
    # Dataset builders
    "polar_dataset",
    "varied_dataset",
)
|