1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-25 17:24:20 +00:00
bda-6-steffo/unimore_bda_6/database.py

176 lines
4.8 KiB
Python
Raw Normal View History

2023-02-01 03:20:09 +00:00
import typing as t
import pymongo
import pymongo.collection
import contextlib
import bson
2023-02-01 16:46:25 +00:00
import logging
2023-02-03 01:10:00 +00:00
import itertools
2023-02-01 03:20:09 +00:00
2023-02-03 22:27:44 +00:00
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE
2023-02-01 16:46:25 +00:00
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
2023-02-01 03:20:09 +00:00
class Review(t.TypedDict):
    """
    Shape of a document stored in the ``reviews`` collection.

    Only ``reviewText`` and ``overall`` are read by this module; the meaning of the other fields is not established
    here and is marked as an assumption below.
    """
    # MongoDB document identifier.
    _id: bson.ObjectId
    # Presumably the identifier of the review's author -- confirm against the dataset.
    reviewerID: str
    # Presumably the identifier of the reviewed product -- confirm against the dataset.
    asin: str
    # Presumably the display name of the review's author -- confirm against the dataset.
    reviewerName: str
    # Presumably a pair of vote counts; order and meaning are not shown here -- TODO confirm.
    helpful: tuple[int, int]
    # Body of the review; used as the text of the datatuple built by review_to_datatuple.
    reviewText: str
    # Star rating; review_to_datatuple maps 1.0 through 5.0 to a category.
    overall: float
    # Presumably a short title for the review -- confirm against the dataset.
    summary: str
    # Presumably the review time as a Unix timestamp -- confirm against the dataset.
    unixReviewTime: int
    # Presumably the review time as a formatted string -- confirm against the dataset.
    reviewTime: str
2023-02-03 22:27:44 +00:00
# Type aliases describing the examples produced by the dataset functions of this module.
Text = str  # the text of a review
Category = str  # the label assigned to a review
DataTuple = tuple[Text, Category]  # a single (text, category) example
DataSet = t.Iterable[DataTuple]  # an iterable of examples
2023-02-01 03:20:09 +00:00
@contextlib.contextmanager
def mongo_client_from_config() -> t.Iterator[pymongo.MongoClient]:
    """
    Create a new MongoDB client and yield it.

    The client connects to the host and port given by ``MONGO_HOST`` and ``MONGO_PORT``.
    It is always closed when the ``with`` block is left, even if the block raises an exception.
    """
    log.debug("Opening connection to MongoDB...")
    client: pymongo.MongoClient = pymongo.MongoClient(
        host=MONGO_HOST.__wrapped__,
        port=MONGO_PORT.__wrapped__,
    )

    try:
        # Reading the address is kept inside the try block so that the client is closed even if it fails.
        log.info("Opened connection to MongoDB at %s!", client.address)
        yield client
    finally:
        # Without this finally clause, an exception raised by the caller would leak the connection.
        log.info("Closing connection to MongoDB...")
        client.close()
        log.debug("Closed connection to MongoDB!")
2023-02-01 03:20:09 +00:00
@contextlib.contextmanager
def mongo_reviews_collection_from_config() -> t.Iterator[pymongo.collection.Collection[Review]]:
    """
    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.

    The underlying client is created and managed by :func:`mongo_client_from_config`.
    """
    with mongo_client_from_config() as client:
        log.debug("Accessing the reviews collection...")
        # The first ``reviews`` is the database name, the second one is the collection name.
        collection = client.reviews.reviews
        log.debug("Collection accessed successfully: %s", collection)
        yield collection
2023-02-03 01:10:00 +00:00
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
    """
    Get ``amount`` random reviews from the ``reviews`` collection.

    The sample is drawn from the first ``WORKING_SET_SIZE`` documents of the collection only, since the ``$limit``
    stage runs before the ``$sample`` stage.
    """
    log.debug("Getting a sample of %d reviews...", amount)

    restrict_to_working_set = {"$limit": WORKING_SET_SIZE.__wrapped__}
    pick_randomly = {"$sample": {"size": amount}}

    return reviews.aggregate([restrict_to_working_set, pick_randomly])
2023-02-03 01:10:00 +00:00
def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
    """
    Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.

    Stages run in order: the collection is first cut down to its first ``WORKING_SET_SIZE`` documents, then filtered
    by ``overall`` rating, then sampled; fewer than ``amount`` reviews may therefore be available.
    """
    log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)

    restrict_to_working_set = {"$limit": WORKING_SET_SIZE.__wrapped__}
    keep_matching_rating = {"$match": {"overall": rating}}
    pick_randomly = {"$sample": {"size": amount}}

    return reviews.aggregate([restrict_to_working_set, keep_matching_rating, pick_randomly])
2023-02-03 22:27:44 +00:00
def review_to_datatuple(review: Review) -> tuple[Text, Category]:
    """
    Convert the given review into a ``(text, category)`` datatuple.

    The text is the review's ``reviewText``; the category is chosen from its ``overall`` rating:

    * terrible (1.0)
    * negative (2.0)
    * mixed (3.0)
    * positive (4.0)
    * great (5.0)
    * unknown (everything else)
    """
    # Ordered (rating, category) pairs; the first pair whose rating equals the review's rating wins.
    categories_by_rating = (
        (1.0, "terrible"),
        (2.0, "negative"),
        (3.0, "mixed"),
        (4.0, "positive"),
        (5.0, "great"),
    )

    body = review["reviewText"]
    stars = review["overall"]

    label = next(
        (name for value, name in categories_by_rating if stars == value),
        "unknown",
    )
    return body, label
def polar_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
    """
    Get an iterator over datatuples built from up to ``amount`` 5-star reviews followed by up to ``amount`` 1-star reviews.
    """
    log.info("Building polar dataset with %d reviews...", amount * 2)

    # Both samples are requested immediately, 5-star first.
    five_star = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
    one_star = sample_reviews_by_rating(collection, rating=1.0, amount=amount)

    # Reviews are converted to datatuples lazily, as the result is consumed.
    return map(review_to_datatuple, itertools.chain(five_star, one_star))
2023-02-01 03:20:09 +00:00
2023-02-03 22:27:44 +00:00
def varied_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
    """
    Get an iterator over datatuples built from up to ``amount`` reviews for each rating, from 1 to 5 stars in order.
    """
    log.info("Building varied dataset with %d reviews...", amount * 5)

    # All five samples are requested immediately, in ascending order of rating.
    samples_by_rating = [
        sample_reviews_by_rating(collection, rating=stars, amount=amount)
        for stars in (1.0, 2.0, 3.0, 4.0, 5.0)
    ]

    # Reviews are converted to datatuples lazily, as the result is consumed.
    return map(review_to_datatuple, itertools.chain(*samples_by_rating))
2023-02-01 16:46:25 +00:00
# Public API of this module: the names exported by a star-import.
__all__ = (
    "Review",
    "Text",
    "Category",
    "DataTuple",
    "DataSet",
    "mongo_client_from_config",
    "mongo_reviews_collection_from_config",
    "sample_reviews",
    "sample_reviews_by_rating",
    "review_to_datatuple",
    "polar_dataset",
    "varied_dataset",
)