1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-22 07:54:19 +00:00
bda-6-steffo/unimore_bda_6/database.py

164 lines
4.7 KiB
Python
Raw Normal View History

2023-02-01 03:20:09 +00:00
import typing as t
import pymongo
import pymongo.collection
import contextlib
import bson
2023-02-01 16:46:25 +00:00
import logging
import random
2023-02-01 03:20:09 +00:00
2023-02-01 16:46:25 +00:00
from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
2023-02-01 03:20:09 +00:00
class Review(t.TypedDict):
    """
    Shape of a single document of the ``reviews`` collection, as handled by this module.
    """

    # Document identifier assigned by MongoDB.
    _id: bson.ObjectId
    # NOTE(review): presumably the identifier of the user who wrote the review -- confirm against the dataset.
    reviewerID: str
    # NOTE(review): presumably the identifier of the reviewed product -- confirm against the dataset.
    asin: str
    reviewerName: str
    # NOTE(review): pair of integers; their exact meaning is not visible from this module.
    helpful: tuple[int, int]
    reviewText: str
    # Star rating; the sampling queries of this module match it against values such as ``1.0`` and ``5.0``.
    overall: float
    summary: str
    # NOTE(review): presumably a UNIX timestamp, going by the field name -- confirm.
    unixReviewTime: int
    reviewTime: str
@contextlib.contextmanager
def mongo_client_from_config() -> t.Iterator[pymongo.MongoClient]:
    """
    Create a new MongoDB client and yield it.

    The client connects to the host and port configured in ``MONGO_HOST`` and ``MONGO_PORT``.
    It is closed when the ``with`` block is left, including when the block is left because of an exception.

    :return: When used in a ``with`` statement, the connected :class:`pymongo.MongoClient`.
    """
    log.debug("Opening connection to MongoDB...")
    client = pymongo.MongoClient(
        host=MONGO_HOST.__wrapped__,
        port=MONGO_PORT.__wrapped__,
    )
    log.info("Opened connection to MongoDB: %s", client)

    try:
        yield client
    finally:
        # An exception raised inside the caller's ``with`` body is re-raised at the ``yield`` above;
        # the ``finally`` guarantees that the connection is released in that case too.
        log.info("Closing connection to MongoDB: %s", client)
        client.close()
        log.debug("Closed connection to MongoDB!")
2023-02-01 03:20:09 +00:00
@contextlib.contextmanager
def mongo_reviews_collection_from_config() -> t.Iterator[pymongo.collection.Collection[Review]]:
    """
    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.

    The underlying client is managed by :func:`mongo_client_from_config`, so the connection only lasts as long as
    the ``with`` block using this context manager.

    :return: When used in a ``with`` statement, the ``reviews.reviews`` collection.
    """
    with mongo_client_from_config() as client:
        log.debug("Accessing the reviews collection...")
        # First attribute access selects the ``reviews`` database, the second the ``reviews`` collection inside it.
        collection = client.reviews.reviews
        log.debug("Collection accessed successfully: %s", collection)
        yield collection
def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list:
    """
    Create pipeline stages for sampling random documents, either with true randomness or by skipping a random amount of them.

    The strategy is selected by the ``SAMPLE_MODE`` configuration value:

    - ``"$sample"``: a single ``$sample`` stage of size ``amount``;
    - ``"$limit"``: a ``$skip`` stage with a random offset followed by a ``$limit`` stage of size ``amount``.

    :param collection: The collection the stages are going to be run on; only queried (for its estimated size) in ``"$limit"`` mode.
    :param amount: The number of documents the stages should select.
    :return: The list of aggregation stages, to be placed at the end of a pipeline.
    :raises ValueError: If ``SAMPLE_MODE`` holds any other value.
    """
    mode = SAMPLE_MODE.__wrapped__

    if mode == "$sample":
        return [
            {"$sample": {"size": amount}},
        ]

    if mode == "$limit":
        log.warning("SAMPLE_MODE is set to $limit, sampling documents using $skip and $limit.")
        count = collection.estimated_document_count(maxTimeMS=100)
        # ``random.randint`` includes both bounds: cap the offset so that ``amount`` documents are still left after
        # the skipped ones, and never let the upper bound go below zero on small collections.
        skip = random.randint(0, max(0, count - amount))
        # NOTE: ``count`` refers to the whole collection; if earlier stages of the pipeline filter documents out,
        # fewer than ``amount`` documents may still be left after the skip.
        return [
            {"$skip": skip},
            {"$limit": amount},
        ]

    raise ValueError("Unknown sample mode", mode)
2023-02-01 03:20:09 +00:00
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
    """
    Pick ``amount`` reviews at random out of the ``reviews`` collection.

    The pipeline consists only of the sampling stages built by :func:`pipeline_sample`.

    :param reviews: The collection to sample from.
    :param amount: How many reviews to request.
    :return: The result of the aggregation, to be iterated by the caller.
    """
    log.debug("Getting a sample of %d reviews...", amount)
    stages = list(pipeline_sample(reviews, amount))
    return reviews.aggregate(stages)
def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterable[Review]:
    """
    Pick ``amount`` reviews at random among those of the ``reviews`` collection having exactly ``rating`` stars.

    :param reviews: The collection to sample from.
    :param rating: The value the ``overall`` field of the selected reviews must be equal to.
    :param amount: How many reviews to request.
    :return: The result of the aggregation, to be iterated by the caller.
    """
    log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
    # Filter first, so that the sampling stages only see reviews with the requested rating.
    match_stage = {"$match": {"overall": rating}}
    sampling_stages = pipeline_sample(reviews, amount)
    return reviews.aggregate([match_stage, *sampling_stages])
def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
    """
    Pick ``amount`` reviews at random among those of the ``reviews`` collection rated either 1 star or 5 stars.

    :param reviews: The collection to sample from.
    :param amount: How many reviews to request, counting both ratings together.
    :return: The result of the aggregation, to be iterated by the caller.
    """
    log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
    # A review qualifies if its ``overall`` field equals either of the two extreme ratings.
    polar_ratings = [
        {"overall": 1.0},
        {"overall": 5.0},
    ]
    match_stage = {"$match": {"$or": polar_ratings}}
    sampling_stages = pipeline_sample(reviews, amount)
    return reviews.aggregate([match_stage, *sampling_stages])
2023-02-01 16:46:25 +00:00
def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
    """
    Build the training set out of the ``reviews`` collection.

    The set size is read from ``TRAINING_SET_SIZE`` and split between 5-star and 1-star reviews; when the size is odd,
    the 1-star half gets the extra review.

    :param reviews: The collection to sample from.
    :return: A list with the 5-star reviews first, followed by the 1-star ones.
    """
    log.info("Building training set...")

    total: int = TRAINING_SET_SIZE.__wrapped__

    # Floor half goes to the positives; the remainder of the division (0 or 1) is added to the negatives.
    positive_count, leftover = divmod(total, 2)
    negative_count = positive_count + leftover

    five_star = sample_reviews_by_rating(reviews, 5.0, positive_count)
    one_star = sample_reviews_by_rating(reviews, 1.0, negative_count)

    # The order of the elements is not relevant here, so the two samples are simply concatenated.
    training_set = list(five_star)
    training_set.extend(one_star)
    return training_set
2023-02-01 16:46:25 +00:00
def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
    """
    Build the test set out of the ``reviews`` collection.

    The set contains ``TEST_SET_SIZE`` reviews sampled with :func:`sample_reviews_by_rating_polar`.

    :param reviews: The collection to sample from.
    :return: The result of the sampling, to be iterated by the caller.
    """
    log.info("Building test set...")
    return sample_reviews_by_rating_polar(reviews, TEST_SET_SIZE.__wrapped__)
2023-02-01 16:46:25 +00:00
# Names exported by ``from <this module> import *``.
# NOTE(review): ``pipeline_sample`` is not listed here -- confirm whether it is meant to be private.
__all__ = (
    "Review",
    "mongo_client_from_config",
    "mongo_reviews_collection_from_config",
    "sample_reviews",
    "sample_reviews_by_rating",
    "sample_reviews_by_rating_polar",
    "get_reviews_training_set",
    "get_reviews_test_set",
)