1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-22 16:04:18 +00:00
bda-6-steffo/unimore_bda_6/database.py
2023-02-01 04:20:09 +01:00

101 lines
2.7 KiB
Python

import typing as t
import pymongo
import pymongo.collection
import contextlib
import bson
from .config import MONGO_HOST, MONGO_PORT
class Review(t.TypedDict):
    """
    Typed description of the documents stored in the ``reviews`` collection.
    """

    # MongoDB document identifier.
    _id: bson.ObjectId
    # Identifier of the user who wrote the review.
    reviewerID: str
    # Identifier of the reviewed product -- presumably an Amazon ASIN; confirm against the dataset.
    asin: str
    reviewerName: str
    # NOTE(review): presumably (helpful votes, total votes); BSON arrays are usually decoded as lists, not tuples -- confirm.
    helpful: tuple[int, int]
    reviewText: str
    # Star rating; the sampling functions of this module compare it against 1.0 and 5.0.
    overall: float
    summary: str
    # Presumably seconds since the Unix epoch -- confirm against the dataset.
    unixReviewTime: int
    # Presumably a human-readable rendering of the review date -- confirm against the dataset.
    reviewTime: str
@contextlib.contextmanager
def mongo_client_from_config() -> t.Iterator[pymongo.MongoClient]:
    """
    Create a new MongoDB client using the configured host and port, and yield it.

    The client is closed when the ``with`` block is left, including when the block raises an exception.
    """
    client = pymongo.MongoClient(
        host=MONGO_HOST.__resolved__,
        port=MONGO_PORT.__resolved__,
    )
    try:
        yield client
    finally:
        # An exception raised inside the caller's ``with`` body is re-raised at the ``yield``;
        # the ``finally`` guarantees the connection is released in that case too.
        client.close()
@contextlib.contextmanager
def mongo_reviews_collection_from_config() -> t.Iterator[pymongo.collection.Collection[Review]]:
    """
    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.

    The lifetime of the underlying client is managed by ``mongo_client_from_config``.
    """
    with mongo_client_from_config() as client:
        # First attribute access selects the ``reviews`` database, the second the ``reviews`` collection.
        yield client.reviews.reviews
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
    """
    Pick ``amount`` reviews at random from the given ``reviews`` collection.

    The random selection is delegated to the database through a ``$sample`` aggregation stage.
    """
    sample_stage = {"$sample": {"size": amount}}
    pipeline = [sample_stage]
    return reviews.aggregate(pipeline)
def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterable[Review]:
    """
    Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.

    The reviews are first restricted to the ones whose ``overall`` field equals ``rating``, then sampled.
    """
    return reviews.aggregate([
        # ``$match`` is the pipeline stage that selects documents;
        # ``$filter`` is an array expression operator and is not accepted as a stage.
        {"$match": {"overall": rating}},
        {"$sample": {"size": amount}},
    ])
def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
    """
    Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.

    The reviews are first restricted to the ones whose ``overall`` field is ``1.0`` or ``5.0``, then sampled.
    """
    return reviews.aggregate([
        # ``$match`` is the document-selecting stage (``$filter`` only works on arrays inside expressions),
        # and ``$in`` is the operator that tests a field against a set of values
        # (``$or`` takes a list of sub-queries at the top level, not a list of values).
        {"$match": {"overall": {"$in": [1.0, 5.0]}}},
        {"$sample": {"size": amount}},
    ])
def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
    """
    Build the training set out of 5-star and 1-star reviews.

    Half of ``amount`` (rounded down) is requested as 5-star reviews, the remainder as 1-star reviews;
    the result is a list with the 5-star reviews first, followed by the 1-star ones.
    """
    # When ``amount`` is odd, the extra review goes to the 1-star half
    half = amount // 2
    remainder = amount - half

    # Ask the database for both samples
    five_star = sample_reviews_by_rating(reviews, 5.0, half)
    one_star = sample_reviews_by_rating(reviews, 1.0, remainder)

    # Ordering is irrelevant for the caller, so the two samples are simply concatenated
    merged = list(five_star)
    merged.extend(one_star)
    return merged
def get_reviews_test_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
    """
    Select the reviews that make up the test set.

    The selection is delegated to ``sample_reviews_by_rating_polar``, called with the same ``reviews`` and ``amount``.
    """
    test_set = sample_reviews_by_rating_polar(reviews, amount)
    return test_set