1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2025-02-16 14:13:59 +00:00
bda-6-steffo/unimore_bda_6/database.py

209 lines
6.3 KiB
Python
Raw Normal View History

2023-02-01 04:20:09 +01:00
import typing as t
import pymongo
import pymongo.collection
import contextlib
import bson
2023-02-01 17:46:25 +01:00
import logging
2023-02-04 06:14:24 +01:00
import tensorflow
2023-02-01 04:20:09 +01:00
2023-02-03 23:27:44 +01:00
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE
2023-02-01 17:46:25 +01:00
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
2023-02-01 04:20:09 +01:00
2023-02-04 06:14:24 +01:00
class MongoReview(t.TypedDict):
    """
    Shape of a review document as it is stored on MongoDB.

    .. warning:: Do not instantiate: this is only for type hints!
    """

    # Identifier of the document.
    _id: bson.ObjectId

    # NOTE(review): the field names below look like the Amazon product reviews dataset schema — confirm.
    reviewerID: str
    asin: str
    reviewerName: str

    # NOTE(review): presumably a pair of vote counts; verify against the stored documents.
    helpful: tuple[int, int]

    # Read by ``Review.from_mongoreview`` as the text of the review.
    reviewText: str

    # Read by ``Review.from_mongoreview`` as the category of the review.
    overall: float

    summary: str
    unixReviewTime: int
    reviewTime: str
2023-02-03 23:27:44 +01:00
# Alias for the type of the text of a review.
Text = str
# Alias for the type of the category of a review.
Category = float
2023-02-04 05:34:56 +01:00
2023-02-04 06:14:24 +01:00
class Review:
    """
    A review reduced to its text and its category.

    Instances can be indexed either by position (``review[0]`` is the text, ``review[1]`` is the category)
    or by name (``review["text"]``, ``review["category"]``).
    """

    def __init__(self, text: Text, category: Category):
        self.text: Text = text
        self.category: Category = category

    @classmethod
    def from_mongoreview(cls, review: MongoReview) -> "Review":
        """
        Create a new instance from a review document as it is stored on MongoDB.

        The ``reviewText`` field becomes the text, and the ``overall`` field becomes the category.
        """
        return cls(
            text=review["reviewText"],
            category=review["overall"],
        )

    def __repr__(self) -> str:
        return f"<{self.__class__.__qualname__}: [{self.category}] {self.text}>"

    def __getitem__(self, item):
        """
        Get the text (``0`` or ``"text"``) or the category (``1`` or ``"category"``) of the review.

        :raises KeyError: If ``item`` is none of the supported keys.
        """
        if item == 0 or item == "text":
            return self.text
        elif item == 1 or item == "category":
            return self.category
        else:
            raise KeyError(item)

    def to_tensor_tuple(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
        """
        Convert the review to a ``(text, category)`` tuple of tensors.

        The text is converted to a string tensor, and the category to a ``float32`` tensor.
        """
        return (
            tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string),
            # The category is a float: requesting a string tensor here makes the conversion fail.
            tensorflow.convert_to_tensor(self.category, dtype=tensorflow.float32),
        )
2023-02-03 23:27:44 +01:00
2023-02-01 04:20:09 +01:00
@contextlib.contextmanager
def mongo_client_from_config() -> t.Iterator[pymongo.MongoClient]:
    """
    Create a new MongoDB client, yield it, and close it when the context is exited.

    The host and the port are read from ``MONGO_HOST`` and ``MONGO_PORT``.

    The client is closed even if the body of the ``with`` block raises an exception.
    """
    log.debug("Opening connection to MongoDB...")
    client: pymongo.MongoClient = pymongo.MongoClient(
        host=MONGO_HOST.__wrapped__,
        port=MONGO_PORT.__wrapped__,
    )
    log.info("Opened connection to MongoDB!")

    try:
        yield client
    finally:
        # Without the ``finally``, an exception raised inside the ``with`` block would skip the cleanup.
        log.info("Closing connection to MongoDB...")
        client.close()
        log.debug("Closed connection to MongoDB!")
2023-02-01 04:20:09 +01:00
@contextlib.contextmanager
def mongo_reviews_collection_from_config() -> t.Iterator[pymongo.collection.Collection[MongoReview]]:
    """
    Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.

    The underlying client is the one created by :func:`mongo_client_from_config`, and it stays open only for the
    duration of the ``with`` block.
    """
    with mongo_client_from_config() as client:
        log.debug("Accessing the reviews collection...")
        # First attribute access selects the database, second one selects the collection.
        collection = client.reviews.reviews
        log.debug("Collection accessed successfully: %s", collection)
        yield collection
2023-02-04 06:14:24 +01:00
class DatasetFunc(t.Protocol):
    """
    Structural type of a callable taking no arguments and returning an iterator of :class:`Review`.
    """

    def __call__(self) -> t.Iterator[Review]:
        ...
def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
    """
    Get ``amount`` random reviews from the ``reviews`` collection.

    The aggregation first applies a ``$limit`` of ``WORKING_SET_SIZE`` documents, then a ``$sample`` of size
    ``amount``.

    :param collection: The collection to run the aggregation on.
    :param amount: The size passed to the ``$sample`` stage.
    :return: A lazy iterator converting each returned document to a :class:`Review`.
    """
    log.debug("Getting a sample of %d reviews...", amount)

    pipeline = [
        {"$limit": WORKING_SET_SIZE.__wrapped__},
        {"$sample": {"size": amount}},
    ]
    documents = collection.aggregate(pipeline)

    return map(Review.from_mongoreview, documents)
2023-02-01 04:20:09 +01:00
2023-02-04 06:14:24 +01:00
def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[Review]:
    """
    Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.

    The aggregation first applies a ``$limit`` of ``WORKING_SET_SIZE`` documents, then keeps the ones whose
    ``overall`` field equals ``rating``, then applies a ``$sample`` of size ``amount``.

    :param collection: The collection to run the aggregation on.
    :param rating: The value the ``overall`` field is matched against.
    :param amount: The size passed to the ``$sample`` stage.
    :return: A lazy iterator converting each returned document to a :class:`Review`.
    """
    log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)

    pipeline = [
        {"$limit": WORKING_SET_SIZE.__wrapped__},
        {"$match": {"overall": rating}},
        {"$sample": {"size": amount}},
    ]
    documents = collection.aggregate(pipeline)

    return map(Review.from_mongoreview, documents)
2023-02-01 04:20:09 +01:00
2023-02-03 23:27:44 +01:00
2023-02-04 06:14:24 +01:00
def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
    """
    Get a sample of reviews whose ``overall`` is ``1.0``, followed by a sample of reviews whose ``overall`` is ``5.0``.

    Each of the two samples is taken with a ``$sample`` of size ``amount`` after a ``$limit`` of
    ``WORKING_SET_SIZE`` documents, and the two are joined with ``$unionWith`` on the same collection.

    :param collection: The collection to run the aggregation on.
    :param amount: The size passed to each of the two ``$sample`` stages.
    :return: A lazy iterator converting each returned document to a :class:`Review`.
    """
    log.debug("Getting a sample of %d polar reviews...", amount * 2)

    working_set_size = WORKING_SET_SIZE.__wrapped__

    highest_rating_stages = [
        {"$limit": working_set_size},
        {"$match": {"overall": 5.0}},
        {"$sample": {"size": amount}},
    ]
    lowest_rating_stages = [
        {"$limit": working_set_size},
        {"$match": {"overall": 1.0}},
        {"$sample": {"size": amount}},
        {"$unionWith": {
            "coll": collection.name,
            "pipeline": highest_rating_stages,
        }},
    ]

    documents = collection.aggregate(lowest_rating_stages)
    return map(Review.from_mongoreview, documents)
2023-02-01 04:20:09 +01:00
2023-02-04 06:14:24 +01:00
def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
    """
    Get a sample of reviews for each ``overall`` value from ``1.0`` to ``5.0``.

    For every rating, the sample is taken with a ``$sample`` of size ``amount`` after a ``$limit`` of
    ``WORKING_SET_SIZE`` documents. The pipeline of each rating ends with a ``$unionWith`` stage containing the
    pipeline of the next rating, except for the one of ``5.0``, which is the innermost.

    :param collection: The collection to run the aggregation on.
    :param amount: The size passed to each of the five ``$sample`` stages.
    :return: A lazy iterator converting each returned document to a :class:`Review`.
    """
    log.debug("Getting a sample of %d varied reviews...", amount * 5)

    working_set_size = WORKING_SET_SIZE.__wrapped__

    # Build the nested pipeline from the inside out: 5.0 is the innermost, 1.0 is the outermost.
    stages: list = []
    for rating in (5.0, 4.0, 3.0, 2.0, 1.0):
        outer_stages = [
            {"$limit": working_set_size},
            {"$match": {"overall": rating}},
            {"$sample": {"size": amount}},
        ]
        if stages:
            outer_stages.append({"$unionWith": {
                "coll": collection.name,
                "pipeline": stages,
            }})
        stages = outer_stages

    documents = collection.aggregate(stages)
    return map(Review.from_mongoreview, documents)
2023-02-01 17:46:25 +01:00
# Names exported by ``from <this module> import *``.
__all__ = (
    # Types
    "Text",
    "Category",
    "Review",
    "DatasetFunc",
    # Connection helpers
    "mongo_client_from_config",
    "mongo_reviews_collection_from_config",
    # Sampling queries
    "sample_reviews",
    "sample_reviews_by_rating",
    "sample_reviews_polar",
    "sample_reviews_varied",
)