1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-22 16:04:18 +00:00
bda-6-steffo/unimore_bda_6/database/queries.py

137 lines
4.3 KiB
Python
Raw Normal View History

2023-02-01 16:46:25 +00:00
import logging
import pymongo
import typing as t
2023-02-01 03:20:09 +00:00
from ..config import WORKING_SET_SIZE
2023-02-12 04:11:58 +00:00
from .datatypes import TextReview
2023-02-01 16:46:25 +00:00
# Module-level logger, named after this module; used by the sampling functions for debug output.
log = logging.getLogger(__name__)
2023-02-01 03:20:09 +00:00
2023-02-12 04:11:58 +00:00
# Type alias for a sampling callable: it receives a MongoDB collection and the number of reviews
# requested, and returns an iterator of TextReview objects.
SampleFunc = t.Callable[[pymongo.collection.Collection, int], t.Iterator[TextReview]]
2023-02-08 18:46:05 +00:00
2023-02-12 04:11:58 +00:00
def sample_reviews(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
    """
    Draw ``amount`` random reviews from ``collection``.

    The aggregation applies ``$limit`` (using the configured ``WORKING_SET_SIZE``) before ``$sample``,
    so the random sample is taken only from the documents that pass the limit stage.

    :param collection: The MongoDB collection to run the aggregation on.
    :param amount: The ``size`` passed to the ``$sample`` stage.
    :return: A lazy iterator converting each returned document with ``TextReview.from_mongoreview``.
    """
    log.debug("Getting a sample of %d reviews...", amount)

    pipeline = [
        {"$limit": WORKING_SET_SIZE.__wrapped__},
        {"$sample": {"size": amount}},
    ]
    documents = collection.aggregate(pipeline)

    # map() is lazy: documents are converted one at a time as the caller iterates.
    return map(TextReview.from_mongoreview, documents)
2023-02-01 03:20:09 +00:00
2023-02-04 05:14:24 +00:00
2023-02-12 04:11:58 +00:00
def sample_reviews_by_rating(collection: pymongo.collection.Collection, rating: float, amount: int) -> t.Iterator[TextReview]:
    """
    Get ``amount`` random reviews with ``rating`` stars from ``collection``.

    The aggregation applies ``$limit`` (using the configured ``WORKING_SET_SIZE``) first, then keeps only
    documents whose ``overall`` field equals ``rating``, and finally samples ``amount`` of them.

    :param collection: The MongoDB collection to run the aggregation on.
    :param rating: The exact value the ``overall`` field must match.
    :param amount: The ``size`` passed to the ``$sample`` stage.
    :return: A lazy iterator converting each returned document with ``TextReview.from_mongoreview``.
    """
    log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)

    cursor = collection.aggregate([
        {"$limit": WORKING_SET_SIZE.__wrapped__},
        {"$match": {"overall": rating}},
        {"$sample": {"size": amount}},
    ])

    # Convert the raw documents so the result matches the declared return type,
    # consistently with the other sampling functions in this module.
    cursor = map(TextReview.from_mongoreview, cursor)
    return cursor
2023-02-01 03:20:09 +00:00
2023-02-03 22:27:44 +00:00
2023-02-12 04:11:58 +00:00
def sample_reviews_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
    """
    Get a shuffled sample made of reviews with ``overall`` equal to ``1.0`` and ``5.0`` in equal parts.

    Each of the two ratings contributes at most ``amount // 2`` documents, so an odd ``amount`` is
    rounded down to the nearest even number.

    :param collection: The MongoDB collection to run the aggregation on.
    :param amount: The total number of reviews requested.
    :return: A lazy iterator converting each returned document with ``TextReview.from_mongoreview``.
    """
    per_rating = amount // 2
    log.debug("Getting a sample of %d polar reviews...", per_rating * 2)

    def rating_stages(rating: float) -> list:
        # Limit, filter by rating, then sample: the same three stages are used for both ratings.
        return [
            {"$limit": WORKING_SET_SIZE.__wrapped__},
            {"$match": {"overall": rating}},
            {"$sample": {"size": per_rating}},
        ]

    pipeline = rating_stages(1.0)
    pipeline.append({"$unionWith": {
        "coll": collection.name,
        "pipeline": rating_stages(5.0),
    }})
    # Attach a random key to every document and sort on it, so the two groups end up interleaved.
    pipeline.append({"$addFields": {"sortKey": {"$rand": {}}}})
    pipeline.append({"$sort": {"sortKey": 1}})

    documents = collection.aggregate(pipeline)
    return map(TextReview.from_mongoreview, documents)
2023-02-01 03:20:09 +00:00
2023-02-12 04:11:58 +00:00
def sample_reviews_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[TextReview]:
    """
    Get a shuffled sample with an equal share of reviews for each ``overall`` value from ``1.0`` to ``5.0``.

    Each rating contributes at most ``amount // 5`` documents, so ``amount`` is effectively rounded down
    to a multiple of five.

    :param collection: The MongoDB collection to run the aggregation on.
    :param amount: The total number of reviews requested.
    :return: A lazy iterator converting each returned document with ``TextReview.from_mongoreview``.
    """
    per_rating = amount // 5
    log.debug("Getting a sample of %d varied reviews...", per_rating * 5)

    def rating_stages(rating: float) -> list:
        # Limit, filter by rating, then sample: the same three stages are used for every rating.
        return [
            {"$limit": WORKING_SET_SIZE.__wrapped__},
            {"$match": {"overall": rating}},
            {"$sample": {"size": per_rating}},
        ]

    def union_with(sub_pipeline: list) -> dict:
        return {"$unionWith": {
            "coll": collection.name,
            "pipeline": sub_pipeline,
        }}

    # The unions are nested rather than chained: build them inside-out, starting from the
    # innermost rating (5.0) and wrapping it with each lower rating in turn.
    nested = rating_stages(5.0)
    for rating in (4.0, 3.0, 2.0):
        nested = rating_stages(rating) + [union_with(nested)]

    pipeline = rating_stages(1.0) + [
        union_with(nested),
        # Attach a random key to every document and sort on it, so the groups end up interleaved.
        {"$addFields": {"sortKey": {"$rand": {}}}},
        {"$sort": {"sortKey": 1}},
    ]

    documents = collection.aggregate(pipeline)
    return map(TextReview.from_mongoreview, documents)
2023-02-01 16:46:25 +00:00
# Names exported by ``from <this module> import *``.
__all__ = (
    "SampleFunc",
    "sample_reviews",
    "sample_reviews_by_rating",
    "sample_reviews_polar",
    "sample_reviews_varied",
)