From ded20c33e1c52f8619a404450bd035030446d87e Mon Sep 17 00:00:00 2001 From: Stefano Pigozzi Date: Thu, 2 Feb 2023 04:07:17 +0100 Subject: [PATCH] Configure working set --- unimore_bda_6/config.py | 17 +++++++++++++++++ unimore_bda_6/database.py | 8 ++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py index 9173139..62ccdf9 100644 --- a/unimore_bda_6/config.py +++ b/unimore_bda_6/config.py @@ -28,6 +28,22 @@ def MONGO_PORT(val: str | None) -> int: raise cfig.InvalidValueError("Not an int.") +@config.optional() +def WORKING_SET_SIZE(val: str | None) -> int: + """ + The number of reviews to consider from the database. + Set this to a low number to prevent slowness due to the dataset's huge size. + + Defaults to `10000`. + """ + if val is None: + return 10000 + try: + return int(val) + except ValueError: + raise cfig.InvalidValueError("Not an int.") + + @config.optional() def TRAINING_SET_SIZE(val: str | None) -> int: """ @@ -62,6 +78,7 @@ __all__ = ( "config", "MONGO_HOST", "MONGO_PORT", + "WORKING_SET_SIZE", "TRAINING_SET_SIZE", "TEST_SET_SIZE", "NLTK_DOUBLE_NEG_SWITCH", diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py index 29090da..8f1c174 100644 --- a/unimore_bda_6/database.py +++ b/unimore_bda_6/database.py @@ -5,7 +5,7 @@ import contextlib import bson import logging -from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE +from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE log = logging.getLogger(__name__) @@ -61,7 +61,7 @@ def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Ite log.debug("Getting a sample of %d reviews...", amount) return reviews.aggregate([ - {"$limit": 10000}, # TODO + {"$limit": WORKING_SET_SIZE.__wrapped__}, {"$sample": {"size": amount}}, ]) @@ -73,7 +73,7 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo log.debug("Getting a sample of %d reviews with %d stars...", amount, rating) return reviews.aggregate([ - {"$limit": 10000}, # TODO + {"$limit": WORKING_SET_SIZE.__wrapped__}, {"$match": {"overall": rating}}, {"$sample": {"size": amount}}, ]) @@ -86,7 +86,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount) return reviews.aggregate([ - {"$limit": 10000}, # TODO + {"$limit": WORKING_SET_SIZE.__wrapped__}, {"$match": {"$or": [