From ded20c33e1c52f8619a404450bd035030446d87e Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi <me@steffo.eu>
Date: Thu, 2 Feb 2023 04:07:17 +0100
Subject: [PATCH] Configure working set

---
 unimore_bda_6/config.py   | 17 +++++++++++++++++
 unimore_bda_6/database.py |  8 ++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py
index 9173139..62ccdf9 100644
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@@ -28,6 +28,22 @@ def MONGO_PORT(val: str | None) -> int:
         raise cfig.InvalidValueError("Not an int.")
 
 
+@config.optional()
+def WORKING_SET_SIZE(val: str | None) -> int:
+    """
+    The maximum number of reviews to consider from the database, parsed as an int.
+    Set this to a low number to prevent slowness due to the dataset's huge size.
+
+    Defaults to `10000` when no value is provided; a non-integer value raises `cfig.InvalidValueError`.
+    """
+    if val is None:
+        return 10000
+    try:
+        return int(val)  # NOTE(review): zero/negative values pass through unchecked; this feeds MongoDB `$limit` — confirm whether a positivity check is needed
+    except ValueError:
+        raise cfig.InvalidValueError("Not an int.")
+
+
 @config.optional()
 def TRAINING_SET_SIZE(val: str | None) -> int:
     """
@@ -62,6 +78,7 @@ __all__ = (
     "config",
     "MONGO_HOST",
     "MONGO_PORT",
+    "WORKING_SET_SIZE",
     "TRAINING_SET_SIZE",
     "TEST_SET_SIZE",
     "NLTK_DOUBLE_NEG_SWITCH",
diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py
index 29090da..8f1c174 100644
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@@ -5,7 +5,7 @@ import contextlib
 import bson
 import logging
 
-from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE
+from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE
 
 log = logging.getLogger(__name__)
 
@@ -61,7 +61,7 @@ def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Ite
     log.debug("Getting a sample of %d reviews...", amount)
 
     return reviews.aggregate([
-        {"$limit": 10000},  # TODO
+        {"$limit": WORKING_SET_SIZE.__wrapped__},
         {"$sample": {"size": amount}},
     ])
 
@@ -73,7 +73,7 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
     log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
 
     return reviews.aggregate([
-        {"$limit": 10000},  # TODO
+        {"$limit": WORKING_SET_SIZE.__wrapped__},
         {"$match": {"overall": rating}},
         {"$sample": {"size": amount}},
     ])
@@ -86,7 +86,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
     log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
 
     return reviews.aggregate([
-        {"$limit": 10000},  # TODO
+        {"$limit": WORKING_SET_SIZE.__wrapped__},
         {"$match":
             {"$or":
                 [