Configure working set

2024-11-24 16:54:20 +00:00 · 2023-02-02 04:07:17 +01:00 · 2023-02-02 04:07:17 +01:00 · ded20c33e1
commit ded20c33e1
parent 14d1e1a22f
2 changed files with 21 additions and 4 deletions
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@ -28,6 +28,22 @@ def MONGO_PORT(val: str | None) -> int:
        raise cfig.InvalidValueError("Not an int.")
@config.optional()
def WORKING_SET_SIZE(val: str | None) -> int:
    """
    The number of reviews to consider from the database.
    Set this to a low number to prevent slowness due to the dataset's huge size.
    Must be a positive integer.
    Defaults to `10000`.
    """
    # Unset: fall back to the documented default.
    if val is None:
        return 10000
    try:
        size = int(val)
    except ValueError as err:
        # Keep the parsing failure attached as the explicit cause.
        raise cfig.InvalidValueError("Not an int.") from err
    # The value is passed to a MongoDB `$limit` stage, which only accepts
    # positive integers; reject bad values here instead of at query time.
    if size <= 0:
        raise cfig.InvalidValueError("Not a positive int.")
    return size
@config.optional()
 def TRAINING_SET_SIZE(val: str | None) -> int:
    """
@ -62,6 +78,7 @@ __all__ = (
    "config",
    "MONGO_HOST",
    "MONGO_PORT",
    "WORKING_SET_SIZE",
    "TRAINING_SET_SIZE",
    "TEST_SET_SIZE",
    "NLTK_DOUBLE_NEG_SWITCH",
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@ -5,7 +5,7 @@ import contextlib
 import bson
 import logging
-from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE
+from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE
 log = logging.getLogger(__name__)
@ -61,7 +61,7 @@ def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Ite
    log.debug("Getting a sample of %d reviews...", amount)
    return reviews.aggregate([
-        {"$limit": 10000},  # TODO
+        {"$limit": WORKING_SET_SIZE.__wrapped__},
        {"$sample": {"size": amount}},
    ])
@ -73,7 +73,7 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
    log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
    return reviews.aggregate([
-        {"$limit": 10000},  # TODO
+        {"$limit": WORKING_SET_SIZE.__wrapped__},
        {"$match": {"overall": rating}},
        {"$sample": {"size": amount}},
    ])
@ -86,7 +86,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
    log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
    return reviews.aggregate([
-        {"$limit": 10000},  # TODO
+        {"$limit": WORKING_SET_SIZE.__wrapped__},
        {"$match":
            {"$or":
                [