1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-24 16:54:20 +00:00

Configure working set

This commit is contained in:
Steffo 2023-02-02 04:07:17 +01:00
parent 14d1e1a22f
commit ded20c33e1
Signed by: steffo
GPG key ID: 2A24051445686895
2 changed files with 21 additions and 4 deletions

View file

@ -28,6 +28,22 @@ def MONGO_PORT(val: str | None) -> int:
raise cfig.InvalidValueError("Not an int.")
@config.optional()
def WORKING_SET_SIZE(val: str | None) -> int:
    """
    The number of reviews to consider from the database.
    Set this to a low number to prevent slowness due to the dataset's huge size.
    Must be a positive integer.

    Defaults to `10000`.
    """
    # No value configured: fall back to the documented default.
    if val is None:
        return 10000
    try:
        size = int(val)
    except ValueError as exc:
        # Chain the original error so the underlying parse failure stays visible in tracebacks.
        raise cfig.InvalidValueError("Not an int.") from exc
    # The value is passed to a MongoDB `$limit` aggregation stage, which only accepts
    # positive integers; reject bad values at configuration time rather than at query time.
    if size <= 0:
        raise cfig.InvalidValueError("Not a positive int.")
    return size
@config.optional()
def TRAINING_SET_SIZE(val: str | None) -> int:
"""
@ -62,6 +78,7 @@ __all__ = (
"config",
"MONGO_HOST",
"MONGO_PORT",
"WORKING_SET_SIZE",
"TRAINING_SET_SIZE",
"TEST_SET_SIZE",
"NLTK_DOUBLE_NEG_SWITCH",

View file

@ -5,7 +5,7 @@ import contextlib
import bson
import logging
from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE
log = logging.getLogger(__name__)
@ -61,7 +61,7 @@ def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Ite
log.debug("Getting a sample of %d reviews...", amount)
return reviews.aggregate([
{"$limit": 10000}, # TODO
{"$limit": WORKING_SET_SIZE.__wrapped__},
{"$sample": {"size": amount}},
])
@ -73,7 +73,7 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
return reviews.aggregate([
{"$limit": 10000}, # TODO
{"$limit": WORKING_SET_SIZE.__wrapped__},
{"$match": {"overall": rating}},
{"$sample": {"size": amount}},
])
@ -86,7 +86,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
return reviews.aggregate([
{"$limit": 10000}, # TODO
{"$limit": WORKING_SET_SIZE.__wrapped__},
{"$match":
{"$or":
[