mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-21 23:44:19 +00:00
Add missing file
This commit is contained in:
parent
027f8e07e8
commit
704624507a
1 changed files with 74 additions and 0 deletions
74
unimore_bda_6/gathering.py
Normal file
74
unimore_bda_6/gathering.py
Normal file
|
@ -0,0 +1,74 @@
|
|||
import typing as t
|
||||
import contextlib
|
||||
import dataclasses
|
||||
import logging
|
||||
import pymongo
|
||||
|
||||
from .config import TRAINING_SET_SIZE, VALIDATION_SET_SIZE, EVALUATION_SET_SIZE
|
||||
from .database import SampleFunc, CachedDatasetFunc, mongo_client_from_config, reviews_collection, store_cache, load_cache, delete_cache
|
||||
|
||||
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Caches:
    """
    Container for the three generators that can create datasets.

    Each field holds the object returned by `load_cache` for the corresponding on-disk cache directory.
    """

    training: CachedDatasetFunc
    validation: CachedDatasetFunc
    evaluation: CachedDatasetFunc

    @classmethod
    @contextlib.contextmanager
    def from_database_samples(cls, collection: pymongo.collection.Collection, sample_func: SampleFunc) -> t.ContextManager["Caches"]:
        """
        Create a new caches object from a database collection and a sampling function.

        The function samples the three datasets with ``sample_func``, stores each of them on disk with `store_cache`,
        reloads them with `load_cache`, and yields a `Caches` instance wrapping the loaded caches.

        The on-disk caches are deleted when the ``with`` block is left, **even if the block raised an exception**.

        :param collection: The collection passed as first argument to ``sample_func``.
        :param sample_func: The function called as ``sample_func(collection, size)`` to obtain each dataset.
        :return: A context manager yielding the `Caches` instance.
        """

        log.debug("Gathering datasets...")
        # NOTE(review): `__wrapped__` presumably unwraps the lazily-resolved config values - confirm against `.config`.
        reviews_training = sample_func(collection, TRAINING_SET_SIZE.__wrapped__)
        reviews_validation = sample_func(collection, VALIDATION_SET_SIZE.__wrapped__)
        reviews_evaluation = sample_func(collection, EVALUATION_SET_SIZE.__wrapped__)

        log.debug("Caching datasets...")
        store_cache(reviews_training, "./data/training")
        store_cache(reviews_validation, "./data/validation")
        store_cache(reviews_evaluation, "./data/evaluation")

        log.debug("Loading dataset caches...")
        training_cache = load_cache("./data/training")
        validation_cache = load_cache("./data/validation")
        evaluation_cache = load_cache("./data/evaluation")

        try:
            yield Caches(training=training_cache, validation=validation_cache, evaluation=evaluation_cache)
        finally:
            # An exception raised inside the `with` body is re-raised at the `yield` above;
            # without `finally` the cleanup would be skipped and the caches left on disk.
            log.debug("Cleaning up caches...")
            delete_cache("./data/training")
            delete_cache("./data/validation")
            delete_cache("./data/evaluation")

    @staticmethod
    def ensure_clean():
        """
        Delete any cache left over on disk, ignoring the ones that do not exist.
        """
        log.debug("Ensuring there are no leftover caches...")

        for path in ("./data/training", "./data/validation", "./data/evaluation"):
            # A missing cache is the desired state, so it is not an error here.
            with contextlib.suppress(FileNotFoundError):
                delete_cache(path)
|
||||
|
||||
|
||||
# Public API of this module.
__all__ = ("Caches",)
|
Loading…
Reference in a new issue