bda-6-steffo/unimore_bda_6/database/cache.py (mirror of https://github.com/Steffo99/unimore-bda-6.git)

import typing as t
import logging
import shutil
import pathlib
import pickle

from .datatypes import Review

log = logging.getLogger(__name__)

# A dataset is exposed as a zero-argument callable that returns a fresh generator of `Review` objects.
DatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]]


def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
    """
    Store each `Review` from the given iterator as a separate pickle file inside a new directory created at the given path.
    """
    path = pathlib.Path(path)

    if path.exists():
        raise FileExistsError("Specified cache path already exists.")

    # Create the cache directory
    log.debug("Creating cache directory: %s", path)
    path.mkdir(parents=True)

    # Write the documents to path/{index}.pickle
    for index, document in enumerate(reviews):
        document_path = path.joinpath(f"{index}.pickle")

        log.debug("Storing pickle file: %s", document_path)
        with open(document_path, "wb") as file:
            pickle.dump(document, file)


def load_cache(path: str | pathlib.Path) -> DatasetFunc:
    """
    Load the pickle files stored in the directory at the given path, returning a `DatasetFunc` that yields the cached `Review` objects.
    """
    path = pathlib.Path(path)

    if not path.exists():
        log.error("Specified cache directory does not exist: %s", path)
        raise FileNotFoundError("The specified path does not exist.")

    def data_cache_loader():
        document_paths = path.iterdir()
        for document_path in document_paths:
            # Skip anything that is not a pickle file, such as stray metadata files
            if not str(document_path).endswith(".pickle"):
                log.debug("Ignoring non-pickle file: %s", document_path)
                continue

            log.debug("Loading pickle file: %s", document_path)
            with open(document_path, "rb") as file:
                result: Review = pickle.load(file)
            yield result

    return data_cache_loader


__all__ = (
    "DatasetFunc",
    "store_cache",
    "load_cache",
)
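

# --- Usage sketch (not part of the original module) ---
# A minimal, hypothetical example of how store_cache and load_cache could be combined.
# The import path, the cache location, and make_reviews() are assumptions for illustration;
# in the real project the iterator would yield Review objects rather than plain dicts.

import pathlib

from unimore_bda_6.database.cache import store_cache, load_cache


def make_reviews():
    # Stand-in for whatever produces the Review iterator (for example, a database query);
    # plain dicts are used here only so the sketch runs on its own.
    yield {"text": "Great read!", "rating": 5.0}
    yield {"text": "Not for me.", "rating": 2.0}


cache_path = pathlib.Path("data/cache/train")  # hypothetical cache location

if not cache_path.exists():
    store_cache(make_reviews(), cache_path)

dataset_func = load_cache(cache_path)

# Each call to the returned DatasetFunc opens a fresh generator over the cached pickles,
# so the same dataset can be re-iterated from the start (for example, once per training epoch).
for review in dataset_func():
    print(review)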