bda-6-steffo/unimore_bda_6/database/cache.py (mirror of https://github.com/Steffo99/unimore-bda-6.git)

import typing as t
import logging
import shutil
import pathlib
import pickle

from .datatypes import Review

log = logging.getLogger(__name__)

# A dataset is exposed as a zero-argument callable that returns a fresh generator of `Review` objects.
DatasetFunc = t.Callable[[], t.Generator[Review, t.Any, None]]


def store_cache(reviews: t.Iterator[Review], path: str | pathlib.Path) -> None:
    """
    Store each `Review` from the given iterator as a separate pickle file inside a new directory created at the given path.
    """
    path = pathlib.Path(path)

    if path.exists():
        raise FileExistsError("Specified cache path already exists.")

    # Create the cache directory
    log.debug("Creating cache directory: %s", path)
    path.mkdir(parents=True)

    # Write the documents to path/{index}.pickle
    for index, document in enumerate(reviews):
        document_path = path.joinpath(f"{index}.pickle")

        log.debug("Storing pickle file: %s", document_path)
        with open(document_path, "wb") as file:
            pickle.dump(document, file)


def load_cache(path: str | pathlib.Path) -> DatasetFunc:
    """
    Load the pickle files stored in the directory at the given path, returning a `DatasetFunc` that yields the cached `Review` objects.
    """
    path = pathlib.Path(path)

    if not path.exists():
        log.error("Specified cache directory does not exist: %s", path)
        raise FileNotFoundError("The specified path does not exist.")

    def data_cache_loader():
        document_paths = path.iterdir()
        for document_path in document_paths:
            # Skip anything that is not a pickle file, such as stray metadata files
            if not str(document_path).endswith(".pickle"):
                log.debug("Ignoring non-pickle file: %s", document_path)
                continue

            log.debug("Loading pickle file: %s", document_path)
            with open(document_path, "rb") as file:
                result: Review = pickle.load(file)
            yield result

    return data_cache_loader


__all__ = (
    "DatasetFunc",
    "store_cache",
    "load_cache",
)
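

# --- Usage sketch (not part of the original module) ---
# A minimal, hypothetical example of how store_cache and load_cache could be combined.
# The import path, the cache location, and make_reviews() are assumptions for illustration;
# in the real project the iterator would yield Review objects rather than plain dicts.

import pathlib

from unimore_bda_6.database.cache import store_cache, load_cache


def make_reviews():
    # Stand-in for whatever produces the Review iterator (for example, a database query);
    # plain dicts are used here only so the sketch runs on its own.
    yield {"text": "Great read!", "rating": 5.0}
    yield {"text": "Not for me.", "rating": 2.0}


cache_path = pathlib.Path("data/cache/train")  # hypothetical cache location

if not cache_path.exists():
    store_cache(make_reviews(), cache_path)

dataset_func = load_cache(cache_path)

# Each call to the returned DatasetFunc opens a fresh generator over the cached pickles,
# so the same dataset can be re-iterated from the start (for example, once per training epoch).
for review in dataset_func():
    print(review)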