From 61141248db7184fc2d03eae61e5addf8028fcde2 Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi
Date: Sat, 18 Feb 2023 03:18:34 +0100
Subject: [PATCH] Some memory usage tweaks

---
 .vscode/launch.json                  |   1 -
 unimore_bda_6/__main__.py            | 130 ++++++++++++++-------
 unimore_bda_6/database/connection.py |   2 +-
 3 files changed, 69 insertions(+), 64 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 0432d79..ee81316 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -12,7 +12,6 @@
             "justMyCode": false,
             "env": {
                 "NLTK_DATA": "./data/nltk",
-                "DATA_SET_SIZE": "250",
                 "XLA_FLAGS": "--xla_gpu_cuda_data_dir=/opt/cuda"
             },
             "cwd": "${workspaceFolder}",
diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index c42952c..5e20772 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -31,88 +31,94 @@ def main():
             log.fatal("MongoDB database is not available, exiting...")
             exit(1)

-        reviews = reviews_collection(db)
+    for sample_func in [
+        sample_reviews_polar,
+        sample_reviews_varied,
+    ]:

-        for sample_func in [
-            sample_reviews_polar,
-            sample_reviews_varied,
+        slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
+        slog.debug("Selected sample_func: %s", sample_func.__name__)
+
+        for SentimentAnalyzer in [
+            # ThreeCheat,
+            NLTKSentimentAnalyzer,
+            TensorflowPolarSentimentAnalyzer,
+            TensorflowCategorySentimentAnalyzer,
         ]:

-            slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
-            slog.debug("Selected sample_func: %s", sample_func.__name__)
+            slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
+            slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)

-            for SentimentAnalyzer in [
-                # ThreeCheat,
-                NLTKSentimentAnalyzer,
-                TensorflowPolarSentimentAnalyzer,
-                TensorflowCategorySentimentAnalyzer,
+            for Tokenizer in [
+                PlainTokenizer,
+                LowercaseTokenizer,
+                NLTKWordTokenizer,
+                PottsTokenizer,
+                PottsTokenizerWithNegation,
+                HuggingBertTokenizer,
             ]:

-                slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
-                slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
+                log.debug("Running garbage collection...")
+                garbage_count = gc.collect()
+                log.debug("Collected %d pieces of garbage!", garbage_count)

-                for Tokenizer in [
-                    PlainTokenizer,
-                    LowercaseTokenizer,
-                    NLTKWordTokenizer,
-                    PottsTokenizer,
-                    PottsTokenizerWithNegation,
-                    HuggingBertTokenizer,
-                ]:
+                slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
+                slog.debug("Selected Tokenizer: %s", Tokenizer.__name__)

-                    log.debug("Running garbage collection...")
-                    garbage_count = gc.collect()
-                    log.debug("Collected %d pieces of garbage!", garbage_count)
+                runs = 0
+                successful_runs = 0
+                cumulative_evaluation_results = EvaluationResults()
+
+                while True:

                     slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
-                    slog.debug("Selected Tokenizer: %s", Tokenizer.__name__)
-                    runs = 0
-                    successful_runs = 0
-                    cumulative_evaluation_results = EvaluationResults()

+                    if successful_runs >= TARGET_RUNS.__wrapped__:
+                        slog.info("Reached target of %d runs, moving on...", TARGET_RUNS.__wrapped__)
+                        break

-                    while True:
+                    if runs >= MAXIMUM_RUNS.__wrapped__:
+                        slog.fatal("Exceeded %d runs, giving up and exiting...", MAXIMUM_RUNS.__wrapped__)
+                        break

-                        slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
+                    runs += 1
+                    slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}.{runs}")
+                    slog.debug("Run #%d", runs)

-                        if successful_runs >= TARGET_RUNS.__wrapped__:
-                            slog.info("Reached target of %d runs, moving on...", TARGET_RUNS.__wrapped__)
-                            break
+                    try:
+                        slog.debug("Instantiating %s with %s...", SentimentAnalyzer.__name__, Tokenizer.__name__)
+                        sa = SentimentAnalyzer(tokenizer=Tokenizer())
+                    except TypeError:
+                        slog.warning("%s is not supported by %s, skipping run...", SentimentAnalyzer.__name__, Tokenizer.__name__)
+                        break

-                        if runs >= MAXIMUM_RUNS.__wrapped__:
-                            slog.fatal("Exceeded %d runs, giving up and exiting...", MAXIMUM_RUNS.__wrapped__)
-                            break
-                        runs += 1
-                        slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}.{runs}")
-                        slog.debug("Run #%d", runs)
+                    with mongo_client_from_config() as db:
+                        reviews = reviews_collection(db)
+                        datasets_cm = Caches.from_database_samples(collection=reviews, sample_func=sample_func)
+                        datasets = datasets_cm.__enter__()

+                    try:
                         try:
-                            slog.debug("Instantiating %s with %s...", SentimentAnalyzer.__name__, Tokenizer.__name__)
-                            sa = SentimentAnalyzer(tokenizer=Tokenizer())
-                        except TypeError:
-                            slog.warning("%s is not supported by %s, skipping run...", SentimentAnalyzer.__name__, Tokenizer.__name__)
+                            slog.info("Training sentiment analyzer: %s", sa)
+                            sa.train(training_dataset_func=datasets.training, validation_dataset_func=datasets.validation)
+
+                        except TrainingFailedError:
+                            slog.error("Training failed, trying again with a different dataset...")
+                            continue
+
+                        else:
+                            slog.info("Training succeeded!")
+                            slog.info("Evaluating sentiment analyzer: %s", sa)
+                            evaluation_results = sa.evaluate(evaluation_dataset_func=datasets.evaluation)
+                            slog.info("Evaluation results: %s", evaluation_results)
+                            successful_runs += 1
+                            cumulative_evaluation_results += evaluation_results
                             break
+                    finally:
+                        datasets_cm.__exit__()

-                        with Caches.from_database_samples(collection=reviews, sample_func=sample_func) as datasets:
-                            try:
-                                slog.info("Training sentiment analyzer: %s", sa)
-                                sa.train(training_dataset_func=datasets.training, validation_dataset_func=datasets.validation)
-
-                            except TrainingFailedError:
-                                slog.error("Training failed, trying again with a different dataset...")
-                                continue
-
-                            else:
-                                slog.info("Training succeeded!")
-                                slog.info("Evaluating sentiment analyzer: %s", sa)
-                                evaluation_results = sa.evaluate(evaluation_dataset_func=datasets.evaluation)
-                                slog.info("Evaluation results: %s", evaluation_results)
-                                successful_runs += 1
-                                cumulative_evaluation_results += evaluation_results
-                                break
-
-                    slog.info("Cumulative evaluation results: %s", cumulative_evaluation_results)
+                slog.info("Cumulative evaluation results: %s", cumulative_evaluation_results)


 if __name__ == "__main__":
diff --git a/unimore_bda_6/database/connection.py b/unimore_bda_6/database/connection.py
index 5e3d703..8d0352a 100644
--- a/unimore_bda_6/database/connection.py
+++ b/unimore_bda_6/database/connection.py
@@ -23,7 +23,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:

     yield client

-    log.info("Closing connection to MongoDB...")
+    log.debug("Closing connection to MongoDB...")
     client.close()
     log.debug("Closed connection to MongoDB!")