mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-21 23:44:19 +00:00
Some memory usage tweaks
This commit is contained in:
parent
35616d35c7
commit
61141248db
3 changed files with 69 additions and 64 deletions
1
.vscode/launch.json
vendored
1
.vscode/launch.json
vendored
|
@ -12,7 +12,6 @@
|
||||||
"justMyCode": false,
|
"justMyCode": false,
|
||||||
"env": {
|
"env": {
|
||||||
"NLTK_DATA": "./data/nltk",
|
"NLTK_DATA": "./data/nltk",
|
||||||
"DATA_SET_SIZE": "250",
|
|
||||||
"XLA_FLAGS": "--xla_gpu_cuda_data_dir=/opt/cuda"
|
"XLA_FLAGS": "--xla_gpu_cuda_data_dir=/opt/cuda"
|
||||||
},
|
},
|
||||||
"cwd": "${workspaceFolder}",
|
"cwd": "${workspaceFolder}",
|
||||||
|
|
|
@ -31,88 +31,94 @@ def main():
|
||||||
log.fatal("MongoDB database is not available, exiting...")
|
log.fatal("MongoDB database is not available, exiting...")
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
reviews = reviews_collection(db)
|
for sample_func in [
|
||||||
|
sample_reviews_polar,
|
||||||
|
sample_reviews_varied,
|
||||||
|
]:
|
||||||
|
|
||||||
for sample_func in [
|
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
|
||||||
sample_reviews_polar,
|
slog.debug("Selected sample_func: %s", sample_func.__name__)
|
||||||
sample_reviews_varied,
|
|
||||||
|
for SentimentAnalyzer in [
|
||||||
|
# ThreeCheat,
|
||||||
|
NLTKSentimentAnalyzer,
|
||||||
|
TensorflowPolarSentimentAnalyzer,
|
||||||
|
TensorflowCategorySentimentAnalyzer,
|
||||||
]:
|
]:
|
||||||
|
|
||||||
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
|
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
|
||||||
slog.debug("Selected sample_func: %s", sample_func.__name__)
|
slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
|
||||||
|
|
||||||
for SentimentAnalyzer in [
|
for Tokenizer in [
|
||||||
# ThreeCheat,
|
PlainTokenizer,
|
||||||
NLTKSentimentAnalyzer,
|
LowercaseTokenizer,
|
||||||
TensorflowPolarSentimentAnalyzer,
|
NLTKWordTokenizer,
|
||||||
TensorflowCategorySentimentAnalyzer,
|
PottsTokenizer,
|
||||||
|
PottsTokenizerWithNegation,
|
||||||
|
HuggingBertTokenizer,
|
||||||
]:
|
]:
|
||||||
|
|
||||||
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
|
log.debug("Running garbage collection...")
|
||||||
slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
|
garbage_count = gc.collect()
|
||||||
|
log.debug("Collected %d pieces of garbage!", garbage_count)
|
||||||
|
|
||||||
for Tokenizer in [
|
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
|
||||||
PlainTokenizer,
|
slog.debug("Selected Tokenizer: %s", Tokenizer.__name__)
|
||||||
LowercaseTokenizer,
|
|
||||||
NLTKWordTokenizer,
|
|
||||||
PottsTokenizer,
|
|
||||||
PottsTokenizerWithNegation,
|
|
||||||
HuggingBertTokenizer,
|
|
||||||
]:
|
|
||||||
|
|
||||||
log.debug("Running garbage collection...")
|
runs = 0
|
||||||
garbage_count = gc.collect()
|
successful_runs = 0
|
||||||
log.debug("Collected %d pieces of garbage!", garbage_count)
|
cumulative_evaluation_results = EvaluationResults()
|
||||||
|
|
||||||
|
while True:
|
||||||
|
|
||||||
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
|
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
|
||||||
slog.debug("Selected Tokenizer: %s", Tokenizer.__name__)
|
|
||||||
|
|
||||||
runs = 0
|
if successful_runs >= TARGET_RUNS.__wrapped__:
|
||||||
successful_runs = 0
|
slog.info("Reached target of %d runs, moving on...", TARGET_RUNS.__wrapped__)
|
||||||
cumulative_evaluation_results = EvaluationResults()
|
break
|
||||||
|
|
||||||
while True:
|
if runs >= MAXIMUM_RUNS.__wrapped__:
|
||||||
|
slog.fatal("Exceeded %d runs, giving up and exiting...", MAXIMUM_RUNS.__wrapped__)
|
||||||
|
break
|
||||||
|
|
||||||
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
|
runs += 1
|
||||||
|
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}.{runs}")
|
||||||
|
slog.debug("Run #%d", runs)
|
||||||
|
|
||||||
if successful_runs >= TARGET_RUNS.__wrapped__:
|
try:
|
||||||
slog.info("Reached target of %d runs, moving on...", TARGET_RUNS.__wrapped__)
|
slog.debug("Instantiating %s with %s...", SentimentAnalyzer.__name__, Tokenizer.__name__)
|
||||||
break
|
sa = SentimentAnalyzer(tokenizer=Tokenizer())
|
||||||
|
except TypeError:
|
||||||
|
slog.warning("%s is not supported by %s, skipping run...", SentimentAnalyzer.__name__, Tokenizer.__name__)
|
||||||
|
break
|
||||||
|
|
||||||
if runs >= MAXIMUM_RUNS.__wrapped__:
|
|
||||||
slog.fatal("Exceeded %d runs, giving up and exiting...", MAXIMUM_RUNS.__wrapped__)
|
|
||||||
break
|
|
||||||
|
|
||||||
runs += 1
|
with mongo_client_from_config() as db:
|
||||||
slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}.{runs}")
|
reviews = reviews_collection(db)
|
||||||
slog.debug("Run #%d", runs)
|
datasets_cm = Caches.from_database_samples(collection=reviews, sample_func=sample_func)
|
||||||
|
datasets = datasets_cm.__enter__()
|
||||||
|
|
||||||
|
try:
|
||||||
try:
|
try:
|
||||||
slog.debug("Instantiating %s with %s...", SentimentAnalyzer.__name__, Tokenizer.__name__)
|
slog.info("Training sentiment analyzer: %s", sa)
|
||||||
sa = SentimentAnalyzer(tokenizer=Tokenizer())
|
sa.train(training_dataset_func=datasets.training, validation_dataset_func=datasets.validation)
|
||||||
except TypeError:
|
|
||||||
slog.warning("%s is not supported by %s, skipping run...", SentimentAnalyzer.__name__, Tokenizer.__name__)
|
except TrainingFailedError:
|
||||||
|
slog.error("Training failed, trying again with a different dataset...")
|
||||||
|
continue
|
||||||
|
|
||||||
|
else:
|
||||||
|
slog.info("Training succeeded!")
|
||||||
|
slog.info("Evaluating sentiment analyzer: %s", sa)
|
||||||
|
evaluation_results = sa.evaluate(evaluation_dataset_func=datasets.evaluation)
|
||||||
|
slog.info("Evaluation results: %s", evaluation_results)
|
||||||
|
successful_runs += 1
|
||||||
|
cumulative_evaluation_results += evaluation_results
|
||||||
break
|
break
|
||||||
|
finally:
|
||||||
|
datasets_cm.__exit__()
|
||||||
|
|
||||||
with Caches.from_database_samples(collection=reviews, sample_func=sample_func) as datasets:
|
slog.info("Cumulative evaluation results: %s", cumulative_evaluation_results)
|
||||||
try:
|
|
||||||
slog.info("Training sentiment analyzer: %s", sa)
|
|
||||||
sa.train(training_dataset_func=datasets.training, validation_dataset_func=datasets.validation)
|
|
||||||
|
|
||||||
except TrainingFailedError:
|
|
||||||
slog.error("Training failed, trying again with a different dataset...")
|
|
||||||
continue
|
|
||||||
|
|
||||||
else:
|
|
||||||
slog.info("Training succeeded!")
|
|
||||||
slog.info("Evaluating sentiment analyzer: %s", sa)
|
|
||||||
evaluation_results = sa.evaluate(evaluation_dataset_func=datasets.evaluation)
|
|
||||||
slog.info("Evaluation results: %s", evaluation_results)
|
|
||||||
successful_runs += 1
|
|
||||||
cumulative_evaluation_results += evaluation_results
|
|
||||||
break
|
|
||||||
|
|
||||||
slog.info("Cumulative evaluation results: %s", cumulative_evaluation_results)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -23,7 +23,7 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
|
||||||
|
|
||||||
yield client
|
yield client
|
||||||
|
|
||||||
log.info("Closing connection to MongoDB...")
|
log.debug("Closing connection to MongoDB...")
|
||||||
client.close()
|
client.close()
|
||||||
log.debug("Closed connection to MongoDB!")
|
log.debug("Closed connection to MongoDB!")
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue