1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-28 18:54:20 +00:00
bda-6-steffo/unimore_bda_6/__main__.py

120 lines
5.2 KiB
Python
Raw Normal View History

import logging
2023-02-08 18:46:05 +00:00
import pymongo.errors
import gc
2023-02-10 03:07:34 +00:00
from .log import install_general_log_handlers
2023-02-08 18:46:05 +00:00
2023-02-10 03:07:34 +00:00
# Install the log handlers right away, before the remaining project modules are
# imported below (presumably so that anything they log at import time is
# already handled — confirm against `.log`).
install_general_log_handlers()
2023-02-14 01:25:38 +00:00
from .config import config, TARGET_RUNS, MAXIMUM_RUNS
2023-02-08 18:46:05 +00:00
from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
2023-02-12 04:11:58 +00:00
from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer, ThreeCheat
2023-02-14 01:25:38 +00:00
from .analysis.base import TrainingFailedError, EvaluationResults
2023-02-11 03:32:17 +00:00
from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
2023-02-08 18:46:05 +00:00
from .gathering import Caches
2023-02-01 03:20:09 +00:00
# Module-level logger, named after this module; `main` derives more specific
# child loggers from the same name.
log = logging.getLogger(__name__)
2023-02-01 01:33:42 +00:00
def main():
    """
    Train and evaluate every sampler / sentiment analyzer / tokenizer combination.

    Steps, in order:

    1. Resolve the configuration and make sure no caches are left over.
    2. Connect to MongoDB and ping it; if the server cannot be selected, log the
       problem and terminate the process with exit status ``1``.
    3. For each sampling function, each sentiment analyzer class and each
       tokenizer class, repeat "build datasets, train, evaluate" until either
       ``TARGET_RUNS`` runs have succeeded or ``MAXIMUM_RUNS`` runs have been
       attempted, accumulating the evaluation results of the successful runs.
    4. Log the cumulative evaluation results of each combination.

    A combination whose analyzer cannot be instantiated with the tokenizer
    (``TypeError``) is skipped. A run whose training raises
    ``TrainingFailedError`` is retried with freshly sampled datasets and counts
    towards ``MAXIMUM_RUNS`` but not towards ``TARGET_RUNS``.

    :raises SystemExit: with code ``1`` if MongoDB does not answer the ping.
    """
    log.info("Started unimore-bda-6 in %s mode!", "DEBUG" if __debug__ else "PRODUCTION")

    log.debug("Validating configuration...")
    config.proxies.resolve()

    log.debug("Ensuring there are no leftover caches...")
    Caches.ensure_clean()

    with mongo_client_from_config() as db:
        try:
            db.admin.command("ping")
        except pymongo.errors.ServerSelectionTimeoutError:
            log.fatal("MongoDB database is not available, exiting...")
            # Raise SystemExit directly instead of calling the `exit` builtin:
            # that helper is injected by the `site` module for interactive use
            # and is not guaranteed to exist in every interpreter setup.
            raise SystemExit(1)

        reviews = reviews_collection(db)

        for sample_func in [
            sample_reviews_polar,
            sample_reviews_varied,
        ]:
            sample_logger_name = f"{__name__}.{sample_func.__name__}"
            slog = logging.getLogger(sample_logger_name)
            slog.debug("Selected sample_func: %s", sample_func.__name__)

            for SentimentAnalyzer in [
                # ThreeCheat,
                NLTKSentimentAnalyzer,
                TensorflowPolarSentimentAnalyzer,
                TensorflowCategorySentimentAnalyzer,
            ]:
                analyzer_logger_name = f"{sample_logger_name}.{SentimentAnalyzer.__name__}"
                slog = logging.getLogger(analyzer_logger_name)
                slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)

                for Tokenizer in [
                    PlainTokenizer,
                    LowercaseTokenizer,
                    NLTKWordTokenizer,
                    PottsTokenizer,
                    PottsTokenizerWithNegation,
                    HuggingBertTokenizer,
                ]:
                    # Explicitly collect garbage between combinations
                    # (presumably to release the previous model and datasets
                    # before allocating new ones — confirm).
                    log.debug("Running garbage collection...")
                    garbage_count = gc.collect()
                    log.debug("Collected %d pieces of garbage!", garbage_count)

                    combination_logger_name = f"{analyzer_logger_name}.{Tokenizer.__name__}"
                    slog = logging.getLogger(combination_logger_name)
                    slog.debug("Selected Tokenizer: %s", Tokenizer.__name__)

                    runs = 0
                    successful_runs = 0
                    cumulative_evaluation_results = EvaluationResults()

                    while True:
                        # Back to the combination-level logger: the previous
                        # iteration switched to a run-specific one.
                        slog = logging.getLogger(combination_logger_name)

                        # The configured limits are read through `__wrapped__`
                        # (presumably the raw value behind a config proxy —
                        # confirm against `.config`).
                        if successful_runs >= TARGET_RUNS.__wrapped__:
                            slog.info("Reached target of %d runs, moving on...", TARGET_RUNS.__wrapped__)
                            break

                        if runs >= MAXIMUM_RUNS.__wrapped__:
                            # NOTE(review): the message says "exiting", but only
                            # this combination is abandoned — confirm intent.
                            slog.fatal("Exceeded %d runs, giving up and exiting...", MAXIMUM_RUNS.__wrapped__)
                            break

                        runs += 1
                        slog = logging.getLogger(f"{combination_logger_name}.{runs}")
                        slog.debug("Run #%d", runs)

                        try:
                            slog.debug("Instantiating %s with %s...", SentimentAnalyzer.__name__, Tokenizer.__name__)
                            sa = SentimentAnalyzer(tokenizer=Tokenizer())
                        except TypeError:
                            # Incompatible analyzer/tokenizer pair: retrying
                            # cannot help, so skip the whole combination.
                            slog.warning("%s is not supported by %s, skipping run...", SentimentAnalyzer.__name__, Tokenizer.__name__)
                            break

                        with Caches.from_database_samples(collection=reviews, sample_func=sample_func) as datasets:
                            slog.info("Training sentiment analyzer: %s", sa)
                            try:
                                sa.train(training_dataset_func=datasets.training, validation_dataset_func=datasets.validation)
                            except TrainingFailedError:
                                slog.error("Training failed, trying again with a different dataset...")
                                continue
                            else:
                                slog.info("Training succeeded!")
                                slog.info("Evaluating sentiment analyzer: %s", sa)
                                evaluation_results = sa.evaluate(evaluation_dataset_func=datasets.evaluation)
                                slog.info("Evaluation results: %s", evaluation_results)
                                successful_runs += 1
                                cumulative_evaluation_results += evaluation_results
                                # No `break` here: the loop must keep going
                                # until the TARGET_RUNS check above ends it,
                                # otherwise only one run is ever accumulated.

                    slog.info("Cumulative evaluation results: %s", cumulative_evaluation_results)
2023-02-01 01:33:42 +00:00
# Run the pipeline only when this module is executed as the entry point
# (e.g. `python -m unimore_bda_6`), not when it is merely imported.
if __name__ == "__main__":
    main()