import logging

import pymongo.errors

from .log import install_general_log_handlers

# NOTE(review): deliberately called before the remaining package imports,
# presumably so that any logging they perform at import time already goes
# through the configured handlers — keep this ordering.
install_general_log_handlers()

from .config import config
from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
from .analysis.base import TrainingFailedError
from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
from .gathering import Caches

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def main():
    """
    Run the full experiment matrix.

    For every combination of sampling function × sentiment-analyzer class ×
    tokenizer class, repeatedly train and evaluate an analyzer on fresh
    database samples until one run succeeds, the combination proves
    unsupported, or the retry limit is hit.

    :raises SystemExit: with code ``1`` if the MongoDB server cannot be
        reached, or code ``2`` if a single combination fails 100 times.
    """
    log.info("Started unimore-bda-6 in %s mode!", "DEBUG" if __debug__ else "PRODUCTION")

    log.debug("Validating configuration...")
    config.proxies.resolve()

    log.debug("Ensuring there are no leftover caches...")
    Caches.ensure_clean()

    with mongo_client_from_config() as db:
        # Fail fast if the database is unreachable, before any training work.
        try:
            db.admin.command("ping")
        except pymongo.errors.ServerSelectionTimeoutError:
            log.fatal("MongoDB database is not available, exiting...")
            # `raise SystemExit` instead of the `exit()` helper: `exit` is
            # injected by the `site` module and may not exist (e.g. under
            # `python -S` or in frozen interpreters).
            raise SystemExit(1)

        reviews = reviews_collection(db)

        for sample_func in [sample_reviews_varied, sample_reviews_polar]:
            slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
            slog.debug("Selected sample_func: %s", sample_func.__name__)

            for SentimentAnalyzer in [
                TensorflowPolarSentimentAnalyzer,
                TensorflowCategorySentimentAnalyzer,
                NLTKSentimentAnalyzer,
            ]:
                slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
                slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)

                for Tokenizer in [
                    PlainTokenizer,
                    LowercaseTokenizer,
                    NLTKWordTokenizer,
                    PottsTokenizer,
                    PottsTokenizerWithNegation,
                ]:
                    slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
                    slog.debug("Selected Tokenizer: %s", Tokenizer.__name__)

                    # Retry the same combination with fresh samples until a
                    # run succeeds (or the combination is skipped / given up).
                    run_counter = 0
                    while True:
                        # NOTE: the logger name uses the pre-increment counter
                        # (first run is ".0"), while the "Run #" message uses
                        # the post-increment value (first run is "Run #1").
                        slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}.{run_counter}")
                        run_counter += 1
                        slog.debug("Run #%d", run_counter)

                        if run_counter >= 100:
                            slog.fatal("Exceeded 100 runs, giving up and exiting...")
                            # See note above about preferring SystemExit to exit().
                            raise SystemExit(2)

                        try:
                            slog.debug("Instantiating %s with %s...", SentimentAnalyzer.__name__, Tokenizer.__name__)
                            sa = SentimentAnalyzer(tokenizer=Tokenizer())
                        except TypeError:
                            # Analyzer rejects this tokenizer: move on to the
                            # next Tokenizer instead of retrying.
                            slog.warning("%s is not supported by %s, skipping run...", SentimentAnalyzer.__name__, Tokenizer.__name__)
                            break

                        with Caches.from_database_samples(collection=reviews, sample_func=sample_func) as datasets:
                            try:
                                slog.info("Training sentiment analyzer: %s", sa)
                                sa.train(training_dataset_func=datasets.training, validation_dataset_func=datasets.validation)
                            except TrainingFailedError:
                                # Transient failure: draw a new sample and retry.
                                slog.error("Training failed, trying again with a different dataset...")
                                continue
                            else:
                                slog.info("Training succeeded!")

                                slog.info("Evaluating sentiment analyzer: %s", sa)
                                evaluation_results = sa.evaluate(evaluation_dataset_func=datasets.evaluation)
                                slog.info("Evaluation results: %s", evaluation_results)

                                # Success: this combination is done.
                                break
# Allow running the package as a script: `python -m unimore_bda_6`.
if __name__ == "__main__":
    main()