bda-6-steffo/unimore_bda_6/__main__.py

import logging
import pymongo.errors
import gc
from .log import install_general_log_handlers

# Install the log handlers immediately, before the remaining project imports.
# NOTE(review): presumably this ordering ensures that records emitted while the
# modules below are being imported are already handled — confirm in `.log`.
install_general_log_handlers()

from .config import config, TARGET_RUNS, MAXIMUM_RUNS
from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer, ThreeCheat
from .analysis.base import TrainingFailedError, EvaluationResults
from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
from .gathering import Caches

# Module-level logger; `main` derives more specific child loggers from `__name__`.
log = logging.getLogger(__name__)


def main():
    """
    Train and evaluate every sample function / analyzer / tokenizer combination.

    The function:

    1. resolves the configuration and makes sure no caches are left over;
    2. pings MongoDB once, terminating the process if no server can be selected;
    3. for each combination of sample function, sentiment analyzer class and
       tokenizer class, repeats "build datasets, train, evaluate" until
       ``TARGET_RUNS`` runs have succeeded or ``MAXIMUM_RUNS`` runs have been
       attempted, then logs the accumulated evaluation results.

    :raises SystemExit: With status ``1`` if the initial MongoDB ping fails with
        :class:`pymongo.errors.ServerSelectionTimeoutError`.
    """
    log.info("Started unimore-bda-6 in %s mode!", "DEBUG" if __debug__ else "PRODUCTION")

    log.debug("Validating configuration...")
    config.proxies.resolve()

    log.debug("Ensuring there are no leftover caches...")
    Caches.ensure_clean()

    # Check database availability once up front, before any expensive work starts.
    with mongo_client_from_config() as db:
        try:
            db.admin.command("ping")
        except pymongo.errors.ServerSelectionTimeoutError:
            log.critical("MongoDB database is not available, exiting...")
            # Raise SystemExit directly: the builtin `exit()` is injected by the
            # `site` module for interactive use and is not guaranteed to exist.
            raise SystemExit(1)

    for sample_func in [
        sample_reviews_polar,
        sample_reviews_varied,
    ]:

        slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
        slog.debug("Selected sample_func: %s", sample_func.__name__)

        for SentimentAnalyzer in [
            # ThreeCheat,
            NLTKSentimentAnalyzer,
            TensorflowPolarSentimentAnalyzer,
            TensorflowCategorySentimentAnalyzer,
        ]:

            slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
            slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)

            for Tokenizer in [
                PlainTokenizer,
                LowercaseTokenizer,
                NLTKWordTokenizer,
                PottsTokenizer,
                PottsTokenizerWithNegation,
                HuggingBertTokenizer,
            ]:

                # Explicitly collect garbage between combinations.
                log.debug("Running garbage collection...")
                garbage_count = gc.collect()
                log.debug("Collected %d pieces of garbage!", garbage_count)

                # Logger name shared by every run of this combination.
                combination_name = f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}"

                slog = logging.getLogger(combination_name)
                slog.debug("Selected Tokenizer: %s", Tokenizer.__name__)

                runs = 0
                successful_runs = 0
                cumulative_evaluation_results = EvaluationResults()

                while True:

                    # `slog` is rebound to a per-run logger below, so switch back
                    # to the combination logger at the start of each iteration.
                    slog = logging.getLogger(combination_name)

                    if successful_runs >= TARGET_RUNS.__wrapped__:
                        slog.info("Reached target of %d runs, moving on...", TARGET_RUNS.__wrapped__)
                        break

                    if runs >= MAXIMUM_RUNS.__wrapped__:
                        # Despite the wording of the message, only this
                        # combination is abandoned; the outer loops continue.
                        slog.critical("Exceeded %d runs, giving up and exiting...", MAXIMUM_RUNS.__wrapped__)
                        break

                    runs += 1
                    slog = logging.getLogger(f"{combination_name}.{runs}")
                    slog.debug("Run #%d", runs)

                    slog.debug("Instantiating %s with %s...", SentimentAnalyzer.__name__, Tokenizer.__name__)
                    try:
                        sa = SentimentAnalyzer(tokenizer=Tokenizer())
                    except TypeError:
                        # A TypeError on construction is treated as "this pairing
                        # is unsupported"; retrying would not help, so move on.
                        slog.warning("%s is not supported by %s, skipping run...", SentimentAnalyzer.__name__, Tokenizer.__name__)
                        break

                    # The database connection is only held while the dataset
                    # caches are being entered; it is closed before training.
                    # NOTE(review): presumably to limit resource usage during
                    # training — confirm against `Caches.from_database_samples`.
                    with mongo_client_from_config() as db:
                        reviews = reviews_collection(db)
                        datasets_cm = Caches.from_database_samples(collection=reviews, sample_func=sample_func)
                        datasets = datasets_cm.__enter__()

                    try:
                        try:
                            slog.info("Training sentiment analyzer: %s", sa)
                            sa.train(training_dataset_func=datasets.training, validation_dataset_func=datasets.validation)

                        except TrainingFailedError:
                            slog.error("Training failed, trying again with a different dataset...")
                            continue

                        else:
                            slog.info("Training succeeded!")
                            slog.info("Evaluating sentiment analyzer: %s", sa)
                            evaluation_results = sa.evaluate(evaluation_dataset_func=datasets.evaluation)
                            slog.info("Evaluation results: %s", evaluation_results)
                            successful_runs += 1
                            cumulative_evaluation_results += evaluation_results
                            # No `break` here: the check at the top of the loop
                            # stops once TARGET_RUNS successful runs are reached,
                            # so the cumulative results can span several runs.
                    finally:
                        # The context manager is always exited as if no error
                        # occurred, including on `continue` and on exceptions.
                        datasets_cm.__exit__(None, None, None)

                slog.info("Cumulative evaluation results: %s", cumulative_evaluation_results)


# Run the pipeline only when this module is executed as the entry point,
# not when it is imported.
if __name__ == "__main__":
    main()
Implement basic Potts sentiment analyzer 2023-02-02 03:34:05 +00:00			`import logging`
enough 2023-02-08 18:46:05 +00:00			`import pymongo.errors`
Manually run garbage collection after each iteration 2023-02-13 14:57:37 +00:00			`import gc`
Configure file logging 2023-02-10 03:07:34 +00:00			`from .log import install_general_log_handlers`
enough 2023-02-08 18:46:05 +00:00
Configure file logging 2023-02-10 03:07:34 +00:00			`install_general_log_handlers()`
Implement basic Potts sentiment analyzer 2023-02-02 03:34:05 +00:00
Various upgrades 2023-02-14 01:25:38 +00:00			`from .config import config, TARGET_RUNS, MAXIMUM_RUNS`
enough 2023-02-08 18:46:05 +00:00			`from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer, ThreeCheat`
Various upgrades 2023-02-14 01:25:38 +00:00			`from .analysis.base import TrainingFailedError, EvaluationResults`
fix and patch things 2023-02-11 03:32:17 +00:00			`from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer`
enough 2023-02-08 18:46:05 +00:00			`from .gathering import Caches`
Second commit 2023-02-01 03:20:09 +00:00
Implement basic Potts sentiment analyzer 2023-02-02 03:34:05 +00:00			`log = logging.getLogger(__name__)`

First commit 2023-02-01 01:33:42 +00:00
			`def main():`
enough 2023-02-08 18:46:05 +00:00			`log.info("Started unimore-bda-6 in %s mode!", "DEBUG" if __debug__ else "PRODUCTION")`

			`log.debug("Validating configuration...")`
			`config.proxies.resolve()`

			`log.debug("Ensuring there are no leftover caches...")`
			`Caches.ensure_clean()`

			`with mongo_client_from_config() as db:`
			`try:`
			`db.admin.command("ping")`
			`except pymongo.errors.ServerSelectionTimeoutError:`
			`log.fatal("MongoDB database is not available, exiting...")`
			`exit(1)`

Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`for sample_func in [`
			`sample_reviews_polar,`
			`sample_reviews_varied,`
			`]:`

			`slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")`
			`slog.debug("Selected sample_func: %s", sample_func.__name__)`

			`for SentimentAnalyzer in [`
			`# ThreeCheat,`
f 2023-02-18 14:27:51 +00:00			`NLTKSentimentAnalyzer,`
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`TensorflowPolarSentimentAnalyzer,`
			`TensorflowCategorySentimentAnalyzer,`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`]:`
enough 2023-02-08 18:46:05 +00:00
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")`
			`slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)`
Improve logging 2023-02-10 02:30:41 +00:00
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`for Tokenizer in [`
			`PlainTokenizer,`
			`LowercaseTokenizer,`
			`NLTKWordTokenizer,`
			`PottsTokenizer,`
			`PottsTokenizerWithNegation,`
			`HuggingBertTokenizer,`
Made good progress How does text vectorization in tensorflow work? 2023-02-05 16:40:22 +00:00			`]:`

Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`log.debug("Running garbage collection...")`
			`garbage_count = gc.collect()`
			`log.debug("Collected %d pieces of garbage!", garbage_count)`

			`slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")`
			`slog.debug("Selected Tokenizer: %s", Tokenizer.__name__)`
Improve logging 2023-02-10 02:30:41 +00:00
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`runs = 0`
			`successful_runs = 0`
			`cumulative_evaluation_results = EvaluationResults()`
enough 2023-02-08 18:46:05 +00:00
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`while True:`
Manually run garbage collection after each iteration 2023-02-13 14:57:37 +00:00
enough 2023-02-08 18:46:05 +00:00			`slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")`
Count runs 2023-02-10 02:32:31 +00:00
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`if successful_runs >= TARGET_RUNS.__wrapped__:`
			`slog.info("Reached target of %d runs, moving on...", TARGET_RUNS.__wrapped__)`
			`break`
enough 2023-02-08 18:46:05 +00:00
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`if runs >= MAXIMUM_RUNS.__wrapped__:`
			`slog.fatal("Exceeded %d runs, giving up and exiting...", MAXIMUM_RUNS.__wrapped__)`
			`break`
enough 2023-02-08 18:46:05 +00:00
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`runs += 1`
			`slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}.{runs}")`
			`slog.debug("Run #%d", runs)`
Count runs 2023-02-10 02:32:31 +00:00
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`try:`
			`slog.debug("Instantiating %s with %s...", SentimentAnalyzer.__name__, Tokenizer.__name__)`
			`sa = SentimentAnalyzer(tokenizer=Tokenizer())`
			`except TypeError:`
			`slog.warning("%s is not supported by %s, skipping run...", SentimentAnalyzer.__name__, Tokenizer.__name__)`
			`break`
Various upgrades 2023-02-14 01:25:38 +00:00

Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`with mongo_client_from_config() as db:`
			`reviews = reviews_collection(db)`
			`datasets_cm = Caches.from_database_samples(collection=reviews, sample_func=sample_func)`
			`datasets = datasets_cm.__enter__()`
idk something more & more 2023-02-10 05:21:50 +00:00
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`try:`
enough 2023-02-08 18:46:05 +00:00			`try:`
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`slog.info("Training sentiment analyzer: %s", sa)`
			`sa.train(training_dataset_func=datasets.training, validation_dataset_func=datasets.validation)`

			`except TrainingFailedError:`
			`slog.error("Training failed, trying again with a different dataset...")`
			`continue`

			`else:`
			`slog.info("Training succeeded!")`
			`slog.info("Evaluating sentiment analyzer: %s", sa)`
			`evaluation_results = sa.evaluate(evaluation_dataset_func=datasets.evaluation)`
			`slog.info("Evaluation results: %s", evaluation_results)`
			`successful_runs += 1`
			`cumulative_evaluation_results += evaluation_results`
enough 2023-02-08 18:46:05 +00:00			`break`
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`finally:`
did i just fix all ram problems? 2023-02-18 02:37:53 +00:00			`datasets_cm.__exit__(None, None, None)`
Made good progress How does text vectorization in tensorflow work? 2023-02-05 16:40:22 +00:00
Some memory usage tweaks 2023-02-18 02:18:34 +00:00			`slog.info("Cumulative evaluation results: %s", cumulative_evaluation_results)`
Various upgrades 2023-02-14 01:25:38 +00:00
First commit 2023-02-01 01:33:42 +00:00
			`if __name__ == "__main__":`
			`main()`