2023-02-02 03:34:05 +00:00
|
|
|
import logging
|
|
|
|
|
2023-02-02 04:01:31 +00:00
|
|
|
from .config import config, DATA_SET_SIZE
|
2023-02-02 16:24:11 +00:00
|
|
|
from .database import Review, mongo_reviews_collection_from_config, dataset_polar, dataset_varied
|
|
|
|
from .analysis.vanilla import VanillaSA
|
|
|
|
from .tokenization import all_tokenizers
|
2023-02-01 16:46:25 +00:00
|
|
|
from .log import install_log_handler
|
2023-02-01 03:20:09 +00:00
|
|
|
|
2023-02-02 03:34:05 +00:00
|
|
|
# Module-level logger, named after this module via `__name__`.
log = logging.getLogger(__name__)
|
|
|
|
|
2023-02-01 01:33:42 +00:00
|
|
|
|
2023-02-02 16:24:11 +00:00
|
|
|
def review_vanilla_extractor(review: Review) -> tuple[str, float]:
    """
    Pull the text and the rating out of a `Review`.

    The text is read from the ``"reviewText"`` key and the rating from the ``"overall"`` key;
    they are returned as a ``(text, rating)`` pair, in that order.
    """
    text = review["reviewText"]
    rating = review["overall"]
    return text, rating
|
|
|
|
|
|
|
|
|
|
|
|
def polar_categorizer(rating: float) -> str:
    """
    Map a rating to its polar sentiment label.

    The mapping is:

    * ``1.0`` or ``2.0`` -> ``"negative"``
    * ``3.0``, ``4.0`` or ``5.0`` -> ``"positive"``
    * any other value -> ``"unknown"``
    """
    # Plain equality checks, tried in the listed order; the first hit wins.
    if rating == 1.0 or rating == 2.0:
        return "negative"
    if rating == 3.0 or rating == 4.0 or rating == 5.0:
        return "positive"
    # Nothing matched: the rating is outside the known star values.
    return "unknown"
|
|
|
|
|
|
|
|
|
|
|
|
def varied_categorizer(rating: float) -> str:
    """
    Map a rating to its "stars" label.

    The mapping is:

    * ``1.0`` -> ``"terrible"``
    * ``2.0`` -> ``"negative"``
    * ``3.0`` -> ``"mixed"``
    * ``4.0`` -> ``"positive"``
    * ``5.0`` -> ``"great"``
    * any other value -> ``"unknown"``
    """
    # Ordered (stars, label) pairs; scanned with `==` so that the first equal entry wins.
    stars_to_label = (
        (1.0, "terrible"),
        (2.0, "negative"),
        (3.0, "mixed"),
        (4.0, "positive"),
        (5.0, "great"),
    )
    for stars, label in stars_to_label:
        if rating == stars:
            return label
    # No entry compared equal to the rating.
    return "unknown"
|
|
|
|
|
|
|
|
|
2023-02-01 01:33:42 +00:00
|
|
|
def main():
    """
    Train and evaluate a `VanillaSA` model with every tokenizer in `all_tokenizers`.

    The procedure runs twice: first on the datasets built by `dataset_polar`, labelled through
    `polar_categorizer`, then on the datasets built by `dataset_varied`, labelled through
    `varied_categorizer`. Progress and evaluation results are reported through the module logger.
    """
    # ---- Polar datasets ----
    with mongo_reviews_collection_from_config() as collection:
        amount = DATA_SET_SIZE.__wrapped__
        polar_training = dataset_polar(collection=collection, amount=amount)
        polar_evaluation = dataset_polar(collection=collection, amount=amount)

    # NOTE(review): assumes the dataset functions return fully materialized data that stays
    # usable after the collection context is closed and can be iterated once per tokenizer.
    # Confirm against `dataset_polar` / `dataset_varied`.
    for tokenizer in all_tokenizers:
        log.info("Training polar model with %s tokenizer", tokenizer)
        analyzer = VanillaSA(
            extractor=review_vanilla_extractor,
            tokenizer=tokenizer,
            categorizer=polar_categorizer,
        )
        analyzer.train(polar_training)

        log.info("Evaluating polar model with %s tokenizer", tokenizer)
        results = analyzer.evaluate(polar_evaluation)
        log.info("Polar model with %s results: %s", tokenizer, results)

    # Drop the references to the polar datasets before the varied ones are loaded.
    del polar_training, polar_evaluation

    # ---- Varied datasets ----
    with mongo_reviews_collection_from_config() as collection:
        amount = DATA_SET_SIZE.__wrapped__
        varied_training = dataset_varied(collection=collection, amount=amount)
        varied_evaluation = dataset_varied(collection=collection, amount=amount)

    for tokenizer in all_tokenizers:
        log.info("Training varied model with %s tokenizer", tokenizer)
        analyzer = VanillaSA(
            extractor=review_vanilla_extractor,
            tokenizer=tokenizer,
            categorizer=varied_categorizer,
        )
        analyzer.train(varied_training)

        log.info("Evaluating varied model with %s tokenizer", tokenizer)
        results = analyzer.evaluate(varied_evaluation)
        log.info("Varied model with %s results: %s", tokenizer, results)

    # Drop the references to the varied datasets as well.
    del varied_training, varied_evaluation
|
2023-02-01 01:33:42 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the three steps below run in this exact order.
    # The log handler is installed first, before anything else gets a chance to log.
    install_log_handler()
    # Presumably this makes the configuration values (such as the `DATA_SET_SIZE` read by
    # `main`) available -- TODO confirm against the `config` module.
    config.proxies.resolve()
    main()
|