1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-22 16:04:18 +00:00
bda-6-steffo/unimore_bda_6/__main__.py

95 lines
2.6 KiB
Python
Raw Normal View History

import logging
from .config import config, DATA_SET_SIZE
2023-02-02 16:24:11 +00:00
from .database import Review, mongo_reviews_collection_from_config, dataset_polar, dataset_varied
from .analysis.vanilla import VanillaSA
from .tokenization import all_tokenizers
2023-02-01 16:46:25 +00:00
from .log import install_log_handler
2023-02-01 03:20:09 +00:00
log = logging.getLogger(__name__)
2023-02-01 01:33:42 +00:00
2023-02-02 16:24:11 +00:00
def review_vanilla_extractor(review: Review) -> tuple[str, float]:
    """
    Pull the text and the rating out of a `Review`.

    :param review: The review to read; its ``"reviewText"`` and ``"overall"`` keys are accessed, in that order.
    :return: A ``(text, rating)`` pair.
    """
    text = review["reviewText"]
    rating = review["overall"]
    return text, rating
def polar_categorizer(rating: float) -> str:
    """
    Map a rating to its polar label.

    The labels are assigned as follows:

    * ``"negative"`` for 1.0 and 2.0
    * ``"positive"`` for 3.0, 4.0 and 5.0
    * ``"unknown"`` for any other value

    :param rating: The rating to categorize.
    :return: The label for the given rating.
    """
    # Comparisons are by equality, so e.g. the integer 4 is treated like 4.0.
    if rating == 1.0 or rating == 2.0:
        return "negative"
    if rating == 3.0 or rating == 4.0 or rating == 5.0:
        return "positive"
    return "unknown"
def varied_categorizer(rating: float) -> str:
    """
    Map a rating to its "stars" label.

    The labels are assigned as follows:

    * ``"terrible"`` for 1.0
    * ``"negative"`` for 2.0
    * ``"mixed"`` for 3.0
    * ``"positive"`` for 4.0
    * ``"great"`` for 5.0
    * ``"unknown"`` for any other value

    :param rating: The rating to categorize.
    :return: The label for the given rating.
    """
    labels_by_stars = (
        (1.0, "terrible"),
        (2.0, "negative"),
        (3.0, "mixed"),
        (4.0, "positive"),
        (5.0, "great"),
    )
    # Scan in ascending order and compare by equality, so the integer 3 is treated like 3.0.
    for stars, label in labels_by_stars:
        if rating == stars:
            return label
    return "unknown"
2023-02-01 01:33:42 +00:00
def main():
    """
    Train, evaluate and interactively try a `VanillaSA` model for every dataset / tokenizer combination.

    For each (dataset function, categorizer) pair and each tokenizer in `all_tokenizers`:

    1. a training and an evaluation dataset are requested from the reviews collection;
    2. a model is built, trained and evaluated, logging each step;
    3. lines read from standard input are passed to the model and the result is printed, until the user presses
       Ctrl+C or standard input reaches end-of-file, at which point the next combination is processed.
    """
    combinations = [
        (dataset_polar, polar_categorizer),
        (dataset_varied, varied_categorizer),
    ]

    for dataset_func, categorizer in combinations:
        for tokenizer in all_tokenizers:
            with mongo_reviews_collection_from_config() as reviews:
                # NOTE(review): `__wrapped__` presumably unwraps the lazily-resolved config value — confirm in `.config`.
                # NOTE(review): both datasets come from the same call with the same arguments; verify that training
                # and evaluation samples are not expected to be disjoint.
                reviews_training = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
                reviews_evaluation = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)

                model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=categorizer)

                log.info("Training model %s", model)
                model.train(reviews_training)

                log.info("Evaluating model %s", model)
                evaluation = model.evaluate(reviews_evaluation)

                # Log the model, consistently with the two messages above (the label says "model").
                log.info("Results of model %s: %s", model, evaluation)

                # Interactive playground, kept inside the `with` so the collection stays available while it runs.
                try:
                    print("Model %s" % model)
                    while True:
                        print(model.use(input()))
                except (KeyboardInterrupt, EOFError):
                    # Ctrl+C or a closed / exhausted stdin ends the playground for this model;
                    # `input()` raises EOFError in the latter case, which must not abort the whole run.
                    pass
2023-02-01 01:33:42 +00:00
if __name__ == "__main__":
    # Install the project's log handler before anything else runs.
    install_log_handler()
    # Resolve the configuration proxies up front; main() reads DATA_SET_SIZE.
    config.proxies.resolve()
    main()