1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-25 17:24:20 +00:00

Implement basic Potts sentiment analyzer

This commit is contained in:
Steffo 2023-02-02 04:34:05 +01:00
parent e2b9133bd5
commit ab5f12f8fc
Signed by: steffo
GPG key ID: 2A24051445686895
2 changed files with 42 additions and 9 deletions

View file

@ -1,23 +1,26 @@
import logging
from .config import config from .config import config
from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
from .analysis.vanilla import VanillaReviewSA from .analysis.vanilla import VanillaReviewSA
from .analysis.potts import PottsReviewSA
from .log import install_log_handler from .log import install_log_handler
log = logging.getLogger(__name__)
def main(): def main():
with mongo_reviews_collection_from_config() as reviews: with mongo_reviews_collection_from_config() as reviews:
training_reviews = get_training_reviews(collection=reviews) training_reviews = get_training_reviews(collection=reviews)
test_reviews = get_test_reviews(collection=reviews) test_reviews = get_test_reviews(collection=reviews)
model = VanillaReviewSA() vanilla = VanillaReviewSA()
model.train(training_reviews) vanilla.train(training_reviews)
log.info("Vanilla evaluation results: %s", vanilla.evaluate(test_reviews))
evaluation = model.evaluate(test_reviews) potts = PottsReviewSA()
print(evaluation) potts.train(training_reviews)
log.info("Potts evaluation results: %s", potts.evaluate(test_reviews))
while True:
classification = model.use(input())
print(classification)
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -0,0 +1,30 @@
from ..vendor.potts import Tokenizer
from .vanilla import VanillaSA, VanillaReviewSA
class PottsSA(VanillaSA):
    """
    Sentiment analyzer that behaves like `VanillaSA`, except that text is split into tokens by the vendored Potts `Tokenizer`.
    """

    def __init__(self) -> None:
        # Nothing extra to set up: construction is delegated entirely to the next class in the MRO.
        super().__init__()

    def _tokenize_text(self, text: str) -> list[str]:
        """
        Split ``text`` into tokens with a Potts `Tokenizer` created with ``preserve_case=False``, and return them as a list.
        """
        # A new tokenizer is created on every call; `list()` materializes whatever iterable `tokenize` produces.
        return list(Tokenizer(preserve_case=False).tokenize(text))
class PottsReviewSA(VanillaReviewSA, PottsSA):
    """
    Variant of `PottsSA` meant to operate on `Review`s.

    Defines no members of its own: everything is inherited from `VanillaReviewSA` and `PottsSA`, in that order of precedence.
    """
# Names exported by `from <this module> import *`.
__all__ = ("PottsSA", "PottsReviewSA")