1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-22 07:54:19 +00:00

Make some more progress for the night

Many things still do not work properly
This commit is contained in:
Steffo 2023-02-02 05:01:31 +01:00
parent b347031663
commit 4344752cf6
Signed by: steffo
GPG key ID: 2A24051445686895
6 changed files with 80 additions and 73 deletions

1
.vscode/launch.json vendored
View file

@ -12,6 +12,7 @@
"justMyCode": true, "justMyCode": true,
"env": { "env": {
"NLTK_DATA": "./data/nltk", "NLTK_DATA": "./data/nltk",
"DATA_SET_SIZE": "100",
}, },
"cwd": "${workspaceFolder}", "cwd": "${workspaceFolder}",
} }

View file

@ -1,8 +1,8 @@
import logging import logging
from .config import config from .config import config, DATA_SET_SIZE
from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
from .analysis.vanilla import VanillaReviewSA from .analysis.vanilla import VanillaReviewSA, VanillaUniformReviewSA
from .analysis.potts import PottsReviewSA from .analysis.potts import PottsReviewSA
from .log import install_log_handler from .log import install_log_handler
@ -11,16 +11,26 @@ log = logging.getLogger(__name__)
def main():
    """
    Train the sentiment analyzers on freshly sampled reviews, log how each one evaluates, then open a prompt.

    The prompt feeds every line typed by the user to the uniform vanilla analyzer and prints the result.
    It loops forever, so this function does not return on its own.
    """
    # All four datasets use the same per-category amount, so read it from the configuration once
    amount = DATA_SET_SIZE.__wrapped__

    with mongo_reviews_collection_from_config() as reviews:
        # NOTE(review): training and evaluation datasets are sampled independently of each other,
        # so nothing here prevents them from sharing reviews — confirm that this is acceptable
        reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=amount)
        reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=amount)
        reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=amount)
        reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=amount)

    # The dataset builders are annotated as returning lists, so the collection is not needed past this point

    vanilla_polar = VanillaReviewSA()
    vanilla_polar.train(reviews_polar_training)
    log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))

    potts_polar = PottsReviewSA()
    potts_polar.train(reviews_polar_training)
    log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))

    vanilla_uniform = VanillaUniformReviewSA()
    vanilla_uniform.train(reviews_uniform_training)
    # Evaluate the uniform model on the uniform dataset: this line used to re-evaluate the polar model instead
    log.info("Vanilla uniform evaluation results: %s", vanilla_uniform.evaluate(reviews_uniform_evaluation))

    while True:
        print(vanilla_uniform.use(input("> ")))
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -1,5 +1,5 @@
from ..vendor.potts import Tokenizer from ..vendor.potts import Tokenizer
from .vanilla import VanillaSA, VanillaReviewSA from .vanilla import VanillaSA, VanillaReviewSA, VanillaUniformReviewSA
class PottsSA(VanillaSA): class PottsSA(VanillaSA):
@ -24,6 +24,12 @@ class PottsReviewSA(VanillaReviewSA, PottsSA):
""" """
class PottsUniformReviewSA(VanillaUniformReviewSA, PottsSA):
    """
    A `PottsSA` using the rating labels of `VanillaUniformReviewSA`.

    Everything is inherited from the two parent classes; nothing is overridden here.
    """
__all__ = ( __all__ = (
"PottsSA", "PottsSA",
"PottsReviewSA", "PottsReviewSA",

View file

@ -108,6 +108,26 @@ class VanillaReviewSA(VanillaSA):
return self._use_with_tokens(self._tokenize_text(text)) return self._use_with_tokens(self._tokenize_text(text))
class VanillaUniformReviewSA(VanillaReviewSA):
    """
    A `VanillaReviewSA` whose labels name each whole-number rating separately.
    """

    @staticmethod
    def _rating_to_label(rating: float) -> str:
        """
        Convert a numeric rating into the name of the label it belongs to.

        Whole ratings from ``0.0`` to ``5.0`` each have their own label; anything else becomes ``"unknown"``.
        """
        labels_by_rating = {
            0.0: "abysmal",
            1.0: "terrible",
            2.0: "negative",
            3.0: "mixed",
            4.0: "positive",
            5.0: "great",
        }
        # Dictionary lookup compares by equality, so integer ratings such as `3` still resolve to their label
        return labels_by_rating.get(rating, "unknown")
__all__ = ( __all__ = (
"VanillaSA", "VanillaSA",
"VanillaReviewSA", "VanillaReviewSA",

View file

@ -45,24 +45,9 @@ def WORKING_SET_SIZE(val: str | None) -> int:
@config.optional() @config.optional()
def TRAINING_SET_SIZE(val: str | None) -> int: def DATA_SET_SIZE(val: str | None) -> int:
""" """
The number of reviews from each category to fetch for the training set. The number of reviews from each category to fetch for the datasets.
Defaults to `1000`.
"""
if val is None:
return 1000
try:
return int(val)
except ValueError:
raise cfig.InvalidValueError("Not an int.")
@config.optional()
def TEST_SET_SIZE(val: str | None) -> int:
"""
The number of reviews to fetch for the test set.
Defaults to `1000`. Defaults to `1000`.
""" """
@ -79,9 +64,7 @@ __all__ = (
"MONGO_HOST", "MONGO_HOST",
"MONGO_PORT", "MONGO_PORT",
"WORKING_SET_SIZE", "WORKING_SET_SIZE",
"TRAINING_SET_SIZE", "DATA_SET_SIZE",
"TEST_SET_SIZE",
"NLTK_DOUBLE_NEG_SWITCH",
) )

View file

@ -4,8 +4,9 @@ import pymongo.collection
import contextlib import contextlib
import bson import bson
import logging import logging
import random
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -79,59 +80,47 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
]) ])
def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
    Build a shuffled list made of 5-star and 1-star reviews.

    ``amount`` reviews are requested from ``collection`` for each of the two ratings.
    """
    log.info("Building dataset with %d polar reviews...", amount * 2)

    # Request both samples up front, highest rating first
    five_star = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
    one_star = sample_reviews_by_rating(collection, rating=1.0, amount=amount)

    # Concatenation order is irrelevant, since the list gets shuffled right after
    dataset = list(five_star)
    dataset.extend(one_star)

    # Shuffle the dataset, just in case the ordering affects the performance
    # TODO: does it actually?
    random.shuffle(dataset)

    return dataset
def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
    Get a list of shuffled reviews of any rating.

    ``amount`` reviews are requested from ``collection`` for each rating from 1 to 5 stars,
    and all of them end up in the returned list.
    """
    ratings = (1.0, 2.0, 3.0, 4.0, 5.0)

    log.info("Building dataset with %d uniform reviews...", amount * len(ratings))

    # Sample every rating and keep all of the results: previously only the 4-star and 2-star
    # samples were merged, silently dropping the other three ratings from the dataset
    dataset: list[Review] = []
    for rating in ratings:
        dataset.extend(sample_reviews_by_rating(collection, rating=rating, amount=amount))

    # Shuffle the dataset, just in case the ordering affects the performance
    # TODO: does it actually?
    random.shuffle(dataset)

    return dataset
__all__ = ( __all__ = (
@ -140,7 +129,5 @@ __all__ = (
"mongo_reviews_collection_from_config", "mongo_reviews_collection_from_config",
"sample_reviews", "sample_reviews",
"sample_reviews_by_rating", "sample_reviews_by_rating",
"sample_reviews_by_rating_polar", "get_reviews_dataset_polar",
"get_training_reviews",
"get_test_reviews",
) )