1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-21 23:44:19 +00:00

Make some more progress for the night

Many things still do not work properly
This commit is contained in:
Steffo 2023-02-02 05:01:31 +01:00
parent b347031663
commit 4344752cf6
Signed by: steffo
GPG key ID: 2A24051445686895
6 changed files with 80 additions and 73 deletions

1
.vscode/launch.json vendored
View file

@ -12,6 +12,7 @@
"justMyCode": true,
"env": {
"NLTK_DATA": "./data/nltk",
"DATA_SET_SIZE": "100",
},
"cwd": "${workspaceFolder}",
}

View file

@ -1,8 +1,8 @@
import logging
from .config import config
from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
from .analysis.vanilla import VanillaReviewSA
from .config import config, DATA_SET_SIZE
from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
from .analysis.vanilla import VanillaReviewSA, VanillaUniformReviewSA
from .analysis.potts import PottsReviewSA
from .log import install_log_handler
@ -11,16 +11,26 @@ log = logging.getLogger(__name__)
def main():
    """
    Train the sentiment analyzers on sampled review datasets, log their evaluation results, and then start an
    interactive prompt backed by the uniform analyzer.
    """
    with mongo_reviews_collection_from_config() as reviews:
        amount = DATA_SET_SIZE.__wrapped__

        # NOTE(review): the training and evaluation sets are sampled independently with the same arguments, so they
        # may share reviews; confirm this is acceptable for the evaluation.
        reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=amount)
        reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=amount)
        reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=amount)
        reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=amount)

    # The datasets are annotated as plain lists, so the collection is not needed past this point.
    vanilla_polar = VanillaReviewSA()
    vanilla_polar.train(reviews_polar_training)
    log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))

    potts_polar = PottsReviewSA()
    potts_polar.train(reviews_polar_training)
    log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))

    vanilla_uniform = VanillaUniformReviewSA()
    vanilla_uniform.train(reviews_uniform_training)
    # Evaluate the uniform analyzer on the uniform evaluation set, so that the logged figure matches its message.
    log.info("Vanilla uniform evaluation results: %s", vanilla_uniform.evaluate(reviews_uniform_evaluation))

    # Interactive prompt: classify whatever the user types with the uniform analyzer.
    while True:
        print(vanilla_uniform.use(input("> ")))
if __name__ == "__main__":

View file

@ -1,5 +1,5 @@
from ..vendor.potts import Tokenizer
from .vanilla import VanillaSA, VanillaReviewSA
from .vanilla import VanillaSA, VanillaReviewSA, VanillaUniformReviewSA
class PottsSA(VanillaSA):
@ -24,6 +24,12 @@ class PottsReviewSA(VanillaReviewSA, PottsSA):
"""
class PottsUniformReviewSA(VanillaUniformReviewSA, PottsSA):
    """
    A `PottsSA` that labels reviews the way `VanillaUniformReviewSA` does, rather than with only 2 buckets.
    """
__all__ = (
"PottsSA",
"PottsReviewSA",

View file

@ -108,6 +108,26 @@ class VanillaReviewSA(VanillaSA):
return self._use_with_tokens(self._tokenize_text(text))
class VanillaUniformReviewSA(VanillaReviewSA):
    """
    A `VanillaReviewSA` that gives every whole-star rating its own label.
    """

    @staticmethod
    def _rating_to_label(rating: float) -> str:
        """
        Convert a rating into its textual label.

        Any rating that does not compare equal to one of the whole values from 0 to 5 is labelled ``"unknown"``.
        """
        labels_by_stars = (
            (0.0, "abysmal"),
            (1.0, "terrible"),
            (2.0, "negative"),
            (3.0, "mixed"),
            (4.0, "positive"),
            (5.0, "great"),
        )
        # Scan in ascending order and compare with ``==``, so ints and floats are treated alike.
        for stars, label in labels_by_stars:
            if rating == stars:
                return label
        return "unknown"
__all__ = (
"VanillaSA",
"VanillaReviewSA",

View file

@ -45,24 +45,9 @@ def WORKING_SET_SIZE(val: str | None) -> int:
@config.optional()
def TRAINING_SET_SIZE(val: str | None) -> int:
def DATA_SET_SIZE(val: str | None) -> int:
"""
The number of reviews from each category to fetch for the training set.
Defaults to `1000`.
"""
if val is None:
return 1000
try:
return int(val)
except ValueError:
raise cfig.InvalidValueError("Not an int.")
@config.optional()
def TEST_SET_SIZE(val: str | None) -> int:
"""
The number of reviews to fetch for the test set.
The number of reviews from each category to fetch for the datasets.
Defaults to `1000`.
"""
@ -79,9 +64,7 @@ __all__ = (
"MONGO_HOST",
"MONGO_PORT",
"WORKING_SET_SIZE",
"TRAINING_SET_SIZE",
"TEST_SET_SIZE",
"NLTK_DOUBLE_NEG_SWITCH",
"DATA_SET_SIZE",
)

View file

@ -4,8 +4,9 @@ import pymongo.collection
import contextlib
import bson
import logging
import random
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
log = logging.getLogger(__name__)
@ -79,59 +80,47 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
])
def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
    Build a shuffled list of 5-star and 1-star reviews.

    Up to ``amount`` reviews are requested for each of the two ratings through `sample_reviews_by_rating`.
    """
    log.info("Building dataset with %d polar reviews...", amount * 2)

    # Request the 5-star sample first and the 1-star sample second
    five_star = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
    one_star = sample_reviews_by_rating(collection, rating=1.0, amount=amount)

    # Concatenate in that order; the order is irrelevant since the list is shuffled right after
    dataset = list(five_star)
    dataset.extend(one_star)

    # Shuffle the dataset, just in case it affects the performance
    # TODO: does it actually?
    random.shuffle(dataset)

    return dataset
def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
    Build a shuffled list of reviews covering every rating from 1 to 5 stars.

    Up to ``amount`` reviews are requested for each of the five ratings through `sample_reviews_by_rating`, and all
    five samples are merged into the returned list.
    """
    log.info("Building dataset with %d uniform reviews...", amount * 5)

    # Sample every rating bucket and keep all of them, not just the 4-star and 2-star ones
    dataset: list[Review] = []
    for rating in (1.0, 2.0, 3.0, 4.0, 5.0):
        dataset.extend(sample_reviews_by_rating(collection, rating=rating, amount=amount))

    # Shuffle the dataset, just in case it affects the performance
    # TODO: does it actually?
    random.shuffle(dataset)

    return dataset
__all__ = (
@ -140,7 +129,5 @@ __all__ = (
"mongo_reviews_collection_from_config",
"sample_reviews",
"sample_reviews_by_rating",
"sample_reviews_by_rating_polar",
"get_training_reviews",
"get_test_reviews",
"get_reviews_dataset_polar",
)