mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-22 07:54:19 +00:00
Make some more progress for the night
Many things still do not work properly
This commit is contained in:
parent
b347031663
commit
4344752cf6
6 changed files with 80 additions and 73 deletions
1
.vscode/launch.json
vendored
1
.vscode/launch.json
vendored
|
@ -12,6 +12,7 @@
|
|||
"justMyCode": true,
|
||||
"env": {
|
||||
"NLTK_DATA": "./data/nltk",
|
||||
"DATA_SET_SIZE": "100",
|
||||
},
|
||||
"cwd": "${workspaceFolder}",
|
||||
}
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import logging
|
||||
|
||||
from .config import config
|
||||
from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
|
||||
from .analysis.vanilla import VanillaReviewSA
|
||||
from .config import config, DATA_SET_SIZE
|
||||
from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
|
||||
from .analysis.vanilla import VanillaReviewSA, VanillaUniformReviewSA
|
||||
from .analysis.potts import PottsReviewSA
|
||||
from .log import install_log_handler
|
||||
|
||||
|
@ -11,16 +11,26 @@ log = logging.getLogger(__name__)
|
|||
|
||||
def main():
    """
    Train and evaluate the sentiment analyzers, then start an interactive prompt.

    Four datasets are fetched from the reviews collection (polar and uniform, each one for training and for
    evaluation), every one of them with ``DATA_SET_SIZE`` reviews per category.

    The polar datasets are used to train and evaluate `VanillaReviewSA` and `PottsReviewSA`, the uniform ones to
    train and evaluate `VanillaUniformReviewSA`.

    Once all results have been logged, text read from standard input is classified with the uniform analyzer in an
    endless loop.
    """
    with mongo_reviews_collection_from_config() as reviews:
        amount = DATA_SET_SIZE.__wrapped__

        # NOTE(review): training and evaluation datasets are fetched by separate, identical calls; nothing visible
        # here prevents the same review from ending up in both - confirm against the sampling query.
        reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=amount)
        reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=amount)
        reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=amount)
        reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=amount)

        vanilla_polar = VanillaReviewSA()
        vanilla_polar.train(reviews_polar_training)
        log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))

        potts_polar = PottsReviewSA()
        potts_polar.train(reviews_polar_training)
        log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))

        vanilla_uniform = VanillaUniformReviewSA()
        vanilla_uniform.train(reviews_uniform_training)
        # Evaluate the uniform analyzer on the uniform dataset: evaluating the polar analyzer again here would
        # just repeat the first result under the wrong name.
        log.info("Vanilla uniform evaluation results: %s", vanilla_uniform.evaluate(reviews_uniform_evaluation))

        # Interactive prompt: runs until the process is interrupted or standard input is closed.
        while True:
            print(vanilla_uniform.use(input("> ")))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from ..vendor.potts import Tokenizer
|
||||
from .vanilla import VanillaSA, VanillaReviewSA
|
||||
from .vanilla import VanillaSA, VanillaReviewSA, VanillaUniformReviewSA
|
||||
|
||||
|
||||
class PottsSA(VanillaSA):
|
||||
|
@ -24,6 +24,12 @@ class PottsReviewSA(VanillaReviewSA, PottsSA):
|
|||
"""
|
||||
|
||||
|
||||
class PottsUniformReviewSA(VanillaUniformReviewSA, PottsSA):
    """
    A `PottsSA` that sorts reviews into 5 buckets rather than 2.

    Nothing is defined in the class body: it only combines `VanillaUniformReviewSA` and `PottsSA` through
    multiple inheritance.
    """
|
||||
|
||||
|
||||
__all__ = (
|
||||
"PottsSA",
|
||||
"PottsReviewSA",
|
||||
|
|
|
@ -108,6 +108,26 @@ class VanillaReviewSA(VanillaSA):
|
|||
return self._use_with_tokens(self._tokenize_text(text))
|
||||
|
||||
|
||||
class VanillaUniformReviewSA(VanillaReviewSA):
    """
    A `VanillaReviewSA` that gives every whole-star rating its own label.
    """

    @staticmethod
    def _rating_to_label(rating: float) -> str:
        """
        Convert a star rating into its label.

        :param rating: The rating to convert.
        :return: The label of the rating, or ``"unknown"`` if the rating is not equal to any of the known ones.
        """
        known_labels = (
            (0.0, "abysmal"),
            (1.0, "terrible"),
            (2.0, "negative"),
            (3.0, "mixed"),
            (4.0, "positive"),
            (5.0, "great"),
        )

        # Compare by equality, in ascending order, and stop at the first hit
        for stars, label in known_labels:
            if rating == stars:
                return label

        return "unknown"
|
||||
|
||||
|
||||
__all__ = (
|
||||
"VanillaSA",
|
||||
"VanillaReviewSA",
|
||||
|
|
|
@ -45,24 +45,9 @@ def WORKING_SET_SIZE(val: str | None) -> int:
|
|||
|
||||
|
||||
@config.optional()
|
||||
def TRAINING_SET_SIZE(val: str | None) -> int:
|
||||
def DATA_SET_SIZE(val: str | None) -> int:
|
||||
"""
|
||||
The number of reviews from each category to fetch for the training set.
|
||||
|
||||
Defaults to `1000`.
|
||||
"""
|
||||
if val is None:
|
||||
return 1000
|
||||
try:
|
||||
return int(val)
|
||||
except ValueError:
|
||||
raise cfig.InvalidValueError("Not an int.")
|
||||
|
||||
|
||||
@config.optional()
|
||||
def TEST_SET_SIZE(val: str | None) -> int:
|
||||
"""
|
||||
The number of reviews to fetch for the test set.
|
||||
The number of reviews from each category to fetch for the datasets.
|
||||
|
||||
Defaults to `1000`.
|
||||
"""
|
||||
|
@ -79,9 +64,7 @@ __all__ = (
|
|||
"MONGO_HOST",
|
||||
"MONGO_PORT",
|
||||
"WORKING_SET_SIZE",
|
||||
"TRAINING_SET_SIZE",
|
||||
"TEST_SET_SIZE",
|
||||
"NLTK_DOUBLE_NEG_SWITCH",
|
||||
"DATA_SET_SIZE",
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -4,8 +4,9 @@ import pymongo.collection
|
|||
import contextlib
|
||||
import bson
|
||||
import logging
|
||||
import random
|
||||
|
||||
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE
|
||||
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -79,59 +80,47 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
|
|||
])
|
||||
|
||||
|
||||
def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
    Build a shuffled dataset made of 5-star and 1-star reviews.

    :param collection: The collection to sample the reviews from.
    :param amount: The number of reviews to request for each of the two ratings.
    :return: The sampled 5-star and 1-star reviews, in random order.
    """
    log.info("Building dataset with %d polar reviews...", amount * 2)

    # Request both samples before consuming either of them
    five_star = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
    one_star = sample_reviews_by_rating(collection, rating=1.0, amount=amount)

    # The merge order is irrelevant, as the dataset gets shuffled right below
    dataset = list(five_star)
    dataset.extend(one_star)

    # Shuffled just in case the order of the reviews affects the performance
    # TODO: does it actually?
    random.shuffle(dataset)

    return dataset
|
||||
|
||||
|
||||
def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
    """
    Get a list of shuffled reviews of any rating.

    :param collection: The collection to sample the reviews from.
    :param amount: The number of reviews to request for each of the five ratings, from ``1.0`` to ``5.0``.
    :return: The reviews sampled for all five ratings, in random order.
    """
    log.info("Building dataset with %d uniform reviews...", amount * 5)

    # Sample the required reviews
    terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
    negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount)
    mixed = sample_reviews_by_rating(collection, rating=3.0, amount=amount)
    positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
    great = sample_reviews_by_rating(collection, rating=5.0, amount=amount)

    # Merge all five ratings, not just two of them, or the dataset would not be uniform at all.
    # Randomness here does not matter, as the list gets shuffled right below.
    every = [*terrible, *negative, *mixed, *positive, *great]

    # Shuffle the dataset, just in case it affects the performance
    # TODO: does it actually?
    random.shuffle(every)

    return every
|
||||
|
||||
|
||||
__all__ = (
|
||||
|
@ -140,7 +129,5 @@ __all__ = (
|
|||
"mongo_reviews_collection_from_config",
|
||||
"sample_reviews",
|
||||
"sample_reviews_by_rating",
|
||||
"sample_reviews_by_rating_polar",
|
||||
"get_training_reviews",
|
||||
"get_test_reviews",
|
||||
"get_reviews_dataset_polar",
|
||||
)
|
||||
|
|
Loading…
Reference in a new issue