mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-21 23:44:19 +00:00
Make some more progress for the night
Many things still do not work properly
This commit is contained in:
parent
b347031663
commit
4344752cf6
6 changed files with 80 additions and 73 deletions
1
.vscode/launch.json
vendored
1
.vscode/launch.json
vendored
|
@ -12,6 +12,7 @@
|
||||||
"justMyCode": true,
|
"justMyCode": true,
|
||||||
"env": {
|
"env": {
|
||||||
"NLTK_DATA": "./data/nltk",
|
"NLTK_DATA": "./data/nltk",
|
||||||
|
"DATA_SET_SIZE": "100",
|
||||||
},
|
},
|
||||||
"cwd": "${workspaceFolder}",
|
"cwd": "${workspaceFolder}",
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from .config import config
|
from .config import config, DATA_SET_SIZE
|
||||||
from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
|
from .database import mongo_reviews_collection_from_config, get_reviews_dataset_polar, get_reviews_dataset_uniform
|
||||||
from .analysis.vanilla import VanillaReviewSA
|
from .analysis.vanilla import VanillaReviewSA, VanillaUniformReviewSA
|
||||||
from .analysis.potts import PottsReviewSA
|
from .analysis.potts import PottsReviewSA
|
||||||
from .log import install_log_handler
|
from .log import install_log_handler
|
||||||
|
|
||||||
|
@ -11,16 +11,26 @@ log = logging.getLogger(__name__)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
with mongo_reviews_collection_from_config() as reviews:
|
with mongo_reviews_collection_from_config() as reviews:
|
||||||
training_reviews = get_training_reviews(collection=reviews)
|
reviews_polar_training = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
||||||
test_reviews = get_test_reviews(collection=reviews)
|
reviews_polar_evaluation = get_reviews_dataset_polar(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
||||||
|
reviews_uniform_training = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
||||||
|
reviews_uniform_evaluation = get_reviews_dataset_uniform(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
|
||||||
|
|
||||||
vanilla = VanillaReviewSA()
|
vanilla_polar = VanillaReviewSA()
|
||||||
vanilla.train(training_reviews)
|
vanilla_polar.train(reviews_polar_training)
|
||||||
log.info("Vanilla evaluation results: %s", vanilla.evaluate(test_reviews))
|
log.info("Vanilla polar evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
|
||||||
|
|
||||||
|
potts_polar = PottsReviewSA()
|
||||||
|
potts_polar.train(reviews_polar_training)
|
||||||
|
log.info("Potts polar evaluation results: %s", potts_polar.evaluate(reviews_polar_evaluation))
|
||||||
|
|
||||||
|
vanilla_uniform = VanillaUniformReviewSA()
|
||||||
|
vanilla_uniform.train(reviews_uniform_training)
|
||||||
|
log.info("Vanilla uniform evaluation results: %s", vanilla_polar.evaluate(reviews_polar_evaluation))
|
||||||
|
|
||||||
|
while True:
|
||||||
|
print(vanilla_uniform.use(input("> ")))
|
||||||
|
|
||||||
potts = PottsReviewSA()
|
|
||||||
potts.train(training_reviews)
|
|
||||||
log.info("Potts evaluation results: %s", potts.evaluate(test_reviews))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from ..vendor.potts import Tokenizer
|
from ..vendor.potts import Tokenizer
|
||||||
from .vanilla import VanillaSA, VanillaReviewSA
|
from .vanilla import VanillaSA, VanillaReviewSA, VanillaUniformReviewSA
|
||||||
|
|
||||||
|
|
||||||
class PottsSA(VanillaSA):
|
class PottsSA(VanillaSA):
|
||||||
|
@ -24,6 +24,12 @@ class PottsReviewSA(VanillaReviewSA, PottsSA):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class PottsUniformReviewSA(VanillaUniformReviewSA, PottsSA):
    """
    A `PottsSA` that sorts reviews into 5 buckets rather than 2.

    Declares no members of its own: it only combines `VanillaUniformReviewSA` with `PottsSA`.
    """
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
"PottsSA",
|
"PottsSA",
|
||||||
"PottsReviewSA",
|
"PottsReviewSA",
|
||||||
|
|
|
@ -108,6 +108,26 @@ class VanillaReviewSA(VanillaSA):
|
||||||
return self._use_with_tokens(self._tokenize_text(text))
|
return self._use_with_tokens(self._tokenize_text(text))
|
||||||
|
|
||||||
|
|
||||||
|
class VanillaUniformReviewSA(VanillaReviewSA):
    """
    A `VanillaReviewSA` whose labels are assigned per whole-star rating.

    NOTE(review): presumably `_rating_to_label` replaces a mapping of the same name on the parent class; confirm there.
    """

    @staticmethod
    def _rating_to_label(rating: float) -> str:
        """
        Return the label associated with ``rating``.

        Any rating that is not equal to one of the known values is labelled ``"unknown"``.
        """
        known_labels = (
            (0.0, "abysmal"),
            (1.0, "terrible"),
            (2.0, "negative"),
            (3.0, "mixed"),
            (4.0, "positive"),
            (5.0, "great"),
        )

        # Plain equality, tried in order, so that e.g. the int 4 is labelled like the float 4.0
        for value, label in known_labels:
            if rating == value:
                return label

        return "unknown"
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
"VanillaSA",
|
"VanillaSA",
|
||||||
"VanillaReviewSA",
|
"VanillaReviewSA",
|
||||||
|
|
|
@ -45,24 +45,9 @@ def WORKING_SET_SIZE(val: str | None) -> int:
|
||||||
|
|
||||||
|
|
||||||
@config.optional()
|
@config.optional()
|
||||||
def TRAINING_SET_SIZE(val: str | None) -> int:
|
def DATA_SET_SIZE(val: str | None) -> int:
|
||||||
"""
|
"""
|
||||||
The number of reviews from each category to fetch for the training set.
|
The number of reviews from each category to fetch for the datasets.
|
||||||
|
|
||||||
Defaults to `1000`.
|
|
||||||
"""
|
|
||||||
if val is None:
|
|
||||||
return 1000
|
|
||||||
try:
|
|
||||||
return int(val)
|
|
||||||
except ValueError:
|
|
||||||
raise cfig.InvalidValueError("Not an int.")
|
|
||||||
|
|
||||||
|
|
||||||
@config.optional()
|
|
||||||
def TEST_SET_SIZE(val: str | None) -> int:
|
|
||||||
"""
|
|
||||||
The number of reviews to fetch for the test set.
|
|
||||||
|
|
||||||
Defaults to `1000`.
|
Defaults to `1000`.
|
||||||
"""
|
"""
|
||||||
|
@ -79,9 +64,7 @@ __all__ = (
|
||||||
"MONGO_HOST",
|
"MONGO_HOST",
|
||||||
"MONGO_PORT",
|
"MONGO_PORT",
|
||||||
"WORKING_SET_SIZE",
|
"WORKING_SET_SIZE",
|
||||||
"TRAINING_SET_SIZE",
|
"DATA_SET_SIZE",
|
||||||
"TEST_SET_SIZE",
|
|
||||||
"NLTK_DOUBLE_NEG_SWITCH",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,9 @@ import pymongo.collection
|
||||||
import contextlib
|
import contextlib
|
||||||
import bson
|
import bson
|
||||||
import logging
|
import logging
|
||||||
|
import random
|
||||||
|
|
||||||
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, TRAINING_SET_SIZE, TEST_SET_SIZE
|
from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -79,59 +80,47 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
|
||||||
])
|
])
|
||||||
|
|
||||||
|
|
||||||
def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
|
def get_reviews_dataset_polar(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
|
||||||
"""
|
"""
|
||||||
Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
|
Get a list of shuffled 1-star and 5-star reviews.
|
||||||
"""
|
"""
|
||||||
log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
|
log.info("Building dataset with %d polar reviews...", amount * 2)
|
||||||
|
|
||||||
return reviews.aggregate([
|
|
||||||
{"$limit": WORKING_SET_SIZE.__wrapped__},
|
|
||||||
{"$match":
|
|
||||||
{"$or":
|
|
||||||
[
|
|
||||||
{"overall": 1.0},
|
|
||||||
{"overall": 5.0},
|
|
||||||
]
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{"$sample": {"size": amount}},
|
|
||||||
])
|
|
||||||
|
|
||||||
|
|
||||||
def get_training_reviews(collection: pymongo.collection.Collection) -> list[Review]:
|
|
||||||
"""
|
|
||||||
Get the subset of reviews that should act as training set.
|
|
||||||
"""
|
|
||||||
log.info("Building training set...")
|
|
||||||
|
|
||||||
# Get the amount from the config
|
|
||||||
amount: int = TRAINING_SET_SIZE.__wrapped__
|
|
||||||
|
|
||||||
# Handle odd numbers
|
|
||||||
positive_amount: int = amount // 2
|
|
||||||
negative_amount: int = amount - positive_amount
|
|
||||||
|
|
||||||
# Sample the required reviews
|
# Sample the required reviews
|
||||||
positive = sample_reviews_by_rating(collection, 5.0, positive_amount)
|
positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
|
||||||
negative = sample_reviews_by_rating(collection, 1.0, negative_amount)
|
negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
|
||||||
|
|
||||||
# Randomness here does not matter, so just merge the lists
|
# Randomness here does not matter, so just merge the lists
|
||||||
both = [*positive, *negative]
|
both = [*positive, *negative]
|
||||||
|
|
||||||
|
# Shuffle the dataset, just in case it affects the performance
|
||||||
|
# TODO: does it actually?
|
||||||
|
random.shuffle(both)
|
||||||
|
|
||||||
return both
|
return both
|
||||||
|
|
||||||
|
|
||||||
def get_test_reviews(collection: pymongo.collection.Collection) -> list[Review]:
|
def get_reviews_dataset_uniform(collection: pymongo.collection.Collection, amount: int) -> list[Review]:
|
||||||
"""
|
"""
|
||||||
Get the subset of reviews that should act as test set.
|
Get a list of shuffled reviews of any rating.
|
||||||
"""
|
"""
|
||||||
|
log.info("Building dataset with %d uniform reviews...", amount * 5)
|
||||||
|
|
||||||
log.info("Building test set...")
|
# Sample the required reviews
|
||||||
|
terrible = sample_reviews_by_rating(collection, rating=1.0, amount=amount)
|
||||||
|
negative = sample_reviews_by_rating(collection, rating=2.0, amount=amount)
|
||||||
|
mixed = sample_reviews_by_rating(collection, rating=3.0, amount=amount)
|
||||||
|
positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
|
||||||
|
great = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
|
||||||
|
|
||||||
amount: int = TEST_SET_SIZE.__wrapped__
|
# Randomness here does not matter, so just merge the lists
|
||||||
|
both = [*positive, *negative]
|
||||||
|
|
||||||
return list(sample_reviews_by_rating_polar(collection, amount))
|
# Shuffle the dataset, just in case it affects the performance
|
||||||
|
# TODO: does it actually?
|
||||||
|
random.shuffle(both)
|
||||||
|
|
||||||
|
return both
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
|
@ -140,7 +129,5 @@ __all__ = (
|
||||||
"mongo_reviews_collection_from_config",
|
"mongo_reviews_collection_from_config",
|
||||||
"sample_reviews",
|
"sample_reviews",
|
||||||
"sample_reviews_by_rating",
|
"sample_reviews_by_rating",
|
||||||
"sample_reviews_by_rating_polar",
|
"get_reviews_dataset_polar",
|
||||||
"get_training_reviews",
|
|
||||||
"get_test_reviews",
|
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue