1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2025-02-16 14:13:59 +00:00

Make some progress

This commit is contained in:
Steffo 2023-02-01 17:46:25 +01:00
parent 0f37d206a1
commit 2f7237ebfa
Signed by: steffo
GPG key ID: 2A24051445686895
9 changed files with 243 additions and 28 deletions

47
poetry.lock generated
View file

@ -47,6 +47,24 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
] ]
[[package]]
name = "coloredlogs"
version = "15.0.1"
description = "Colored terminal output for Python's logging module"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
files = [
{file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"},
{file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"},
]
[package.dependencies]
humanfriendly = ">=9.1"
[package.extras]
cron = ["capturer (>=2.4)"]
[[package]] [[package]]
name = "dnspython" name = "dnspython"
version = "2.3.0" version = "2.3.0"
@ -68,6 +86,21 @@ idna = ["idna (>=2.1,<4.0)"]
trio = ["trio (>=0.14,<0.23)"] trio = ["trio (>=0.14,<0.23)"]
wmi = ["wmi (>=1.5.1,<2.0.0)"] wmi = ["wmi (>=1.5.1,<2.0.0)"]
[[package]]
name = "humanfriendly"
version = "10.0"
description = "Human friendly output for text interfaces using Python"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
files = [
{file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"},
{file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"},
]
[package.dependencies]
pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""}
[[package]] [[package]]
name = "joblib" name = "joblib"
version = "1.2.0" version = "1.2.0"
@ -247,6 +280,18 @@ ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identit
snappy = ["python-snappy"] snappy = ["python-snappy"]
zstd = ["zstandard"] zstd = ["zstandard"]
[[package]]
name = "pyreadline3"
version = "3.4.1"
description = "A python implementation of GNU readline."
category = "main"
optional = false
python-versions = "*"
files = [
{file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"},
{file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"},
]
[[package]] [[package]]
name = "regex" name = "regex"
version = "2022.10.31" version = "2022.10.31"
@ -369,4 +414,4 @@ telegram = ["requests"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "baf82d3820a17be0b6fcd6b31d1627a6ab198c9c094a2ee0fc2b0caca2659a72" content-hash = "4545b19bfa3d0ad9a489c2bee037d0c317dd34ba5b9745375d3566d5fc68ef55"

View file

@ -135,6 +135,7 @@ python = "^3.10"
pymongo = "^4.3.3" pymongo = "^4.3.3"
nltk = "^3.8.1" nltk = "^3.8.1"
cfig = {extras = ["cli"], version = "^0.3.0"} cfig = {extras = ["cli"], version = "^0.3.0"}
coloredlogs = "^15.0.1"

View file

@ -1,9 +1,15 @@
from .database import create_mongo_client_from_config from .config import config
from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla
from .log import install_log_handler
def main(): def main():
pass model = create_model_vanilla()
train_model_vanilla(model)
evaluate_model_vanilla(model)
if __name__ == "__main__": if __name__ == "__main__":
install_log_handler()
config.proxies.resolve()
main() main()

View file

@ -1,14 +0,0 @@
import nltk
import nltk.sentiment
def create_sentiment_analysis_model() -> ntlk.sentiment.SentimentAnalyzer:
analyzer = nltk.sentiment.SentimentAnalyzer()
def train():
...
def test():
...

View file

View file

@ -0,0 +1,58 @@
import nltk
import nltk.classify
import nltk.sentiment
import nltk.sentiment.util
import logging
from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set
log = logging.getLogger(__name__)
def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer:
    """
    Instantiate a new NLTK sentiment analyzer, passing no constructor arguments.

    :return: The newly created `nltk.sentiment.SentimentAnalyzer`.
    """
    log.debug("Creating model...")
    analyzer = nltk.sentiment.SentimentAnalyzer()
    log.debug("Created model %s!", analyzer)
    return analyzer
def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None:
    """
    Train ``model`` on the training set fetched from the ``reviews`` collection.

    The steps are: fetch the training set, mark negations, collect all words, keep the unigrams
    appearing at least 4 times, register them as a feature extractor on the model, apply the
    features to the training set, and finally train a Naive Bayes classifier.

    :param model: The analyzer to configure and train; it is mutated in place.
    """
    # TODO: This doesn't work yet
    with mongo_reviews_collection_from_config() as reviews:
        training_set = get_reviews_training_set(reviews)

        log.debug("Marking negations...")
        # NOTE(review): this fully iterates `training_set`; if the database helper returns a one-shot
        # iterator, the `apply_features` call below will see no documents — confirm.
        training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set))

        log.debug("Extracting tokens...")
        training_tokens = model.all_words(training_negated_set, labeled=False)

        log.debug("Counting unigrams...")
        # Only words seen at least `min_freq` times become features.
        training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4)

        log.debug("Configuring model features...")
        model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams)
        # NOTE(review): features are applied to the original `training_set`, not to
        # `training_negated_set` from which the unigrams were counted — verify this is intended.
        training_set = model.apply_features(documents=training_set)

        log.info("Training model...")
        model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set)
def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer):
    """
    Evaluate ``model`` against the test set fetched from the ``reviews`` collection.

    :param model: The analyzer to evaluate.
    :return: Whatever ``model.evaluate`` returns for the test set.
    """
    with mongo_reviews_collection_from_config() as reviews:
        test_set = get_reviews_test_set(reviews)

        log.info("Evaluating model...")
        # NOTE(review): the test set is passed exactly as returned by the database helper;
        # confirm it is already in the shape `evaluate` expects.
        results = model.evaluate(test_set)

    # Report the outcome through the logger instead of dropping into the debugger, so that
    # non-interactive runs are not blocked waiting on a debugger prompt.
    log.info("Evaluation results: %s", results)
    # TODO: Further processing of the evaluation results
    return results
__all__ = (
"create_model_vanilla",
"train_model_vanilla",
"evaluate_model_vanilla",
)

View file

@ -16,7 +16,7 @@ def MONGO_PORT(val: str | None) -> int:
""" """
The port of the MongoDB database to connect to. The port of the MongoDB database to connect to.
""" """
if not val: if val is None:
return 27017 return 27017
try: try:
return int(val) return int(val)
@ -24,12 +24,38 @@ def MONGO_PORT(val: str | None) -> int:
raise cfig.InvalidValueError("Not an int.") raise cfig.InvalidValueError("Not an int.")
@config.optional()
def SAMPLE_MODE(val: str | None) -> str:
    """
    Whether `$sample` or `$limit` should be used to aggregate the training and test sets.
    `$limit` is much faster, but not truly random, while `$sample` is completely random.
    """
    # No value supplied: default to `$sample`.
    if val is None:
        return "$sample"
    # Accept only the two known modes, passing them through unchanged.
    if val in ("$sample", "$limit"):
        return val
    raise cfig.InvalidValueError("Neither $sample or $limit.")
@config.optional() @config.optional()
def TRAINING_SET_SIZE(val: str | None) -> int: def TRAINING_SET_SIZE(val: str | None) -> int:
""" """
The number of reviews from each category to fetch for the training set. The number of reviews from each category to fetch for the training set.
""" """
if not val: if val is None:
return 1000
try:
return int(val)
except ValueError:
raise cfig.InvalidValueError("Not an int.")
@config.optional()
def TEST_SET_SIZE(val: str | None) -> int:
"""
The number of reviews to fetch for the test set.
"""
if val is None:
return 1000 return 1000
try: try:
return int(val) return int(val)
@ -41,4 +67,7 @@ __all__ = (
"config", "config",
"MONGO_HOST", "MONGO_HOST",
"MONGO_PORT", "MONGO_PORT",
"SAMPLE_MODE",
"TRAINING_SET_SIZE",
"TEST_SET_SIZE",
) )

View file

@ -3,8 +3,12 @@ import pymongo
import pymongo.collection import pymongo.collection
import contextlib import contextlib
import bson import bson
import logging
import random
from .config import MONGO_HOST, MONGO_PORT from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE
log = logging.getLogger(__name__)
class Review(t.TypedDict): class Review(t.TypedDict):
@ -25,12 +29,18 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
""" """
Create a new MongoDB client and yield it. Create a new MongoDB client and yield it.
""" """
log.debug("Opening connection to MongoDB...")
client = pymongo.MongoClient( client = pymongo.MongoClient(
host=MONGO_HOST.__resolved__, host=MONGO_HOST.__wrapped__,
port=MONGO_PORT.__resolved__, port=MONGO_PORT.__wrapped__,
) )
log.info("Opened connection to MongoDB: %s", client)
yield client yield client
log.info("Closing connection to MongoDB: %s", client)
client.close() client.close()
log.debug("Closed connection to MongoDB!")
@contextlib.contextmanager @contextlib.contextmanager
@ -39,16 +49,39 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it. Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
""" """
with mongo_client_from_config() as db: with mongo_client_from_config() as db:
yield db.reviews.reviews log.debug("Accessing the reviews collection...")
collection = db.reviews.reviews
log.debug("Collection accessed successfully: %s", collection)
yield collection
def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list:
    """
    Create pipeline stages for sampling random documents, either with true randomness or by skipping a random amount of them.

    :param collection: The collection the pipeline will run on; only queried for its estimated size in ``$limit`` mode.
    :param amount: The number of documents the returned stages should select.
    :return: A list of aggregation pipeline stages, to be appended to a pipeline.
    :raises ValueError: If ``SAMPLE_MODE`` holds a value other than ``$sample`` or ``$limit``.
    """
    mode = SAMPLE_MODE.__wrapped__

    if mode == "$sample":
        return [
            {"$sample": {"size": amount}},
        ]
    elif mode == "$limit":
        log.warning("SAMPLE_MODE is set to $limit, sampling documents using $skip and $limit.")
        count = collection.estimated_document_count(maxTimeMS=100)
        # Cap the offset so that up to `amount` documents remain after the skipped ones;
        # skipping up to `count` itself could leave nothing for `$limit` to return.
        # NOTE(review): the estimate is taken on the collection itself, so when these stages
        # follow a `$match` fewer documents may reach `$skip` than `count` — verify.
        skip = random.randint(0, max(0, count - amount))
        return [
            {"$skip": skip},
            {"$limit": amount},
        ]
    else:
        # Report the offending value rather than the configuration proxy object.
        raise ValueError("Unknown sample mode", mode)
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]: def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
""" """
Get ``amount`` random reviews from the ``reviews`` collection. Get ``amount`` random reviews from the ``reviews`` collection.
""" """
log.debug("Getting a sample of %d reviews...", amount)
return reviews.aggregate([ return reviews.aggregate([
{"$sample": {"size": amount}} *pipeline_sample(reviews, amount),
]) ])
@ -56,10 +89,11 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
""" """
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection. Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
""" """
log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
return reviews.aggregate([ return reviews.aggregate([
{"$match": {"overall": rating}}, {"$match": {"overall": rating}},
{"$sample": {"size": amount}}, *pipeline_sample(reviews, amount),
]) ])
@ -67,6 +101,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
""" """
Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection. Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
""" """
log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
return reviews.aggregate([ return reviews.aggregate([
{"$match": {"$match":
@ -77,14 +112,18 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
] ]
}, },
}, },
{"$sample": {"size": amount}}, *pipeline_sample(reviews, amount),
]) ])
def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]: def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
""" """
Get the subset of reviews that should act as training set. Get the subset of reviews that should act as training set.
""" """
log.info("Building training set...")
# Get the amount from the config
amount: int = TRAINING_SET_SIZE.__wrapped__
# Handle odd numbers # Handle odd numbers
positive_amount: int = amount // 2 positive_amount: int = amount // 2
@ -100,9 +139,25 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int
return both return both
def get_reviews_test_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]: def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
""" """
Get the subset of reviews that should act as test set. Get the subset of reviews that should act as test set.
""" """
log.info("Building test set...")
amount: int = TEST_SET_SIZE.__wrapped__
return sample_reviews_by_rating_polar(reviews, amount) return sample_reviews_by_rating_polar(reviews, amount)
__all__ = (
"Review",
"mongo_client_from_config",
"mongo_reviews_collection_from_config",
"sample_reviews",
"sample_reviews_by_rating",
"sample_reviews_by_rating_polar",
"get_reviews_training_set",
"get_reviews_test_set",
)

35
unimore_bda_6/log.py Normal file
View file

@ -0,0 +1,35 @@
import logging
import coloredlogs
log = logging.getLogger(__name__)
def install_log_handler(logger: logging.Logger = None):
    """
    Install a `coloredlogs` handler on a logger, then log a confirmation message.

    :param logger: The logger to install the handler on; when ``None``, the ``unimore_bda_6`` logger is used.
    """
    target = logging.getLogger("unimore_bda_6") if logger is None else logger

    # Colors applied to the whole record, by level.
    level_styles = {
        "debug": {"color": "white"},
        "info": {"color": "cyan"},
        "warning": {"color": "yellow"},
        "error": {"color": "red"},
        "critical": {"color": "red", "bold": True},
    }
    # Colors applied to the individual fields of the format string.
    field_styles = {
        "asctime": {"color": "magenta"},
        "levelname": {"color": "blue", "bold": True},
        "name": {"color": "blue"},
    }

    coloredlogs.install(
        logger=target,
        level="DEBUG",
        fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
        style="{",
        level_styles=level_styles,
        field_styles=field_styles,
        isatty=True,
    )

    log.info("Installed custom log handler!")
__all__ = (
"install_log_handler",
)