1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-21 23:44:19 +00:00

Make some progress

This commit is contained in:
Steffo 2023-02-01 17:46:25 +01:00
parent 0f37d206a1
commit 2f7237ebfa
Signed by: steffo
GPG key ID: 2A24051445686895
9 changed files with 243 additions and 28 deletions

47
poetry.lock generated
View file

@ -47,6 +47,24 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
[[package]]
name = "coloredlogs"
version = "15.0.1"
description = "Colored terminal output for Python's logging module"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
files = [
{file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"},
{file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"},
]
[package.dependencies]
humanfriendly = ">=9.1"
[package.extras]
cron = ["capturer (>=2.4)"]
[[package]]
name = "dnspython"
version = "2.3.0"
@ -68,6 +86,21 @@ idna = ["idna (>=2.1,<4.0)"]
trio = ["trio (>=0.14,<0.23)"]
wmi = ["wmi (>=1.5.1,<2.0.0)"]
[[package]]
name = "humanfriendly"
version = "10.0"
description = "Human friendly output for text interfaces using Python"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
files = [
{file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"},
{file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"},
]
[package.dependencies]
pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""}
[[package]]
name = "joblib"
version = "1.2.0"
@ -247,6 +280,18 @@ ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identit
snappy = ["python-snappy"]
zstd = ["zstandard"]
[[package]]
name = "pyreadline3"
version = "3.4.1"
description = "A python implementation of GNU readline."
category = "main"
optional = false
python-versions = "*"
files = [
{file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"},
{file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"},
]
[[package]]
name = "regex"
version = "2022.10.31"
@ -369,4 +414,4 @@ telegram = ["requests"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "baf82d3820a17be0b6fcd6b31d1627a6ab198c9c094a2ee0fc2b0caca2659a72"
content-hash = "4545b19bfa3d0ad9a489c2bee037d0c317dd34ba5b9745375d3566d5fc68ef55"

View file

@ -135,6 +135,7 @@ python = "^3.10"
pymongo = "^4.3.3"
nltk = "^3.8.1"
cfig = {extras = ["cli"], version = "^0.3.0"}
coloredlogs = "^15.0.1"

View file

@ -1,9 +1,15 @@
from .database import create_mongo_client_from_config
from .config import config
from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla
from .log import install_log_handler
def main():
pass
model = create_model_vanilla()
train_model_vanilla(model)
evaluate_model_vanilla(model)
if __name__ == "__main__":
install_log_handler()
config.proxies.resolve()
main()

View file

@ -1,14 +0,0 @@
import nltk
import nltk.sentiment
def create_sentiment_analysis_model() -> ntlk.sentiment.SentimentAnalyzer:
analyzer = nltk.sentiment.SentimentAnalyzer()
def train():
...
def test():
...

View file

View file

@ -0,0 +1,58 @@
import nltk
import nltk.classify
import nltk.sentiment
import nltk.sentiment.util
import logging
from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set
log = logging.getLogger(__name__)
def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer:
    """
    Build a new NLTK ``SentimentAnalyzer`` with no constructor arguments and return it.

    A debug message is logged before and after the analyzer is created.
    """
    log.debug("Creating model...")
    analyzer = nltk.sentiment.SentimentAnalyzer()
    log.debug("Created model %s!", analyzer)
    return analyzer
def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None:
    """
    Fetch the training set, register a unigram feature extractor on ``model``, and train it.

    The steps are: mark negations, collect all words, select the unigrams with ``min_freq=4``,
    add ``extract_unigram_feats`` to the model, apply the features, then call ``model.train``
    with ``NaiveBayesClassifier.train`` as the trainer.

    :param model: The analyzer that receives the feature extractor and is trained.
    """
    # TODO: This doesn't work yet
    with mongo_reviews_collection_from_config() as reviews:
        training_set = get_reviews_training_set(reviews)

        log.debug("Marking negations...")
        # NOTE(review): the raw items of the training set are handed straight to mark_negation;
        # confirm they are in the shape NLTK expects.
        training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set))

        log.debug("Extracting tokens...")
        training_tokens = model.all_words(training_negated_set, labeled=False)

        log.debug("Counting unigrams...")
        training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4)

        log.debug("Configuring model features...")
        model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams)

        # NOTE(review): training_set was already iterated once by list(map(...)) above; if it is a
        # one-shot iterable this second pass sees no documents. Also note that the original,
        # non-negated set is used here, not training_negated_set - confirm that is intended.
        training_set = model.apply_features(documents=training_set)

        log.info("Training model...")
        model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set)
def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer):
    """
    Evaluate ``model`` against the test set fetched from MongoDB.

    :param model: The analyzer to evaluate.
    :return: Whatever ``model.evaluate`` returns for the test set
        (per the NLTK documentation, a mapping of metric names to values).
    """
    with mongo_reviews_collection_from_config() as reviews:
        test_set = get_reviews_test_set(reviews)

        log.info("Evaluating model...")
        # Evaluate while the connection is still open: the test set may be a lazy
        # iterable that keeps reading from the database as it is consumed.
        results = model.evaluate(test_set)

    # TODO
    # The results are logged and returned instead of dropping into the debugger, so the
    # function can run unattended and callers can inspect the metrics.
    log.info("Evaluation results: %s", results)
    return results
__all__ = (
"create_model_vanilla",
"train_model_vanilla",
"evaluate_model_vanilla",
)

View file

@ -16,7 +16,7 @@ def MONGO_PORT(val: str | None) -> int:
"""
The port of the MongoDB database to connect to.
"""
if not val:
if val is None:
return 27017
try:
return int(val)
@ -24,12 +24,38 @@ def MONGO_PORT(val: str | None) -> int:
raise cfig.InvalidValueError("Not an int.")
@config.optional()
def SAMPLE_MODE(val: str | None) -> str:
    """
    Whether `$sample` or `$limit` should be used to aggregate the training and test sets.
    `$limit` is much faster, but not truly random, while `$sample` is completely random.
    """
    # An unset value falls back to the truly random strategy.
    if val is None:
        return "$sample"
    # The two known stage names pass through unchanged; anything else is rejected.
    if val in ["$sample", "$limit"]:
        return val
    raise cfig.InvalidValueError("Neither $sample or $limit.")
@config.optional()
def TRAINING_SET_SIZE(val: str | None) -> int:
"""
The number of reviews from each category to fetch for the training set.
"""
if not val:
if val is None:
return 1000
try:
return int(val)
except ValueError:
raise cfig.InvalidValueError("Not an int.")
@config.optional()
def TEST_SET_SIZE(val: str | None) -> int:
"""
The number of reviews to fetch for the test set.
"""
if val is None:
return 1000
try:
return int(val)
@ -41,4 +67,7 @@ __all__ = (
"config",
"MONGO_HOST",
"MONGO_PORT",
"SAMPLE_MODE",
"TRAINING_SET_SIZE",
"TEST_SET_SIZE",
)

View file

@ -3,8 +3,12 @@ import pymongo
import pymongo.collection
import contextlib
import bson
import logging
import random
from .config import MONGO_HOST, MONGO_PORT
from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE
log = logging.getLogger(__name__)
class Review(t.TypedDict):
@ -25,12 +29,18 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
"""
Create a new MongoDB client and yield it.
"""
log.debug("Opening connection to MongoDB...")
client = pymongo.MongoClient(
host=MONGO_HOST.__resolved__,
port=MONGO_PORT.__resolved__,
host=MONGO_HOST.__wrapped__,
port=MONGO_PORT.__wrapped__,
)
log.info("Opened connection to MongoDB: %s", client)
yield client
log.info("Closing connection to MongoDB: %s", client)
client.close()
log.debug("Closed connection to MongoDB!")
@contextlib.contextmanager
@ -39,16 +49,39 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
"""
with mongo_client_from_config() as db:
yield db.reviews.reviews
log.debug("Accessing the reviews collection...")
collection = db.reviews.reviews
log.debug("Collection accessed successfully: %s", collection)
yield collection
def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list:
    """
    Create pipeline stages for sampling random documents, either with true randomness or by skipping a random amount of them.

    :param collection: The collection the stages will run on; only queried (for its estimated size) in ``$limit`` mode.
    :param amount: The number of documents the returned stages should let through.
    :return: A list of aggregation stages, meant to be spliced into a larger pipeline.
    :raises ValueError: If the configured sample mode is neither ``$sample`` nor ``$limit``.
    """
    mode = SAMPLE_MODE.__wrapped__

    if mode == "$sample":
        return [
            {"$sample": {"size": amount}},
        ]

    if mode == "$limit":
        log.warning("SAMPLE_MODE is set to $limit, sampling documents using $skip and $limit.")
        total = collection.estimated_document_count(maxTimeMS=100)
        # random.randint is inclusive on both ends: cap the offset so that `amount` documents
        # are still left after the skip (or start from 0 if the collection is smaller than that).
        # NOTE(review): the estimate covers the whole collection, so when these stages follow a
        # $match the offset may still exceed the number of matching documents.
        skip = random.randint(0, max(0, total - amount))
        return [
            {"$skip": skip},
            {"$limit": amount},
        ]

    # Report the resolved value rather than the configuration proxy object.
    raise ValueError("Unknown sample mode", mode)
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
"""
Get ``amount`` random reviews from the ``reviews`` collection.
"""
log.debug("Getting a sample of %d reviews...", amount)
return reviews.aggregate([
{"$sample": {"size": amount}}
*pipeline_sample(reviews, amount),
])
@ -56,10 +89,11 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
"""
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
"""
log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
return reviews.aggregate([
{"$match": {"overall": rating}},
{"$sample": {"size": amount}},
*pipeline_sample(reviews, amount),
])
@ -67,6 +101,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
"""
Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
"""
log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
return reviews.aggregate([
{"$match":
@ -77,14 +112,18 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
]
},
},
{"$sample": {"size": amount}},
*pipeline_sample(reviews, amount),
])
def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
"""
Get the subset of reviews that should act as training set.
"""
log.info("Building training set...")
# Get the amount from the config
amount: int = TRAINING_SET_SIZE.__wrapped__
# Handle odd numbers
positive_amount: int = amount // 2
@ -100,9 +139,25 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int
return both
def get_reviews_test_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
"""
Get the subset of reviews that should act as test set.
"""
log.info("Building test set...")
amount: int = TEST_SET_SIZE.__wrapped__
return sample_reviews_by_rating_polar(reviews, amount)
__all__ = (
"Review",
"mongo_client_from_config",
"mongo_reviews_collection_from_config",
"sample_reviews",
"sample_reviews_by_rating",
"sample_reviews_by_rating_polar",
"get_reviews_training_set",
"get_reviews_test_set",
)

35
unimore_bda_6/log.py Normal file
View file

@ -0,0 +1,35 @@
import logging
import coloredlogs
log = logging.getLogger(__name__)
def install_log_handler(logger: logging.Logger | None = None) -> None:
    """
    Install a colored log handler on the given logger through ``coloredlogs``.

    The handler is installed at ``DEBUG`` level with a ``{``-style format made of timestamp,
    logger name, level name and message, and ``isatty=True`` is passed to ``coloredlogs``.

    :param logger: The logger to install the handler on; if ``None``, the ``unimore_bda_6`` logger is used.
    """
    if logger is None:
        logger = logging.getLogger("unimore_bda_6")

    coloredlogs.install(
        logger=logger,
        level="DEBUG",
        fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
        style="{",
        # Color applied to each record, keyed by its level.
        level_styles=dict(
            debug=dict(color="white"),
            info=dict(color="cyan"),
            warning=dict(color="yellow"),
            error=dict(color="red"),
            critical=dict(color="red", bold=True),
        ),
        # Color applied to the individual fields of the format string.
        field_styles=dict(
            asctime=dict(color='magenta'),
            levelname=dict(color='blue', bold=True),
            name=dict(color='blue'),
        ),
        isatty=True,
    )

    log.info("Installed custom log handler!")
__all__ = (
"install_log_handler",
)