mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-21 23:44:19 +00:00
Make some progress
This commit is contained in:
parent
0f37d206a1
commit
2f7237ebfa
9 changed files with 243 additions and 28 deletions
47
poetry.lock
generated
47
poetry.lock
generated
|
@ -47,6 +47,24 @@ files = [
|
|||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "coloredlogs"
|
||||
version = "15.0.1"
|
||||
description = "Colored terminal output for Python's logging module"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
files = [
|
||||
{file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"},
|
||||
{file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
humanfriendly = ">=9.1"
|
||||
|
||||
[package.extras]
|
||||
cron = ["capturer (>=2.4)"]
|
||||
|
||||
[[package]]
|
||||
name = "dnspython"
|
||||
version = "2.3.0"
|
||||
|
@ -68,6 +86,21 @@ idna = ["idna (>=2.1,<4.0)"]
|
|||
trio = ["trio (>=0.14,<0.23)"]
|
||||
wmi = ["wmi (>=1.5.1,<2.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "humanfriendly"
|
||||
version = "10.0"
|
||||
description = "Human friendly output for text interfaces using Python"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
files = [
|
||||
{file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"},
|
||||
{file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""}
|
||||
|
||||
[[package]]
|
||||
name = "joblib"
|
||||
version = "1.2.0"
|
||||
|
@ -247,6 +280,18 @@ ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identit
|
|||
snappy = ["python-snappy"]
|
||||
zstd = ["zstandard"]
|
||||
|
||||
[[package]]
|
||||
name = "pyreadline3"
|
||||
version = "3.4.1"
|
||||
description = "A python implementation of GNU readline."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"},
|
||||
{file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "2022.10.31"
|
||||
|
@ -369,4 +414,4 @@ telegram = ["requests"]
|
|||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "baf82d3820a17be0b6fcd6b31d1627a6ab198c9c094a2ee0fc2b0caca2659a72"
|
||||
content-hash = "4545b19bfa3d0ad9a489c2bee037d0c317dd34ba5b9745375d3566d5fc68ef55"
|
||||
|
|
|
@ -135,6 +135,7 @@ python = "^3.10"
|
|||
pymongo = "^4.3.3"
|
||||
nltk = "^3.8.1"
|
||||
cfig = {extras = ["cli"], version = "^0.3.0"}
|
||||
coloredlogs = "^15.0.1"
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,9 +1,15 @@
|
|||
from .database import create_mongo_client_from_config
|
||||
from .config import config
|
||||
from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla
|
||||
from .log import install_log_handler
|
||||
|
||||
|
||||
def main():
    """
    Run the vanilla pipeline: create the model, train it, then evaluate it.
    """
    analyzer = create_model_vanilla()
    train_model_vanilla(analyzer)
    evaluate_model_vanilla(analyzer)


if __name__ == "__main__":
    # Logging is installed first so that the following steps are already logged.
    install_log_handler()
    # Resolve the configuration proxies before anything reads a config value.
    config.proxies.resolve()
    main()
|
||||
|
|
|
@ -1,14 +0,0 @@
|
|||
import nltk
|
||||
import nltk.sentiment
|
||||
|
||||
|
||||
def create_sentiment_analysis_model() -> ntlk.sentiment.SentimentAnalyzer:
|
||||
analyzer = nltk.sentiment.SentimentAnalyzer()
|
||||
|
||||
|
||||
def train():
|
||||
...
|
||||
|
||||
|
||||
def test():
|
||||
...
|
0
unimore_bda_6/analysis/__init__.py
Normal file
0
unimore_bda_6/analysis/__init__.py
Normal file
58
unimore_bda_6/analysis/vanilla.py
Normal file
58
unimore_bda_6/analysis/vanilla.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
import nltk
|
||||
import nltk.classify
|
||||
import nltk.sentiment
|
||||
import nltk.sentiment.util
|
||||
import logging
|
||||
|
||||
from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer:
    """
    Instantiate a fresh, untrained NLTK ``SentimentAnalyzer`` and return it.
    """
    log.debug("Creating model...")
    analyzer = nltk.sentiment.SentimentAnalyzer()
    log.debug("Created model %s!", analyzer)
    return analyzer
|
||||
|
||||
|
||||
def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None:
    """
    Train ``model`` with a Naive Bayes classifier on the reviews training set.

    The training set is fetched from the reviews collection, negations are marked, unigram features occurring
    at least 4 times are registered on the model, and the model is then trained.

    :param model: The analyzer to configure and train; it is modified in place.
    """
    # TODO: This doesn't work yet
    # NOTE(review): mark_negation / apply_features are documented by NLTK as working on token lists or
    # (tokens, label) pairs; whether the fetched reviews have that shape is not visible here - confirm.

    with mongo_reviews_collection_from_config() as reviews:
        # Materialize while the connection is still open: the set is iterated twice below, and a lazily
        # evaluated result could neither be re-read nor fetched once the context has exited.
        training_set = list(get_reviews_training_set(reviews))

    log.debug("Marking negations...")
    training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set))

    log.debug("Extracting tokens...")
    training_tokens = model.all_words(training_negated_set, labeled=False)

    log.debug("Counting unigrams...")
    training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4)

    log.debug("Configuring model features...")
    model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams)
    training_set = model.apply_features(documents=training_set)

    log.info("Training model...")
    model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set)
|
||||
|
||||
|
||||
def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer):
    """
    Evaluate ``model`` against the reviews test set.

    :param model: The analyzer to evaluate.
    :return: Whatever ``model.evaluate`` returns for the test set; it is also logged.
    """
    with mongo_reviews_collection_from_config() as reviews:
        # Materialize while the connection is still open, so the documents are not
        # fetched lazily after the context has exited.
        test_set = list(get_reviews_test_set(reviews))

    log.info("Evaluating model...")
    results = model.evaluate(test_set)

    # The results are logged and returned instead of stopping in an interactive debugger,
    # which would hang any non-interactive run.
    # TODO: present the evaluation results in a more useful way
    log.info("Evaluation results: %s", results)
    return results
|
||||
|
||||
|
||||
__all__ = (
|
||||
"create_model_vanilla",
|
||||
"train_model_vanilla",
|
||||
"evaluate_model_vanilla",
|
||||
)
|
|
@ -16,7 +16,7 @@ def MONGO_PORT(val: str | None) -> int:
|
|||
"""
|
||||
The port of the MongoDB database to connect to.
|
||||
"""
|
||||
if not val:
|
||||
if val is None:
|
||||
return 27017
|
||||
try:
|
||||
return int(val)
|
||||
|
@ -24,12 +24,38 @@ def MONGO_PORT(val: str | None) -> int:
|
|||
raise cfig.InvalidValueError("Not an int.")
|
||||
|
||||
|
||||
@config.optional()
def SAMPLE_MODE(val: str | None) -> str:
    """
    Whether `$sample` or `$limit` should be used to aggregate the training and test sets.
    `$limit` is much faster, but not truly random, while `$sample` is completely random.
    """
    # Unset: fall back to the default mode.
    if val is None:
        return "$sample"
    # Only the two known modes are accepted as-is.
    if val in ["$sample", "$limit"]:
        return val
    raise cfig.InvalidValueError("Neither $sample or $limit.")
|
||||
|
||||
|
||||
@config.optional()
def TRAINING_SET_SIZE(val: str | None) -> int:
    """
    The number of reviews from each category to fetch for the training set.
    """
    # Unset: fall back to the default size.
    if val is None:
        return 1000
    try:
        size = int(val)
    except ValueError:
        raise cfig.InvalidValueError("Not an int.")
    return size
|
||||
|
||||
|
||||
@config.optional()
|
||||
def TEST_SET_SIZE(val: str | None) -> int:
|
||||
"""
|
||||
The number of reviews to fetch for the test set.
|
||||
"""
|
||||
if val is None:
|
||||
return 1000
|
||||
try:
|
||||
return int(val)
|
||||
|
@ -41,4 +67,7 @@ __all__ = (
|
|||
"config",
|
||||
"MONGO_HOST",
|
||||
"MONGO_PORT",
|
||||
"SAMPLE_MODE",
|
||||
"TRAINING_SET_SIZE",
|
||||
"TEST_SET_SIZE",
|
||||
)
|
||||
|
|
|
@ -3,8 +3,12 @@ import pymongo
|
|||
import pymongo.collection
|
||||
import contextlib
|
||||
import bson
|
||||
import logging
|
||||
import random
|
||||
|
||||
from .config import MONGO_HOST, MONGO_PORT
|
||||
from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Review(t.TypedDict):
|
||||
|
@ -25,12 +29,18 @@ def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
|
|||
"""
|
||||
Create a new MongoDB client and yield it.
|
||||
"""
|
||||
log.debug("Opening connection to MongoDB...")
|
||||
client = pymongo.MongoClient(
|
||||
host=MONGO_HOST.__resolved__,
|
||||
port=MONGO_PORT.__resolved__,
|
||||
host=MONGO_HOST.__wrapped__,
|
||||
port=MONGO_PORT.__wrapped__,
|
||||
)
|
||||
log.info("Opened connection to MongoDB: %s", client)
|
||||
|
||||
yield client
|
||||
|
||||
log.info("Closing connection to MongoDB: %s", client)
|
||||
client.close()
|
||||
log.debug("Closed connection to MongoDB!")
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
|
@ -39,16 +49,39 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
|
|||
Create a new MongoDB client, access the ``reviews`` collection in the ``reviews`` database, and yield it.
|
||||
"""
|
||||
with mongo_client_from_config() as db:
|
||||
yield db.reviews.reviews
|
||||
log.debug("Accessing the reviews collection...")
|
||||
collection = db.reviews.reviews
|
||||
log.debug("Collection accessed successfully: %s", collection)
|
||||
yield collection
|
||||
|
||||
|
||||
def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list:
    """
    Create the aggregation pipeline stages selecting ``amount`` documents, according to ``SAMPLE_MODE``.

    With ``$sample``, a single ``$sample`` stage of size ``amount`` is returned.
    With ``$limit``, a ``$skip`` stage with a random offset followed by a ``$limit`` stage is returned.

    :param collection: The collection whose estimated document count bounds the random offset.
    :param amount: The number of documents the stages should select.
    :return: The list of pipeline stages.
    :raises ValueError: If ``SAMPLE_MODE`` is neither ``$sample`` nor ``$limit``.
    """
    mode = SAMPLE_MODE.__wrapped__

    if mode == "$sample":
        return [
            {"$sample": {"size": amount}},
        ]

    if mode == "$limit":
        log.warning("SAMPLE_MODE is $limit, sampling documents using $skip and $limit.")
        total = collection.estimated_document_count(maxTimeMS=100)
        # random.randint includes both bounds, so the offset is capped at ``total - amount``:
        # skipping further would leave fewer than ``amount`` documents, or none at all.
        # NOTE(review): the count covers the whole collection; when these stages follow a ``$match``
        # the offset can still overshoot the matched subset - confirm whether that is acceptable.
        skip = random.randint(0, max(0, total - amount))
        return [
            {"$skip": skip},
            {"$limit": amount},
        ]

    # Report the resolved value rather than the configuration proxy object.
    raise ValueError("Unknown sample mode", mode)
|
||||
|
||||
|
||||
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
    """
    Select ``amount`` reviews from the ``reviews`` collection using the configured sampling stages.
    """
    log.debug("Getting a sample of %d reviews...", amount)

    stages = list(pipeline_sample(reviews, amount))
    return reviews.aggregate(stages)
|
||||
|
||||
|
||||
|
@ -56,10 +89,11 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
|
|||
"""
|
||||
Get ``amount`` random reviews with ``rating`` stars from the ``reviews`` collection.
|
||||
"""
|
||||
log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
|
||||
|
||||
return reviews.aggregate([
|
||||
{"$match": {"overall": rating}},
|
||||
{"$sample": {"size": amount}},
|
||||
*pipeline_sample(reviews, amount),
|
||||
])
|
||||
|
||||
|
||||
|
@ -67,6 +101,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
|
|||
"""
|
||||
Get ``amount`` random reviews with either a 5-star or 1-star rating from the ``reviews`` collection.
|
||||
"""
|
||||
log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
|
||||
|
||||
return reviews.aggregate([
|
||||
{"$match":
|
||||
|
@ -77,14 +112,18 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
|
|||
]
|
||||
},
|
||||
},
|
||||
{"$sample": {"size": amount}},
|
||||
*pipeline_sample(reviews, amount),
|
||||
])
|
||||
|
||||
|
||||
def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
|
||||
def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
|
||||
"""
|
||||
Get the subset of reviews that should act as training set.
|
||||
"""
|
||||
log.info("Building training set...")
|
||||
|
||||
# Get the amount from the config
|
||||
amount: int = TRAINING_SET_SIZE.__wrapped__
|
||||
|
||||
# Handle odd numbers
|
||||
positive_amount: int = amount // 2
|
||||
|
@ -100,9 +139,25 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection, amount: int
|
|||
return both
|
||||
|
||||
|
||||
def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
    """
    Fetch the reviews acting as test set; their number is read from ``TEST_SET_SIZE``.
    """
    log.info("Building test set...")

    return sample_reviews_by_rating_polar(reviews, TEST_SET_SIZE.__wrapped__)
|
||||
|
||||
|
||||
__all__ = (
|
||||
"Review",
|
||||
"mongo_client_from_config",
|
||||
"mongo_reviews_collection_from_config",
|
||||
"sample_reviews",
|
||||
"sample_reviews_by_rating",
|
||||
"sample_reviews_by_rating_polar",
|
||||
"get_reviews_training_set",
|
||||
"get_reviews_test_set",
|
||||
)
|
||||
|
|
35
unimore_bda_6/log.py
Normal file
35
unimore_bda_6/log.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
import logging
|
||||
import coloredlogs
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def install_log_handler(logger: logging.Logger | None = None) -> None:
    """
    Install the coloured log handler on the given logger.

    :param logger: The logger to install the handler on; if :data:`None`, the ``unimore_bda_6`` logger is used.
    """
    if logger is None:
        logger = logging.getLogger("unimore_bda_6")

    coloredlogs.install(
        logger=logger,
        level="DEBUG",
        fmt="{asctime} | {name:<32} | {levelname:>8} | {message}",
        style="{",
        level_styles=dict(
            debug=dict(color="white"),
            info=dict(color="cyan"),
            warning=dict(color="yellow"),
            error=dict(color="red"),
            critical=dict(color="red", bold=True),
        ),
        field_styles=dict(
            asctime=dict(color='magenta'),
            levelname=dict(color='blue', bold=True),
            name=dict(color='blue'),
        ),
        # NOTE(review): passing True presumably bypasses terminal auto-detection - confirm this is
        # wanted when the output is redirected to a file or pipe.
        isatty=True,
    )

    log.info("Installed custom log handler!")
|
||||
|
||||
|
||||
__all__ = (
|
||||
"install_log_handler",
|
||||
)
|
Loading…
Reference in a new issue