mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-21 23:44:19 +00:00
Working prototype
This commit is contained in:
parent
2f7237ebfa
commit
14d1e1a22f
13 changed files with 254 additions and 90 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -9,6 +9,7 @@
|
|||
|
||||
data/raw/
|
||||
data/db/
|
||||
data/nltk/
|
||||
|
||||
##################
|
||||
# Python ignores #
|
||||
|
|
3
.idea/dictionaries/steffo.xml
Normal file
3
.idea/dictionaries/steffo.xml
Normal file
|
@ -0,0 +1,3 @@
|
|||
<component name="ProjectDictionaryState">
|
||||
<dictionary name="steffo" />
|
||||
</component>
|
26
.idea/runConfigurations/unimore_bda_6.xml
Normal file
26
.idea/runConfigurations/unimore_bda_6.xml
Normal file
|
@ -0,0 +1,26 @@
|
|||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="unimore_bda_6" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
|
||||
<module name="unimore-bda-6" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="NLTK_DATA" value="./data/nltk" />
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
|
||||
<option name="SDK_NAME" value="Poetry (unimore-bda-6)" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
|
||||
<option name="IS_MODULE_SDK" value="false" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="unimore_bda_6" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="true" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
19
.vscode/launch.json
vendored
Normal file
19
.vscode/launch.json
vendored
Normal file
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Python: unimore_bda_6",
|
||||
"type": "python",
|
||||
"request": "launch",
|
||||
"module": "unimore_bda_6",
|
||||
"justMyCode": true,
|
||||
"env": {
|
||||
"NLTK_DATA": "./data/nltk",
|
||||
},
|
||||
"cwd": "${workspaceFolder}",
|
||||
}
|
||||
]
|
||||
}
|
4
data/scripts/download-nltk.sh
Executable file
4
data/scripts/download-nltk.sh
Executable file
|
@ -0,0 +1,4 @@
|
|||
#!/usr/bin/env bash
# Download the "popular" NLTK data collection into <repository root>/data/nltk,
# using the Python interpreter found in the repository's .venv directory.

# Stop at the first failing command or unset variable: without this, a failed
# `git rev-parse` (e.g. when run outside the repository) would leave $repo empty
# and the script would go on to use paths rooted at "/".
set -euo pipefail

# Resolve paths from the repository root so the script works from any subdirectory.
repo=$(git rev-parse --show-toplevel)

# Exported so that the nltk downloader started below picks up the download location.
export NLTK_DATA="$repo/data/nltk"

"$repo/.venv/bin/python" -m nltk.downloader popular
|
8
data/scripts/index-db.mongodb
Normal file
8
data/scripts/index-db.mongodb
Normal file
|
@ -0,0 +1,8 @@
|
|||
db.reviews.createIndex(
|
||||
{
|
||||
overall: 1,
|
||||
},
|
||||
{
|
||||
name: "rating_index"
|
||||
}
|
||||
)
|
|
@ -6,6 +6,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/unimore_bda_6" isTestSource="false" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/data/db" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/data/raw" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/data/nltk" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Poetry (unimore-bda-6)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
|
|
|
@ -1,5 +0,0 @@
|
|||
# If you are building a **library**, use this file to export objects!
|
||||
|
||||
__all__ = (
|
||||
# "",
|
||||
)
|
|
@ -1,12 +1,23 @@
|
|||
from .config import config
|
||||
from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla
|
||||
from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
|
||||
from .analysis.vanilla import VanillaReviewSA
|
||||
from .log import install_log_handler
|
||||
|
||||
|
||||
def main():
|
||||
model = create_model_vanilla()
|
||||
train_model_vanilla(model)
|
||||
evaluate_model_vanilla(model)
|
||||
with mongo_reviews_collection_from_config() as reviews:
|
||||
training_reviews = get_training_reviews(collection=reviews)
|
||||
test_reviews = get_test_reviews(collection=reviews)
|
||||
|
||||
model = VanillaReviewSA()
|
||||
model.train(training_reviews)
|
||||
|
||||
evaluation = model.evaluate(test_reviews)
|
||||
print(evaluation)
|
||||
|
||||
while True:
|
||||
classification = model.use(input())
|
||||
print(classification)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
54
unimore_bda_6/analysis/base.py
Normal file
54
unimore_bda_6/analysis/base.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
import abc
|
||||
|
||||
|
||||
class BaseSA(metaclass=abc.ABCMeta):
    """
    Abstract base class for sentiment analyzers implemented in this project.

    Subclasses must implement :meth:`train`, :meth:`evaluate` and :meth:`use`.
    """

    def __init__(self) -> None:
        """
        Create the empty shell of the sentiment analyzer.
        """

        self.trained: bool = False
        "Whether :meth:`train` has been called at least once, making the analyzer ready to be evaluated or used."

    @abc.abstractmethod
    def train(self, training_set) -> None:
        """
        Train the analyzer with the given training set.

        :param training_set: The data to learn from; its concrete type is defined by the subclass.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def evaluate(self, test_set) -> dict:
        """
        Evaluate the analyzer with the given test set.

        :param test_set: The data to evaluate against; its concrete type is defined by the subclass.
        :return: The results of the evaluation.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def use(self, text: str) -> str:
        """
        Use the sentiment analyzer.

        :param text: The text to classify.
        :return: The label assigned to the text.
        """
        raise NotImplementedError()
|
||||
|
||||
|
||||
class AlreadyTrainedError(Exception):
    """
    Raised when training is requested on a model that was already trained; a model cannot be trained twice.
    """
|
||||
|
||||
class NotTrainedError(Exception):
    """
    Raised when a model is evaluated or used before it has been trained.
    """
|
||||
|
||||
|
||||
__all__ = (
|
||||
"BaseSA",
|
||||
"AlreadyTrainedError",
|
||||
"NotTrainedError",
|
||||
)
|
|
@ -1,58 +1,118 @@
|
|||
import abc
|
||||
import nltk
|
||||
import nltk.classify
|
||||
import nltk.sentiment
|
||||
import nltk.sentiment.util
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set
|
||||
from ..database import Review
|
||||
from .base import BaseSA, AlreadyTrainedError, NotTrainedError
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer:
|
||||
log.debug("Creating model...")
|
||||
model = nltk.sentiment.SentimentAnalyzer()
|
||||
log.debug("Created model %s!", model)
|
||||
return model
|
||||
class VanillaSA(BaseSA, metaclass=abc.ABCMeta):
    """
    A sentiment analyzer structured like the one implemented in the classroom, built on the basic sentiment analyzer of NLTK.
    """

    def __init__(self, language="english") -> None:
        super().__init__()
        # Language passed to the NLTK tokenizer.
        self.language: str = language
        # The NLTK sentiment analyzer that every other method of this class delegates to.
        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()

    def _tokenize_text(self, text: str) -> list[str]:
        """
        Split a text string into a list of tokens using the language of the model, then mark negations on them.
        """
        words = nltk.word_tokenize(text, language=self.language)
        # The return value is discarded: with ``shallow=True`` NLTK is expected to mark the list in place.
        nltk.sentiment.util.mark_negation(words, shallow=True)
        return words

    def __add_feature_unigrams(self, training_set: list[tuple[list[str], str]]) -> None:
        """
        Register the `nltk.sentiment.util.extract_unigram_feats` feature on the model, using the unigrams of the training set selected with ``min_freq=4``.
        """
        analyzer = self.model
        frequent_unigrams = analyzer.unigram_word_feats(
            words=analyzer.all_words(training_set, labeled=True),
            min_freq=4,
        )
        analyzer.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=frequent_unigrams)

    def _featurize_documents(self, documents: list[tuple[list[str], str]]):
        """
        Apply the features registered on the model to the given labeled documents.
        """
        featurized = self.model.apply_features(documents, labeled=True)
        return featurized

    def _train_with_set(self, training_set: list[tuple[list[str], str]]) -> None:
        """
        Train the model with the given training set of labeled, already tokenized documents.

        :raises AlreadyTrainedError: If the model has been trained before.
        """
        if self.trained:
            raise AlreadyTrainedError()

        # The feature extractor has to be registered before features can be applied to the documents.
        self.__add_feature_unigrams(training_set)
        featurized_training_set = self._featurize_documents(training_set)

        self.model.train(
            trainer=nltk.classify.NaiveBayesClassifier.train,
            training_set=featurized_training_set,
        )
        self.trained = True

    def _evaluate_with_set(self, test_set: list[tuple[list[str], str]]) -> dict:
        """
        Evaluate the trained model against the given test set of labeled, already tokenized documents.

        :raises NotTrainedError: If the model has not been trained yet.
        """
        if not self.trained:
            raise NotTrainedError()

        featurized_test_set = self._featurize_documents(test_set)
        return self.model.evaluate(featurized_test_set)

    def _use_with_tokens(self, tokens: list[str]) -> str:
        """
        Classify an already tokenized text with the trained model.

        :raises NotTrainedError: If the model has not been trained yet.
        """
        if not self.trained:
            raise NotTrainedError()

        label = self.model.classify(instance=tokens)
        return label
|
||||
|
||||
|
||||
def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None:
|
||||
# TODO: This doesn't work yet
|
||||
class VanillaReviewSA(VanillaSA):
|
||||
"""
|
||||
A `VanillaSA` to be used with `Review`s.
|
||||
"""
|
||||
|
||||
with mongo_reviews_collection_from_config() as reviews:
|
||||
training_set = get_reviews_training_set(reviews)
|
||||
@staticmethod
|
||||
def _rating_to_label(rating: float) -> str:
|
||||
"""
|
||||
Return the label corresponding to the given rating.
|
||||
|
||||
log.debug("Marking negations...")
|
||||
training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set))
|
||||
Possible categories are:
|
||||
* negative (0.0 <= rating < 2.5)
|
||||
* mixed (2.5 <= rating <= 3.5)
|
||||
* positive (3.5 < rating <= 5.0)
|
||||
"""
|
||||
if rating < 2.5:
|
||||
return "negative"
|
||||
elif rating <= 3.5:
|
||||
return "mixed"
|
||||
else:
|
||||
return "positive"
|
||||
|
||||
log.debug("Extracting tokens...")
|
||||
training_tokens = model.all_words(training_negated_set, labeled=False)
|
||||
def _review_to_data_set(self, review: Review) -> tuple[list[str], str]:
|
||||
"""
|
||||
Convert a review to an NLTK-compatible labeled document: a tuple of its tokens and its label.
|
||||
"""
|
||||
return self._tokenize_text(text=review["reviewText"]), self._rating_to_label(rating=review["overall"])
|
||||
|
||||
log.debug("Counting unigrams...")
|
||||
training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4)
|
||||
def train(self, reviews: t.Iterable[Review]) -> None:
|
||||
data_set = list(map(self._review_to_data_set, reviews))
|
||||
self._train_with_set(data_set)
|
||||
|
||||
log.debug("Configuring model features...")
|
||||
model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams)
|
||||
training_set = model.apply_features(documents=training_set)
|
||||
def evaluate(self, reviews: t.Iterable[Review]):
|
||||
data_set = list(map(self._review_to_data_set, reviews))
|
||||
return self._evaluate_with_set(data_set)
|
||||
|
||||
log.info("Training model...")
|
||||
model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set)
|
||||
|
||||
|
||||
def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer):
|
||||
with mongo_reviews_collection_from_config() as reviews:
|
||||
test_set = get_reviews_test_set(reviews)
|
||||
|
||||
log.info("Evaluating model...")
|
||||
model.evaluate(test_set)
|
||||
|
||||
# TODO
|
||||
breakpoint()
|
||||
def use(self, text: str) -> str:
|
||||
return self._use_with_tokens(self._tokenize_text(text))
|
||||
|
||||
|
||||
__all__ = (
|
||||
"create_model_vanilla",
|
||||
"train_model_vanilla",
|
||||
"evaluate_model_vanilla",
|
||||
"VanillaSA",
|
||||
"VanillaReviewSA",
|
||||
)
|
||||
|
|
|
@ -7,6 +7,8 @@ config = cfig.Configuration()
|
|||
def MONGO_HOST(val: str | None) -> str:
|
||||
"""
|
||||
The hostname of the MongoDB database to connect to.
|
||||
|
||||
Defaults to `"127.0.0.1"`.
|
||||
"""
|
||||
return val or "127.0.0.1"
|
||||
|
||||
|
@ -15,6 +17,8 @@ def MONGO_HOST(val: str | None) -> str:
|
|||
def MONGO_PORT(val: str | None) -> int:
|
||||
"""
|
||||
The port of the MongoDB database to connect to.
|
||||
|
||||
Defaults to `27017`.
|
||||
"""
|
||||
if val is None:
|
||||
return 27017
|
||||
|
@ -24,23 +28,12 @@ def MONGO_PORT(val: str | None) -> int:
|
|||
raise cfig.InvalidValueError("Not an int.")
|
||||
|
||||
|
||||
@config.optional()
|
||||
def SAMPLE_MODE(val: str | None) -> str:
|
||||
"""
|
||||
Whether `$sample` or `$limit` should be used to aggregate the training and test sets.
|
||||
`$limit` is much faster, but not truly random, while `$sample` is completely random.
|
||||
"""
|
||||
if val is None:
|
||||
return "$sample"
|
||||
if val not in ["$sample", "$limit"]:
|
||||
raise cfig.InvalidValueError("Neither $sample or $limit.")
|
||||
return val
|
||||
|
||||
|
||||
@config.optional()
|
||||
def TRAINING_SET_SIZE(val: str | None) -> int:
|
||||
"""
|
||||
The number of reviews from each category to fetch for the training set.
|
||||
|
||||
Defaults to `1000`.
|
||||
"""
|
||||
if val is None:
|
||||
return 1000
|
||||
|
@ -54,6 +47,8 @@ def TRAINING_SET_SIZE(val: str | None) -> int:
|
|||
def TEST_SET_SIZE(val: str | None) -> int:
|
||||
"""
|
||||
The number of reviews to fetch for the test set.
|
||||
|
||||
Defaults to `1000`.
|
||||
"""
|
||||
if val is None:
|
||||
return 1000
|
||||
|
@ -67,7 +62,11 @@ __all__ = (
|
|||
"config",
|
||||
"MONGO_HOST",
|
||||
"MONGO_PORT",
|
||||
"SAMPLE_MODE",
|
||||
"TRAINING_SET_SIZE",
|
||||
"TEST_SET_SIZE",
|
||||
"NLTK_DOUBLE_NEG_SWITCH",
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
config.cli()
|
||||
|
|
|
@ -4,9 +4,8 @@ import pymongo.collection
|
|||
import contextlib
|
||||
import bson
|
||||
import logging
|
||||
import random
|
||||
|
||||
from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE
|
||||
from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -55,25 +54,6 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
|
|||
yield collection
|
||||
|
||||
|
||||
def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list:
|
||||
"""
|
||||
Create pipeline stages for sampling random documents, either with true randomness or by skipping a random amount of them.
|
||||
"""
|
||||
if SAMPLE_MODE.__wrapped__ == "$sample":
|
||||
return [
|
||||
{"$sample": {"size": amount}},
|
||||
]
|
||||
elif SAMPLE_MODE.__wrapped__ == "$limit":
|
||||
log.warning("USE_SAMPLE is disabled, sampling documents using $skip and $limit.")
|
||||
skip = random.randint(0, collection.estimated_document_count(maxTimeMS=100))
|
||||
return [
|
||||
{"$skip": skip},
|
||||
{"$limit": amount},
|
||||
]
|
||||
else:
|
||||
raise ValueError("Unknown sample mode", SAMPLE_MODE)
|
||||
|
||||
|
||||
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
|
||||
"""
|
||||
Get ``amount`` random reviews from the ``reviews`` collection.
|
||||
|
@ -81,7 +61,8 @@ def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Ite
|
|||
log.debug("Getting a sample of %d reviews...", amount)
|
||||
|
||||
return reviews.aggregate([
|
||||
*pipeline_sample(reviews, amount),
|
||||
{"$limit": 10000}, # TODO
|
||||
{"$sample": {"size": amount}},
|
||||
])
|
||||
|
||||
|
||||
|
@ -92,8 +73,9 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
|
|||
log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
|
||||
|
||||
return reviews.aggregate([
|
||||
{"$limit": 10000}, # TODO
|
||||
{"$match": {"overall": rating}},
|
||||
*pipeline_sample(reviews, amount),
|
||||
{"$sample": {"size": amount}},
|
||||
])
|
||||
|
||||
|
||||
|
@ -104,6 +86,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
|
|||
log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
|
||||
|
||||
return reviews.aggregate([
|
||||
{"$limit": 10000}, # TODO
|
||||
{"$match":
|
||||
{"$or":
|
||||
[
|
||||
|
@ -112,11 +95,11 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
|
|||
]
|
||||
},
|
||||
},
|
||||
*pipeline_sample(reviews, amount),
|
||||
{"$sample": {"size": amount}},
|
||||
])
|
||||
|
||||
|
||||
def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
|
||||
def get_training_reviews(collection: pymongo.collection.Collection) -> list[Review]:
|
||||
"""
|
||||
Get the subset of reviews that should act as training set.
|
||||
"""
|
||||
|
@ -130,8 +113,8 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterab
|
|||
negative_amount: int = amount - positive_amount
|
||||
|
||||
# Sample the required reviews
|
||||
positive = sample_reviews_by_rating(reviews, 5.0, positive_amount)
|
||||
negative = sample_reviews_by_rating(reviews, 1.0, negative_amount)
|
||||
positive = sample_reviews_by_rating(collection, 5.0, positive_amount)
|
||||
negative = sample_reviews_by_rating(collection, 1.0, negative_amount)
|
||||
|
||||
# Randomness here does not matter, so just merge the lists
|
||||
both = [*positive, *negative]
|
||||
|
@ -139,7 +122,7 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterab
|
|||
return both
|
||||
|
||||
|
||||
def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
|
||||
def get_test_reviews(collection: pymongo.collection.Collection) -> list[Review]:
|
||||
"""
|
||||
Get the subset of reviews that should act as test set.
|
||||
"""
|
||||
|
@ -148,7 +131,7 @@ def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[R
|
|||
|
||||
amount: int = TEST_SET_SIZE.__wrapped__
|
||||
|
||||
return sample_reviews_by_rating_polar(reviews, amount)
|
||||
return list(sample_reviews_by_rating_polar(collection, amount))
|
||||
|
||||
|
||||
__all__ = (
|
||||
|
@ -158,6 +141,6 @@ __all__ = (
|
|||
"sample_reviews",
|
||||
"sample_reviews_by_rating",
|
||||
"sample_reviews_by_rating_polar",
|
||||
"get_reviews_training_set",
|
||||
"get_reviews_test_set",
|
||||
"get_training_reviews",
|
||||
"get_test_reviews",
|
||||
)
|
||||
|
|
Loading…
Reference in a new issue