diff --git a/.gitignore b/.gitignore
index f11c9ac..8467da3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@
data/raw/
data/db/
+data/nltk/
##################
# Python ignores #
diff --git a/.idea/dictionaries/steffo.xml b/.idea/dictionaries/steffo.xml
new file mode 100644
index 0000000..56712aa
--- /dev/null
+++ b/.idea/dictionaries/steffo.xml
@@ -0,0 +1,3 @@
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/unimore_bda_6.xml b/.idea/runConfigurations/unimore_bda_6.xml
new file mode 100644
index 0000000..22a226f
--- /dev/null
+++ b/.idea/runConfigurations/unimore_bda_6.xml
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..e492b02
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,19 @@
+{
+ // Use IntelliSense to learn about possible attributes.
+ // Hover to view descriptions of existing attributes.
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Python: unimore_bda_6",
+ "type": "python",
+ "request": "launch",
+ "module": "unimore_bda_6",
+ "justMyCode": true,
+ "env": {
+ "NLTK_DATA": "./data/nltk",
+ },
+ "cwd": "${workspaceFolder}",
+ }
+ ]
+}
\ No newline at end of file
diff --git a/data/scripts/download-nltk.sh b/data/scripts/download-nltk.sh
new file mode 100755
index 0000000..3ea2b95
--- /dev/null
+++ b/data/scripts/download-nltk.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+repo=$(git rev-parse --show-toplevel)
+export NLTK_DATA="$repo/data/nltk"
+"$repo/.venv/bin/python" -m nltk.downloader popular
diff --git a/data/scripts/index-db.mongodb b/data/scripts/index-db.mongodb
new file mode 100644
index 0000000..7517ff9
--- /dev/null
+++ b/data/scripts/index-db.mongodb
@@ -0,0 +1,8 @@
+db.reviews.createIndex(
+ {
+ overall: 1,
+ },
+ {
+ name: "rating_index"
+ }
+)
diff --git a/unimore-bda-6.iml b/unimore-bda-6.iml
index 7616de1..1312514 100644
--- a/unimore-bda-6.iml
+++ b/unimore-bda-6.iml
@@ -6,6 +6,7 @@
+
diff --git a/unimore_bda_6/__init__.py b/unimore_bda_6/__init__.py
index 8c571c4..e69de29 100644
--- a/unimore_bda_6/__init__.py
+++ b/unimore_bda_6/__init__.py
@@ -1,5 +0,0 @@
-# If you are building a **library**, use this file to export objects!
-
-__all__ = (
- # "",
-)
diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index eb4e616..73f03b9 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -1,12 +1,23 @@
from .config import config
-from .analysis.vanilla import create_model_vanilla, train_model_vanilla, evaluate_model_vanilla
+from .database import mongo_reviews_collection_from_config, get_training_reviews, get_test_reviews
+from .analysis.vanilla import VanillaReviewSA
from .log import install_log_handler
def main():
- model = create_model_vanilla()
- train_model_vanilla(model)
- evaluate_model_vanilla(model)
+ with mongo_reviews_collection_from_config() as reviews:
+ training_reviews = get_training_reviews(collection=reviews)
+ test_reviews = get_test_reviews(collection=reviews)
+
+ model = VanillaReviewSA()
+ model.train(training_reviews)
+
+ evaluation = model.evaluate(test_reviews)
+ print(evaluation)
+
+ while True:
+ classification = model.use(input())
+ print(classification)
if __name__ == "__main__":
diff --git a/unimore_bda_6/analysis/base.py b/unimore_bda_6/analysis/base.py
new file mode 100644
index 0000000..736cc28
--- /dev/null
+++ b/unimore_bda_6/analysis/base.py
@@ -0,0 +1,54 @@
+import abc
+
+
+class BaseSA(metaclass=abc.ABCMeta):
+ """
+ Abstract base class for sentiment analyzers implemented in this project.
+ """
+
+ def __init__(self) -> None:
+ """
+ Create the empty shell of the sentiment analyzer.
+ """
+
+ self.trained = False
+ "Whether :meth:`train` has been called at least once, i.e. whether the analyzer is ready to be evaluated or used."
+
+ @abc.abstractmethod
+ def train(self, training_set) -> None:
+ """
+ Train the analyzer with the given training set.
+ """
+ raise NotImplementedError()
+
+ @abc.abstractmethod
+ def evaluate(self, test_set) -> None:
+ """
+ Evaluate the analyzer with the given test set.
+ """
+ raise NotImplementedError()
+
+ @abc.abstractmethod
+ def use(self, text: str) -> str:
+ """
+ Use the sentiment analyzer to classify the given text, returning the resulting label.
+ """
+ raise NotImplementedError()
+
+
+class AlreadyTrainedError(Exception):
+ """
+ This model has already been trained and cannot be trained again.
+ """
+
+class NotTrainedError(Exception):
+ """
+ This model has not been trained yet.
+ """
+
+
+__all__ = (
+ "BaseSA",
+ "AlreadyTrainedError",
+ "NotTrainedError",
+)
diff --git a/unimore_bda_6/analysis/vanilla.py b/unimore_bda_6/analysis/vanilla.py
index c190b78..249dedd 100644
--- a/unimore_bda_6/analysis/vanilla.py
+++ b/unimore_bda_6/analysis/vanilla.py
@@ -1,58 +1,118 @@
+import abc
import nltk
import nltk.classify
import nltk.sentiment
import nltk.sentiment.util
import logging
+import typing as t
-from ..database import mongo_reviews_collection_from_config, get_reviews_training_set, get_reviews_test_set
+from ..database import Review
+from .base import BaseSA, AlreadyTrainedError, NotTrainedError
log = logging.getLogger(__name__)
-def create_model_vanilla() -> nltk.sentiment.SentimentAnalyzer:
- log.debug("Creating model...")
- model = nltk.sentiment.SentimentAnalyzer()
- log.debug("Created model %s!", model)
- return model
+class VanillaSA(BaseSA, metaclass=abc.ABCMeta):
+ """
+ A sentiment analyzer resembling in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
+ """
+
+ def __init__(self, language="english") -> None:
+ super().__init__()
+ self.language: str = language
+ self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
+
+ def _tokenize_text(self, text: str) -> list[str]:
+ """
+ Convert a text string into a list of tokens, using the language of the model, and mark negations on the resulting tokens.
+ """
+ tokens = nltk.word_tokenize(text, language=self.language)
+ nltk.sentiment.util.mark_negation(tokens, shallow=True)
+ return tokens
+
+ def __add_feature_unigrams(self, training_set: list[tuple[list[str], str]]) -> None:
+ """
+ Add the `nltk.sentiment.util.extract_unigram_feats` feature to the model.
+ """
+ all_words = self.model.all_words(training_set, labeled=True)
+ unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
+ self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
+
+ def _featurize_documents(self, documents: list[tuple[list[str], str]]):
+ """
+ Apply the features added to the model to the given list of labeled documents.
+ """
+ return self.model.apply_features(documents, labeled=True)
+
+ def _train_with_set(self, training_set: list[tuple[list[str], str]]) -> None:
+ """
+ Train the model with the given **pre-classified and pre-tokenized** training set.
+ """
+ if self.trained:
+ raise AlreadyTrainedError()
+
+ self.__add_feature_unigrams(training_set)
+ training_set_with_features = self._featurize_documents(training_set)
+
+ self.model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set_with_features)
+ self.trained = True
+
+ def _evaluate_with_set(self, test_set: list[tuple[list[str], str]]) -> dict:
+ if not self.trained:
+ raise NotTrainedError()
+
+ test_set_with_features = self._featurize_documents(test_set)
+ return self.model.evaluate(test_set_with_features)
+
+ def _use_with_tokens(self, tokens: list[str]) -> str:
+ if not self.trained:
+ raise NotTrainedError()
+
+ return self.model.classify(instance=tokens)
-def train_model_vanilla(model: nltk.sentiment.SentimentAnalyzer) -> None:
- # TODO: This doesn't work yet
+class VanillaReviewSA(VanillaSA):
+ """
+ A `VanillaSA` to be used with `Review`s.
+ """
- with mongo_reviews_collection_from_config() as reviews:
- training_set = get_reviews_training_set(reviews)
+ @staticmethod
+ def _rating_to_label(rating: float) -> str:
+ """
+ Return the label corresponding to the given rating.
- log.debug("Marking negations...")
- training_negated_set = list(map(nltk.sentiment.util.mark_negation, training_set))
+ Possible categories are:
+ * negative (0.0 <= rating < 2.5)
+ * mixed (2.5 <= rating <= 3.5)
+ * positive (3.5 < rating <= 5.0)
+ """
+ if rating < 2.5:
+ return "negative"
+ elif rating <= 3.5:
+ return "mixed"
+ else:
+ return "positive"
- log.debug("Extracting tokens...")
- training_tokens = model.all_words(training_negated_set, labeled=False)
+ def _review_to_data_set(self, review: Review) -> tuple[list[str], str]:
+ """
+ Convert a review to an NLTK-compatible `(tokens, label)` pair.
+ """
+ return self._tokenize_text(text=review["reviewText"]), self._rating_to_label(rating=review["overall"])
+
+ def train(self, reviews: t.Iterable[Review]) -> None:
+ data_set = list(map(self._review_to_data_set, reviews))
+ self._train_with_set(data_set)
- log.debug("Counting unigrams...")
- training_unigrams = model.unigram_word_feats(words=training_tokens, min_freq=4)
+ def evaluate(self, reviews: t.Iterable[Review]):
+ data_set = list(map(self._review_to_data_set, reviews))
+ return self._evaluate_with_set(data_set)
- log.debug("Configuring model features...")
- model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=training_unigrams)
- training_set = model.apply_features(documents=training_set)
-
- log.info("Training model...")
- model.train(trainer=nltk.classify.NaiveBayesClassifier.train, training_set=training_set)
-
-
-def evaluate_model_vanilla(model: nltk.sentiment.SentimentAnalyzer):
- with mongo_reviews_collection_from_config() as reviews:
- test_set = get_reviews_test_set(reviews)
-
- log.info("Evaluating model...")
- model.evaluate(test_set)
-
- # TODO
- breakpoint()
+ def use(self, text: str) -> str:
+ return self._use_with_tokens(self._tokenize_text(text))
__all__ = (
- "create_model_vanilla",
- "train_model_vanilla",
- "evaluate_model_vanilla",
+ "VanillaSA",
+ "VanillaReviewSA",
)
diff --git a/unimore_bda_6/config.py b/unimore_bda_6/config.py
index 758b973..9173139 100644
--- a/unimore_bda_6/config.py
+++ b/unimore_bda_6/config.py
@@ -7,6 +7,8 @@ config = cfig.Configuration()
def MONGO_HOST(val: str | None) -> str:
"""
The hostname of the MongoDB database to connect to.
+
+ Defaults to `"127.0.0.1"`.
"""
return val or "127.0.0.1"
@@ -15,6 +17,8 @@ def MONGO_HOST(val: str | None) -> str:
def MONGO_PORT(val: str | None) -> int:
"""
The port of the MongoDB database to connect to.
+
+ Defaults to `27017`.
"""
if val is None:
return 27017
@@ -24,23 +28,12 @@ def MONGO_PORT(val: str | None) -> int:
raise cfig.InvalidValueError("Not an int.")
-@config.optional()
-def SAMPLE_MODE(val: str | None) -> str:
- """
- Whether `$sample` or `$limit` should be used to aggregate the training and test sets.
- `$limit` is much faster, but not truly random, while `$sample` is completely random.
- """
- if val is None:
- return "$sample"
- if val not in ["$sample", "$limit"]:
- raise cfig.InvalidValueError("Neither $sample or $limit.")
- return val
-
-
@config.optional()
def TRAINING_SET_SIZE(val: str | None) -> int:
"""
The number of reviews from each category to fetch for the training set.
+
+ Defaults to `1000`.
"""
if val is None:
return 1000
@@ -54,6 +47,8 @@ def TRAINING_SET_SIZE(val: str | None) -> int:
def TEST_SET_SIZE(val: str | None) -> int:
"""
The number of reviews to fetch for the test set.
+
+ Defaults to `1000`.
"""
if val is None:
return 1000
@@ -67,7 +62,11 @@ __all__ = (
"config",
"MONGO_HOST",
"MONGO_PORT",
- "SAMPLE_MODE",
"TRAINING_SET_SIZE",
"TEST_SET_SIZE",
+ "NLTK_DOUBLE_NEG_SWITCH",
)
+
+
+if __name__ == "__main__":
+ config.cli()
diff --git a/unimore_bda_6/database.py b/unimore_bda_6/database.py
index c152d24..29090da 100644
--- a/unimore_bda_6/database.py
+++ b/unimore_bda_6/database.py
@@ -4,9 +4,8 @@ import pymongo.collection
import contextlib
import bson
import logging
-import random
-from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE, SAMPLE_MODE
+from .config import MONGO_HOST, MONGO_PORT, TRAINING_SET_SIZE, TEST_SET_SIZE
log = logging.getLogger(__name__)
@@ -55,25 +54,6 @@ def mongo_reviews_collection_from_config() -> pymongo.collection.Collection[Revi
yield collection
-def pipeline_sample(collection: pymongo.collection.Collection, amount: int) -> list:
- """
- Create pipeline stages for sampling random documents, either with true randomness or by skipping a random amount of them.
- """
- if SAMPLE_MODE.__wrapped__ == "$sample":
- return [
- {"$sample": {"size": amount}},
- ]
- elif SAMPLE_MODE.__wrapped__ == "$limit":
- log.warning("USE_SAMPLE is disabled, sampling documents using $skip and $limit.")
- skip = random.randint(0, collection.estimated_document_count(maxTimeMS=100))
- return [
- {"$skip": skip},
- {"$limit": amount},
- ]
- else:
- raise ValueError("Unknown sample mode", SAMPLE_MODE)
-
-
def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Iterable[Review]:
"""
Get ``amount`` random reviews from the ``reviews`` collection.
@@ -81,7 +61,8 @@ def sample_reviews(reviews: pymongo.collection.Collection, amount: int) -> t.Ite
log.debug("Getting a sample of %d reviews...", amount)
return reviews.aggregate([
- *pipeline_sample(reviews, amount),
+ {"$limit": 10000}, # TODO: remove this cap; it restricts sampling to the first 10000 documents entering the pipeline
+ {"$sample": {"size": amount}},
])
@@ -92,8 +73,9 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
log.debug("Getting a sample of %d reviews with %d stars...", amount, rating)
return reviews.aggregate([
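+ {"$limit": 10000}, # TODO: remove this cap; it restricts matching and sampling to the first 10000 documents entering the pipeline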
{"$match": {"overall": rating}},
- *pipeline_sample(reviews, amount),
+ {"$sample": {"size": amount}},
])
@@ -104,6 +86,7 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
log.debug("Getting a sample of %d reviews with either 5 or 1 stars...", amount)
return reviews.aggregate([
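+ {"$limit": 10000}, # TODO: remove this cap; it restricts matching and sampling to the first 10000 documents entering the pipeline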
{"$match":
{"$or":
[
@@ -112,11 +95,11 @@ def sample_reviews_by_rating_polar(reviews: pymongo.collection.Collection, amoun
]
},
},
- *pipeline_sample(reviews, amount),
+ {"$sample": {"size": amount}},
])
-def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
+def get_training_reviews(collection: pymongo.collection.Collection) -> list[Review]:
"""
Get the subset of reviews that should act as training set.
"""
@@ -130,8 +113,8 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterab
negative_amount: int = amount - positive_amount
# Sample the required reviews
- positive = sample_reviews_by_rating(reviews, 5.0, positive_amount)
- negative = sample_reviews_by_rating(reviews, 1.0, negative_amount)
+ positive = sample_reviews_by_rating(collection, 5.0, positive_amount)
+ negative = sample_reviews_by_rating(collection, 1.0, negative_amount)
# Randomness here does not matter, so just merge the lists
both = [*positive, *negative]
@@ -139,7 +122,7 @@ def get_reviews_training_set(reviews: pymongo.collection.Collection) -> t.Iterab
return both
-def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[Review]:
+def get_test_reviews(collection: pymongo.collection.Collection) -> list[Review]:
"""
Get the subset of reviews that should act as test set.
"""
@@ -148,7 +131,7 @@ def get_reviews_test_set(reviews: pymongo.collection.Collection) -> t.Iterable[R
amount: int = TEST_SET_SIZE.__wrapped__
- return sample_reviews_by_rating_polar(reviews, amount)
+ return list(sample_reviews_by_rating_polar(collection, amount))
__all__ = (
@@ -158,6 +141,6 @@ __all__ = (
"sample_reviews",
"sample_reviews_by_rating",
"sample_reviews_by_rating_polar",
- "get_reviews_training_set",
- "get_reviews_test_set",
+ "get_training_reviews",
+ "get_test_reviews",
)