New version working nicely
parent 379cbdd13a · commit 6ef81c1c19
15 changed files with 371 additions and 316 deletions
@@ -1,12 +1,44 @@
 <component name="InspectionProjectProfileManager">
   <profile version="1.0">
     <option name="myName" value="Project Default" />
+    <inspection_tool class="HttpUrlsUsage" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredUrls">
+        <list>
+          <option value="http://localhost" />
+          <option value="http://127.0.0.1" />
+          <option value="http://0.0.0.0" />
+          <option value="http://www.w3.org/" />
+          <option value="http://json-schema.org/draft" />
+          <option value="http://java.sun.com/" />
+          <option value="http://xmlns.jcp.org/" />
+          <option value="http://javafx.com/javafx/" />
+          <option value="http://javafx.com/fxml" />
+          <option value="http://maven.apache.org/xsd/" />
+          <option value="http://maven.apache.org/POM/" />
+          <option value="http://www.springframework.org/schema/" />
+          <option value="http://www.springframework.org/tags" />
+          <option value="http://www.springframework.org/security/tags" />
+          <option value="http://www.thymeleaf.org" />
+          <option value="http://www.jboss.org/j2ee/schema/" />
+          <option value="http://www.jboss.com/xml/ns/" />
+          <option value="http://www.ibm.com/webservices/xsd" />
+          <option value="http://activemq.apache.org/schema/" />
+          <option value="http://schema.cloudfoundry.org/spring/" />
+          <option value="http://schemas.xmlsoap.org/" />
+          <option value="http://cxf.apache.org/schemas/" />
+          <option value="http://primefaces.org/ui" />
+          <option value="http://tiles.apache.org/" />
+          <option value="http://sentiment.christopherpotts.net" />
+        </list>
+      </option>
+    </inspection_tool>
     <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
       <option name="ignoredErrors">
         <list>
           <option value="E124" />
           <option value="E501" />
           <option value="E221" />
+          <option value="E203" />
         </list>
       </option>
     </inspection_tool>
@@ -76,11 +76,11 @@ $ mongosh < ./data/scripts/index-db.js

 <!-- TODO -->

-## `base`: Building the scaffolding needed for the comparison
+## `.analysis.base`: Building the scaffolding needed for the comparison

 <!-- TODO -->

-## `vanilla`: Reconstruction and optimization of the `nltk.sentiment`-based model built in class
+## `.analysis.nltk_sentiment`: Reconstruction and optimization of the `nltk.sentiment`-based model built in class

 To have a baseline model to compare against, a model based on `nltk.sentiment`, inspired by the one built in class, was reconstructed.
@@ -1,91 +1,38 @@
 import logging

 from .config import config, DATA_SET_SIZE
-from .database import Review, mongo_reviews_collection_from_config, dataset_polar, dataset_varied
-from .analysis.vanilla import VanillaSA
-from .tokenization import all_tokenizers
+from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
+from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
+from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
 from .log import install_log_handler

 log = logging.getLogger(__name__)


-def review_vanilla_extractor(review: Review) -> tuple[str, float]:
-    """
-    Extract review text and rating from a `Review`.
-    """
-    return review["reviewText"], review["overall"]
-
-
-def polar_categorizer(rating: float) -> str:
-    """
-    Return the polar label corresponding to the given rating.
-
-    Possible categories are:
-
-    * negative (1.0, 2.0)
-    * positive (3.0, 4.0, 5.0)
-    * unknown (everything else)
-    """
-    match rating:
-        case 1.0 | 2.0:
-            return "negative"
-        case 3.0 | 4.0 | 5.0:
-            return "positive"
-        case _:
-            return "unknown"
-
-
-def varied_categorizer(rating: float) -> str:
-    """
-    Return the "stars" label corresponding to the given rating.
-
-    Possible categories are:
-
-    * terrible (1.0)
-    * negative (2.0)
-    * mixed (3.0)
-    * positive (4.0)
-    * great (5.0)
-    * unknown (everything else)
-    """
-    match rating:
-        case 1.0:
-            return "terrible"
-        case 2.0:
-            return "negative"
-        case 3.0:
-            return "mixed"
-        case 4.0:
-            return "positive"
-        case 5.0:
-            return "great"
-        case _:
-            return "unknown"
-
-
 def main():
-    for dataset_func, categorizer in [
-        (dataset_polar, polar_categorizer),
-        (dataset_varied, varied_categorizer),
-    ]:
-        for tokenizer in all_tokenizers:
-
-            with mongo_reviews_collection_from_config() as reviews:
-                reviews_training = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-                reviews_evaluation = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
-
-                model = VanillaSA(extractor=review_vanilla_extractor, tokenizer=tokenizer, categorizer=categorizer)
-                log.info("Training model %s", model)
-                model.train(reviews_training)
-                log.info("Evaluating model %s", model)
-                evaluation = model.evaluate(reviews_evaluation)
-                log.info("Results of model %s: %s", tokenizer, evaluation)
-
-                try:
-                    print("Model %s" % model)
-                    while inp := input():
-                        print(model.use(inp))
-                except KeyboardInterrupt:
-                    pass
+    for dataset_func in [polar_dataset, varied_dataset]:
+        for SentimentAnalyzer in [NLTKSentimentAnalyzer]:
+            for Tokenizer in [NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation]:
+                tokenizer = Tokenizer()
+                model = SentimentAnalyzer(tokenizer=tokenizer)

+                with mongo_reviews_collection_from_config() as reviews:
+                    reviews_training = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+                    reviews_evaluation = dataset_func(collection=reviews, amount=DATA_SET_SIZE.__wrapped__)
+
+                    log.info("Training model %s", model)
+                    model.train(reviews_training)
+                    log.info("Evaluating model %s", model)
+                    correct, evaluated = model.evaluate(reviews_evaluation)
+                    log.info("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
+
+                    # try:
+                    #     print("Manual testing for %s" % model)
+                    #     print("Input an empty string to continue to the next model.")
+                    #     while inp := input():
+                    #         print(model.use(inp))
+                    # except KeyboardInterrupt:
+                    #     pass


 if __name__ == "__main__":
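The rewritten `main` above only logs accuracy figures; the interactive check is kept as commented-out code. A minimal sketch of how that manual-testing loop could be re-enabled for a trained model (illustrative only, not part of this commit):

```python
# Illustrative helper, not part of the commit: the commented-out manual-testing
# loop restored as a function. `model` is any trained analyzer exposing .use().
def manual_test(model) -> None:
    try:
        print("Manual testing for %s" % model)
        print("Input an empty string to continue to the next model.")
        while inp := input():
            print(model.use(inp))
    except KeyboardInterrupt:
        pass
```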
@@ -1,6 +1,8 @@
-from .vanilla import BaseSA
+from .base import BaseSentimentAnalyzer
+from .nltk_sentiment import NLTKSentimentAnalyzer


 __all__ = (
-    "BaseSA",
+    "BaseSentimentAnalyzer",
+    "NLTKSentimentAnalyzer",
 )
@@ -1,47 +1,49 @@
 import abc
-import typing as t
+import logging

+from ..database import DataSet, Text, Category
+
+log = logging.getLogger(__name__)
+

-Input = t.TypeVar("Input")
-Category = t.TypeVar("Category")
-
-
-class BaseSA(metaclass=abc.ABCMeta):
+class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
     """
     Abstract base class for sentiment analyzers implemented in this project.
     """

     @abc.abstractmethod
-    def train(self, training_set: t.Iterable[tuple[Input, Category]]) -> None:
+    def train(self, training_set: DataSet) -> None:
         """
-        Train the analyzer with the given training set.
+        Train the analyzer with the given training dataset.
         """
         raise NotImplementedError()

+    def evaluate(self, test_set: DataSet) -> tuple[int, int]:
+        """
+        Perform a model evaluation by calling repeatedly `.use` on every text of the test dataset and by comparing its resulting category with the expected category.
+
+        Returns a tuple with the number of correct results and the number of evaluated results.
+        """
+        evaluated: int = 0
+        correct: int = 0
+
+        for text, expected_category in test_set:
+            resulting_category = self.use(text)
+            evaluated += 1
+            correct += 1 if resulting_category == expected_category else 0
+            if not evaluated % 100:
+                log.debug("%d evaluated, %d correct, %0.2d %% accuracy", evaluated, correct, correct / evaluated * 100)
+
+        return correct, evaluated
+
     @abc.abstractmethod
-    def use(self, text: Input) -> Category:
+    def use(self, text: Text) -> Category:
         """
-        Use the sentiment analyzer.
+        Run the model on the given input.
         """
         raise NotImplementedError()


-class AlreadyTrainedError(Exception):
-    """
-    This model has already been trained and cannot be trained again.
-    """
-
-
-class NotTrainedError(Exception):
-    """
-    This model has not been trained yet.
-    """
-
-
 __all__ = (
-    "Input",
-    "Category",
-    "BaseSA",
-    "AlreadyTrainedError",
-    "NotTrainedError",
+    "BaseSentimentAnalyzer",
 )
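`BaseSentimentAnalyzer` now fixes the interface to plain `(Text, Category)` pairs and ships a concrete `evaluate`. A minimal sketch of a subclass, assuming only the interface shown above; the keyword-counting logic is purely illustrative and not part of the project:

```python
# Hypothetical subclass of BaseSentimentAnalyzer, for illustration only.
from unimore_bda_6.analysis.base import BaseSentimentAnalyzer
from unimore_bda_6.database import DataSet, Text, Category


class KeywordSentimentAnalyzer(BaseSentimentAnalyzer):
    """Toy analyzer: remembers which words it saw in each category."""

    def __init__(self) -> None:
        self.words_by_category: dict[Category, set[str]] = {}

    def train(self, training_set: DataSet) -> None:
        for text, category in training_set:
            self.words_by_category.setdefault(category, set()).update(text.lower().split())

    def use(self, text: Text) -> Category:
        words = set(text.lower().split())
        # Pick the category whose vocabulary overlaps the input the most.
        return max(self.words_by_category, key=lambda c: len(words & self.words_by_category[c]))

# evaluate() is inherited and returns a (correct, evaluated) tuple.
```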
unimore_bda_6/analysis/nltk_sentiment.py (new file, +123)
@@ -0,0 +1,123 @@
+import nltk
+import nltk.classify
+import nltk.sentiment
+import nltk.sentiment.util
+import logging
+import typing as t
+import itertools
+
+from ..database import Text, Category, DataTuple, DataSet
+from .base import BaseSentimentAnalyzer
+from ..log import count_passage
+from ..tokenizer import BaseTokenizer
+
+log = logging.getLogger(__name__)
+
+TokenBag = list[str]
+Features = dict[str, int]
+
+
+class AlreadyTrainedError(Exception):
+    """
+    This model has already been trained and cannot be trained again.
+    """
+
+
+class NotTrainedError(Exception):
+    """
+    This model has not been trained yet.
+    """
+
+
+class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
+    """
+    A sentiment analyzer resembling the one implemented in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
+    """
+
+    def __init__(self, *, tokenizer: BaseTokenizer) -> None:
+        super().__init__()
+        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
+        self.trained: bool = False
+        self.tokenizer: BaseTokenizer = tokenizer
+
+    def __repr__(self):
+        return f"<{self.__class__.__qualname__} {'trained' if self.trained else 'untrained'} tokenizer={self.tokenizer!r}>"
+
+    def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]:
+        """
+        Convert the `Text` of a `DataTuple` to a `TokenBag`.
+        """
+        count_passage(log, "tokenize_datatuple", 100)
+        return self.tokenizer.tokenize(datatuple[0]), datatuple[1]
+
+    def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
+        """
+        Register the `nltk.sentiment.util.extract_unigram_feats` feature extrator on the model.
+        """
+        # Ignore the category and only access the tokens
+        tokenbags = map(lambda d: d[0], dataset)
+        # Get all words in the documents
+        all_words = self.model.all_words(tokenbags, labeled=False)
+        # Create unigram `contains(*)` features from the previously gathered words
+        unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
+        # Add the feature extractor to the model
+        self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
+
+    def _add_feature_extractors(self, dataset: t.Iterator[tuple[TokenBag, Category]]):
+        """
+        Register new feature extractors on the `.model`.
+        """
+        # Add the unigrams feature
+        self._add_feature_unigrams(dataset)
+
+    def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
+        """
+        Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.
+
+        Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
+        """
+        count_passage(log, "extract_features", 100)
+        return self.model.extract_features(data[0]), data[1]
+
+    def train(self, dataset: DataSet) -> None:
+        # Forbid retraining the model
+        if self.trained:
+            raise AlreadyTrainedError()
+
+        # Tokenize the dataset
+        dataset: t.Iterator[tuple[TokenBag, Category]] = map(self.__tokenize_datatuple, dataset)
+
+        # Cleanly duplicate the dataset iterator
+        # Reduce average memory footprint, but not maximum
+        dataset_1, dataset_2 = itertools.tee(dataset, 2)
+        dataset_1: t.Iterator[tuple[TokenBag, Category]]
+        dataset_2: t.Iterator[tuple[TokenBag, Category]]
+
+        # Add the feature extractors to the model
+        self._add_feature_extractors(dataset_1)
+        del dataset_1  # Delete exausted iterator
+
+        # Extract features from the dataset
+        dataset_2: t.Iterator[tuple[Features, Category]] = map(self.__extract_features, dataset_2)
+
+        # Train the classifier with the extracted features and category
+        self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
+
+        # Toggle the trained flag
+        self.trained = True
+
+    def use(self, text: Text) -> Category:
+        # Require the model to be trained
+        if not self.trained:
+            raise NotTrainedError()
+
+        # Tokenize the input
+        tokens = self.tokenizer.tokenize(text)
+
+        # Run the classification method
+        return self.model.classify(instance=tokens)
+
+
+__all__ = (
+    "NLTKSentimentAnalyzer",
+)
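A hedged usage sketch of the new analyzer, mirroring the loop in `__main__.py` but with a small in-memory dataset instead of the MongoDB-backed samplers (the example reviews are invented):

```python
from unimore_bda_6.analysis.nltk_sentiment import NLTKSentimentAnalyzer
from unimore_bda_6.tokenizer import PottsTokenizer

# Invented (text, category) tuples standing in for polar_dataset(...).
training_set = [
    ("I loved this album, the sound is great", "positive"),
    ("Terrible pressing, awful noisy sound", "negative"),
]
test_set = [
    ("great songs, great sound", "positive"),
    ("awful, would not buy again", "negative"),
]

model = NLTKSentimentAnalyzer(tokenizer=PottsTokenizer())
model.train(training_set)                      # a second call raises AlreadyTrainedError
correct, evaluated = model.evaluate(test_set)  # evaluate() comes from BaseSentimentAnalyzer
print(correct, evaluated)
print(model.use("I love it"))                  # "positive" or "negative"
```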
@@ -1,126 +0,0 @@
-import nltk
-import nltk.classify
-import nltk.sentiment
-import nltk.sentiment.util
-import logging
-import typing as t
-import itertools
-
-from .base import Input, Category, BaseSA, AlreadyTrainedError, NotTrainedError
-from ..log import count_passage
-
-TokenBag = list[str]
-IntermediateValue = t.TypeVar("IntermediateValue")
-Features = dict[str, int]
-
-
-log = logging.getLogger(__name__)
-
-
-class VanillaSA(BaseSA):
-    """
-    A sentiment analyzer resembling the one implemented in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
-    """
-
-    def __init__(self, *, extractor: t.Callable[[Input], tuple[str, IntermediateValue]], tokenizer: t.Callable[[str], TokenBag], categorizer: t.Callable[[IntermediateValue], Category]) -> None:
-        super().__init__()
-        self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
-        self.trained: bool = False
-        self.extractor: t.Callable[[Input], tuple[str, IntermediateValue]] = extractor
-        self.tokenizer: t.Callable[[str], TokenBag] = tokenizer
-        self.categorizer: t.Callable[[IntermediateValue], Category] = categorizer
-
-    def __repr__(self):
-        return f"<{self.__class__.__qualname__} {'trained' if self.trained else 'untrained'} tokenizer={self.extractor!r} categorizer={self.categorizer!r}>"
-
-    @staticmethod
-    def __data_to_tokenbag(data: tuple[TokenBag, Category]) -> TokenBag:
-        """
-        Access the tokenbag of a data tuple.
-        """
-        return data[0]
-
-    def __add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
-        """
-        Register the `nltk.sentiment.util.extract_unigram_feats` feature extrator on the model.
-        """
-        tokenbags = map(self.__data_to_tokenbag, dataset)
-        all_words = self.model.all_words(tokenbags, labeled=False)
-        unigrams = self.model.unigram_word_feats(words=all_words, min_freq=4)
-        self.model.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigrams)
-
-    def _add_features(self, dataset: t.Iterator[tuple[TokenBag, Category]]):
-        """
-        Register new feature extractors on the `.model`.
-        """
-        self.__add_feature_unigrams(dataset)
-
-    def __extract_features(self, data: tuple[TokenBag, Category]) -> tuple[Features, Category]:
-        """
-        Convert a (TokenBag, Category) tuple to a (Features, Category) tuple.
-
-        Does not use `SentimentAnalyzer.apply_features` due to unexpected behaviour when using iterators.
-        """
-        count_passage("processed_features", 100)
-        return self.model.extract_features(data[0]), data[1]
-
-    def _train_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
-        """
-        Train the model with the given training set.
-        """
-        if self.trained:
-            raise AlreadyTrainedError()
-
-        dataset_1, dataset_2 = itertools.tee(dataset, 2)
-
-        self._add_features(dataset_1)
-        del dataset_1
-
-        dataset_2 = map(self.__extract_features, dataset_2)
-        self.model.classifier = nltk.classify.NaiveBayesClassifier.train(dataset_2)
-        self.trained = True
-
-    def _evaluate_from_dataset(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> dict:
-        """
-        Perform a model evaluation with the given test set.
-        """
-        if not self.trained:
-            raise NotTrainedError()
-
-        dataset_1 = map(self.__extract_features, dataset)
-        # FIXME: This won't work with streams :(
-        return self.model.evaluate(list(dataset_1))
-
-    def _use_from_tokenbag(self, tokens: TokenBag) -> Category:
-        """
-        Categorize the given token bag.
-        """
-        if not self.trained:
-            raise NotTrainedError()
-
-        return self.model.classify(instance=tokens)
-
-    def _extract_data(self, inp: Input) -> tuple[TokenBag, Category]:
-        count_passage("processed_data", 100)
-        text, value = self.extractor(inp)
-        return self.tokenizer(text), self.categorizer(value)
-
-    def _extract_dataset(self, inp: t.Iterator[Input]) -> list[tuple[TokenBag, Category]]:
-        return map(self._extract_data, inp)
-
-    def train(self, training_set: t.Iterator[Input]) -> None:
-        dataset = self._extract_dataset(training_set)
-        self._train_from_dataset(dataset)
-
-    def evaluate(self, test_set: t.Iterator[Input]) -> dict:
-        dataset = self._extract_dataset(test_set)
-        return self._evaluate_from_dataset(dataset)
-
-    def use(self, text: Input) -> Category:
-        tokens = self.tokenizer(text)
-        return self._use_from_tokenbag(tokens)
-
-
-__all__ = (
-    "VanillaSA",
-)
@@ -6,7 +6,7 @@ import bson
 import logging
 import itertools

-from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE, DATA_SET_SIZE
+from .config import MONGO_HOST, MONGO_PORT, WORKING_SET_SIZE

 log = logging.getLogger(__name__)


@@ -24,6 +24,12 @@ class Review(t.TypedDict):
     reviewTime: str


+Text = str
+Category = str
+DataTuple = tuple[Text, Category]
+DataSet = t.Iterable[DataTuple]
+
+
 @contextlib.contextmanager
 def mongo_client_from_config() -> t.ContextManager[pymongo.MongoClient]:
     """

@@ -80,7 +86,40 @@ def sample_reviews_by_rating(reviews: pymongo.collection.Collection, rating: flo
     ])


-def dataset_polar(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def review_to_datatuple(review: Review) -> tuple[Text, Category]:
+    """
+    Return the label corresponding to the given review.
+
+    Possible categories are:
+
+    * terrible (1.0)
+    * negative (2.0)
+    * mixed (3.0)
+    * positive (4.0)
+    * great (5.0)
+    * unknown (everything else)
+    """
+    text = review["reviewText"]
+    rating = review["overall"]
+
+    match rating:
+        case 1.0:
+            category = "terrible"
+        case 2.0:
+            category = "negative"
+        case 3.0:
+            category = "mixed"
+        case 4.0:
+            category = "positive"
+        case 5.0:
+            category = "great"
+        case _:
+            category = "unknown"
+
+    return text, category
+
+
+def polar_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
     """
     Get a list of the same amount of 1-star and 5-star reviews.
     """

@@ -90,13 +129,16 @@ def dataset_polar(collection: pymongo.collection.Collection, amount: int) -> t.I
     positive = sample_reviews_by_rating(collection, rating=5.0, amount=amount)
     negative = sample_reviews_by_rating(collection, rating=1.0, amount=amount)

-    # Randomness here does not matter, so just merge the lists
-    both = itertools.chain(positive, negative)
+    # Chain the iterators
+    full = itertools.chain(positive, negative)

-    return both
+    # Convert reviews to datatuples
+    full = map(review_to_datatuple, full)
+
+    return full


-def dataset_varied(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[Review]:
+def varied_dataset(collection: pymongo.collection.Collection, amount: int) -> t.Iterator[DataTuple]:
     """
     Get a list of the same amount of reviews for each rating.
     """

@@ -109,17 +151,25 @@ def dataset_varied(collection: pymongo.collection.Collection, amount: int) -> t.
     positive = sample_reviews_by_rating(collection, rating=4.0, amount=amount)
     great = sample_reviews_by_rating(collection, rating=5.0, amount=amount)

+    # Chain the iterators
     full = itertools.chain(terrible, negative, mixed, positive, great)

+    # Convert reviews to datatuples
+    full = map(review_to_datatuple, full)
+
     return full


 __all__ = (
     "Review",
+    "Text",
+    "Category",
+    "DataTuple",
+    "DataSet",
     "mongo_client_from_config",
     "mongo_reviews_collection_from_config",
     "sample_reviews",
     "sample_reviews_by_rating",
-    "dataset_polar",
-    "dataset_varied",
+    "polar_dataset",
+    "varied_dataset",
 )
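The new `Text`, `Category`, `DataTuple`, and `DataSet` aliases plus `review_to_datatuple` decouple the analyzers from raw `Review` documents. A small illustration of the mapping, using a hand-written dictionary instead of a document fetched from MongoDB:

```python
from unimore_bda_6.database import review_to_datatuple

# Hand-written stand-in for a Review document (only the fields the function reads).
review = {"reviewText": "Sounds great on vinyl", "overall": 5.0}

text, category = review_to_datatuple(review)
print(text)      # "Sounds great on vinyl"
print(category)  # "great"
```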
@@ -2,7 +2,7 @@ import collections
 import logging
 import coloredlogs

-log = logging.getLogger(__name__)
+this_log = logging.getLogger(__name__)


 def install_log_handler(loggers: list[logging.Logger] = None):

@@ -32,13 +32,13 @@ def install_log_handler(loggers: list[logging.Logger] = None):
             ),
             isatty=True,
         )
-        log.debug("Installed custom log handler on: %s", logger)
+        this_log.debug("Installed custom log handler on: %s", logger)


 _passage_counts = collections.defaultdict(lambda: 0)


-def count_passage(key: str, mod: int):
+def count_passage(log: logging.Logger, key: str, mod: int):
     _passage_counts[key] += 1
     if not _passage_counts[key] % mod:
         log.debug("%s - %d calls", key, _passage_counts[key])
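`count_passage` now takes the caller's logger as its first argument, so the throttled progress messages are attributed to the module doing the work. A brief illustration of the new call shape:

```python
import logging
from unimore_bda_6.log import count_passage

log = logging.getLogger(__name__)

for item in range(1000):
    count_passage(log, "items_processed", 100)  # emits a debug line every 100th call
```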
@@ -1,15 +0,0 @@
-from . import nltk_based
-from . import potts_based
-
-
-all_tokenizers = [
-    nltk_based.nltk_tokenizer,
-    potts_based.potts_tokenizer,
-]
-
-
-__all__ = (
-    "nltk_based",
-    "potts_based",
-    "all_tokenizers",
-)
@@ -1,16 +0,0 @@
-import nltk
-import nltk.sentiment.util
-
-
-def nltk_tokenizer(text: str) -> list[str]:
-    """
-    Convert a text string into a list of tokens.
-    """
-    tokens = nltk.word_tokenize(text)
-    nltk.sentiment.util.mark_negation(tokens, shallow=True)
-    return tokens
-
-
-__all__ = (
-    "nltk_tokenizer",
-)
unimore_bda_6/tokenizer/__init__.py (new file, +10)
@@ -0,0 +1,10 @@
+from .base import BaseTokenizer
+from .nltk_word_tokenize import NLTKWordTokenizer
+from .potts import PottsTokenizer, PottsTokenizerWithNegation
+
+
+__all__ = (
+    "BaseTokenizer",
+    "NLTKWordTokenizer",
+    "PottsTokenizer",
+)
unimore_bda_6/tokenizer/base.py (new file, +17)
@@ -0,0 +1,17 @@
+import abc
+
+
+class BaseTokenizer(metaclass=abc.ABCMeta):
+    """
+    The base for all tokenizers in this project.
+    """
+
+    def __repr__(self):
+        return f"{self.__class__.__qualname__}()"
+
+    @abc.abstractmethod
+    def tokenize(self, text: str) -> list[str]:
+        """
+        Convert a text string into a list of tokens.
+        """
+        raise NotImplementedError()
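`BaseTokenizer` is the new extension point for tokenizers. A sketch of a custom implementation, assuming only the interface shown above (the whitespace splitting is illustrative, not part of the project):

```python
from unimore_bda_6.tokenizer.base import BaseTokenizer


class WhitespaceTokenizer(BaseTokenizer):
    """Toy tokenizer: lowercases and splits on whitespace, no negation marking."""

    def tokenize(self, text: str) -> list[str]:
        return text.lower().split()


print(WhitespaceTokenizer())                       # WhitespaceTokenizer()
print(WhitespaceTokenizer().tokenize("Not bad!"))  # ['not', 'bad!']
```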
unimore_bda_6/tokenizer/nltk_word_tokenize.py (new file, +21)
@@ -0,0 +1,21 @@
+import nltk
+import nltk.sentiment.util
+import typing as t
+
+from .base import BaseTokenizer
+
+
+class NLTKWordTokenizer(BaseTokenizer):
+    """
+    Tokenizer based on `nltk.word_tokenize`.
+    """
+
+    def tokenize(self, text: str) -> t.Iterable[str]:
+        tokens = nltk.word_tokenize(text)
+        nltk.sentiment.util.mark_negation(tokens, shallow=True)
+        return tokens
+
+
+__all__ = (
+    "NLTKWordTokenizer",
+)
@@ -1,8 +1,4 @@
 """
-This file is a vendored version of `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, which the project's specifications require to use.
-
-It has been altered to be used with Python 3.10, but the code is mostly the same.
-
 =========================
 Original module docstring
 =========================

@@ -54,6 +50,9 @@ __email__ = "See the author's website"
 import re
 import html.entities
 import typing as t
+import nltk.sentiment.util
+
+from .base import BaseTokenizer

 ######################################################################
 # The following strings are components in the regular expression

@@ -143,49 +142,58 @@ amp = "&"
 ######################################################################


-def potts_tokenizer(text: str) -> t.Iterable[str]:
-    """
-    Argument: s -- any string object
-    Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False
-    """
-    # Fix HTML character entitites:
-    s = __html2string(text)
-    # Tokenize:
-    words = word_re.findall(s)
-    # Possible alter the case, but avoid changing emoticons like :D into :d:
-    words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
-    # Return the results
-    return words
-
-
-def __html2string(html: str) -> str:
-    """
-    Internal metod that seeks to replace all the HTML entities in
-    s with their corresponding unicode characters.
-    """
-    # First the digits:
-    ents = set(html_entity_digit_re.findall(html))
-    if len(ents) > 0:
-        for ent in ents:
-            entnum = ent[2:-1]
-            try:
-                entnum = int(entnum)
-                html = html.replace(ent, chr(entnum))
-            except:
-                pass
-    # Now the alpha versions:
-    ents = set(html_entity_alpha_re.findall(html))
-    ents = filter((lambda x : x != amp), ents)
-    for ent in ents:
-        entname = ent[1:-1]
-        try:
-            html = html.replace(ent, chr(html.entities.name2codepoint[entname]))
-        except:
-            pass
-    html = html.replace(amp, " and ")
-    return html
+class PottsTokenizer(BaseTokenizer):
+    """
+    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_.
+    """
+
+    @staticmethod
+    def __html2string(s: str) -> str:
+        """
+        Internal metod that seeks to replace all the HTML entities in
+        s with their corresponding unicode characters.
+        """
+        # First the digits:
+        ents = set(html_entity_digit_re.findall(s))
+        if len(ents) > 0:
+            for ent in ents:
+                entnum = ent[2:-1]
+                try:
+                    entnum = int(entnum)
+                    s = s.replace(ent, chr(entnum))
+                except (ValueError, KeyError):
+                    pass
+        # Now the alpha versions:
+        ents = set(html_entity_alpha_re.findall(s))
+        ents = filter((lambda x : x != amp), ents)
+        for ent in ents:
+            entname = ent[1:-1]
+            try:
+                s = s.replace(ent, chr(html.entities.name2codepoint[entname]))
+            except (ValueError, KeyError):
+                pass
+        s = s.replace(amp, " and ")
+        return s
+
+    def tokenize(self, text: str) -> t.Iterable[str]:
+        # Fix HTML character entitites:
+        s = self.__html2string(text)
+        # Tokenize:
+        words = word_re.findall(s)
+        # Possible alter the case, but avoid changing emoticons like :D into :d:
+        words = list(map(lambda x: x if emoticon_re.search(x) else x.lower(), words))
+        # Return the results
+        return words
+
+
+class PottsTokenizerWithNegation(PottsTokenizer):
+    def tokenize(self, text: str) -> t.Iterable[str]:
+        words = super().tokenize(text)
+        nltk.sentiment.util.mark_negation(words, shallow=True)
+        return words


 __all__ = (
-    "potts_tokenizer",
+    "PottsTokenizer",
+    "PottsTokenizerWithNegation",
 )
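The Potts tokenizer is now a class, with a subclass that additionally applies NLTK's negation marking. A hedged comparison sketch; the exact token list depends on the vendored regular expressions, and `mark_negation` is expected to append `_NEG` to tokens that follow a negation word:

```python
from unimore_bda_6.tokenizer import PottsTokenizer, PottsTokenizerWithNegation

sentence = "I don't like this record :("
print(list(PottsTokenizer().tokenize(sentence)))
print(list(PottsTokenizerWithNegation().tokenize(sentence)))
```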