Mirror of https://github.com/Steffo99/unimore-bda-6.git — synced 2024-11-29 03:04:18 +00:00

Commit e6dcf6e423 ("stop here for now"), parent 6ef81c1c19 — 13 changed files with 1161 additions and 36 deletions.
(IDE inspection profile XML)
@@ -32,6 +32,7 @@
 </list>
 </option>
 </inspection_tool>
+<inspection_tool class="PyAbstractClassInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
 <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
 <option name="ignoredErrors">
 <list>
(project documentation — installation section)
@@ -30,16 +30,20 @@
 
 ### Code
 
-The code for this activity is included as a Python package compatible with PEP 518.
+The code for this activity is included as a Python 3.10 package compatible with PEP 518.
 
 To install the package, run the following commands from inside the project directory:
 
 ```console
-$ python -m venv .venv
+$ python3.10 -m venv .venv
 $ source .venv/bin/activate
 $ pip install .
 ```
 
+> **Note:**
+>
+> Because of Tensorflow's particular requirements, Python 3.11 is not supported.
+
 #### NLTK
 
 NLTK requires additional dependencies in order to work, which can be downloaded by running the following command in a console:
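The hunk ends before the actual download command, so the exact resources this project fetches are not visible here. As a hedged sketch only, NLTK data is normally downloaded like this (the resource name is an assumption, not taken from this commit):

```python
# Hedged sketch — "punkt" is an assumed resource name, chosen because
# nltk.word_tokenize (used by NLTKWordTokenizer) requires it.
import nltk

nltk.download("punkt")
```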
poetry.lock (generated, 980 lines changed) — file diff suppressed because it is too large.
pyproject.toml
@@ -131,11 +131,12 @@ classifiers = [
 # nothing means "this specific release"
 # 3.10.1 → == 3.10.1
 
-python = "^3.10"
+python = "~3.10"
 pymongo = "^4.3.3"
 nltk = "^3.8.1"
 cfig = {extras = ["cli"], version = "^0.3.0"}
 coloredlogs = "^15.0.1"
+tensorflow = "^2.11.0"
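In Poetry's constraint syntax, `^3.10` means `>=3.10,<4.0` and would therefore accept Python 3.11, while `~3.10` narrows the range to `>=3.10,<3.11`. The tightened constraint matches the README note added above: Tensorflow, introduced here as `^2.11.0`, does not support Python 3.11.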
(package entry point — main())
@@ -1,18 +1,33 @@
 import logging
+import tensorflow
 
 from .config import config, DATA_SET_SIZE
 from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
-from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
+from .analysis.tf_text import TensorflowSentimentAnalyzer
+from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
 from .log import install_log_handler
 
 log = logging.getLogger(__name__)
 
 
 def main():
+    if len(tensorflow.config.list_physical_devices(device_type="GPU")) == 0:
+        log.warning("Tensorflow reports no GPU acceleration available.")
+    else:
+        log.debug("Tensorflow successfully found GPU acceleration!")
+
     for dataset_func in [polar_dataset, varied_dataset]:
-        for SentimentAnalyzer in [NLTKSentimentAnalyzer]:
-            for Tokenizer in [NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation]:
+        for SentimentAnalyzer in [
+            # NLTKSentimentAnalyzer,
+            TensorflowSentimentAnalyzer,
+        ]:
+            for Tokenizer in [
+                # NLTKWordTokenizer,
+                # PottsTokenizer,
+                # PottsTokenizerWithNegation,
+                LowercaseTokenizer,
+            ]:
                 tokenizer = Tokenizer()
                 model = SentimentAnalyzer(tokenizer=tokenizer)
unimore_bda_6/analysis/base.py
@@ -2,6 +2,7 @@ import abc
 import logging
 
 from ..database import DataSet, Text, Category
+from ..tokenizer import BaseTokenizer
 
 log = logging.getLogger(__name__)
 
@@ -11,6 +12,12 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
     Abstract base class for sentiment analyzers implemented in this project.
     """
 
+    def __init__(self, *, tokenizer: BaseTokenizer):
+        self.tokenizer: BaseTokenizer = tokenizer
+
+    def __repr__(self):
+        return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
+
     @abc.abstractmethod
     def train(self, training_set: DataSet) -> None:
         """
@@ -44,6 +51,20 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
         raise NotImplementedError()
 
 
+class AlreadyTrainedError(Exception):
+    """
+    This model has already been trained and cannot be trained again.
+    """
+
+
+class NotTrainedError(Exception):
+    """
+    This model has not been trained yet.
+    """
+
+
 __all__ = (
     "BaseSentimentAnalyzer",
+    "AlreadyTrainedError",
+    "NotTrainedError",
 )
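The contract defined above (a constructor that takes a tokenizer, abstract train() and use(), and the two guard exceptions now living in this module) can be illustrated with a standalone sketch. This is not project code: the toy MajorityClassAnalyzer, its simplified base class, and the example data are invented for illustration.

```python
# Standalone sketch of the analyzer contract (not project code): a concrete
# analyzer implements train() and use() and raises the two exceptions
# introduced in this commit when used out of order.
import abc


class AlreadyTrainedError(Exception):
    """This model has already been trained and cannot be trained again."""


class NotTrainedError(Exception):
    """This model has not been trained yet."""


class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def train(self, training_set) -> None:
        raise NotImplementedError()

    @abc.abstractmethod
    def use(self, text: str) -> str:
        raise NotImplementedError()


class MajorityClassAnalyzer(BaseSentimentAnalyzer):
    """Toy analyzer: always answers with the most frequent training category."""

    def __init__(self):
        self.majority: str | None = None

    def train(self, training_set) -> None:
        if self.majority is not None:
            raise AlreadyTrainedError()
        categories = [category for _text, category in training_set]
        self.majority = max(set(categories), key=categories.count)

    def use(self, text: str) -> str:
        if self.majority is None:
            raise NotTrainedError()
        return self.majority


analyzer = MajorityClassAnalyzer()
analyzer.train([("great film", "positive"), ("awful film", "negative"), ("loved it", "positive")])
print(analyzer.use("some review text"))  # -> "positive"
```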
unimore_bda_6/analysis/nltk_sentiment.py
@@ -7,7 +7,7 @@ import typing as t
 import itertools
 
 from ..database import Text, Category, DataTuple, DataSet
-from .base import BaseSentimentAnalyzer
+from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 from ..log import count_passage
 from ..tokenizer import BaseTokenizer
 
@@ -17,38 +17,22 @@ TokenBag = list[str]
 Features = dict[str, int]
 
 
-class AlreadyTrainedError(Exception):
-    """
-    This model has already been trained and cannot be trained again.
-    """
-
-
-class NotTrainedError(Exception):
-    """
-    This model has not been trained yet.
-    """
-
-
 class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
     """
     A sentiment analyzer resembling in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
     """
 
     def __init__(self, *, tokenizer: BaseTokenizer) -> None:
-        super().__init__()
+        super().__init__(tokenizer=tokenizer)
         self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
         self.trained: bool = False
-        self.tokenizer: BaseTokenizer = tokenizer
-
-    def __repr__(self):
-        return f"<{self.__class__.__qualname__} {'trained' if self.trained else 'untrained'} tokenizer={self.tokenizer!r}>"
 
     def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]:
         """
         Convert the `Text` of a `DataTuple` to a `TokenBag`.
         """
         count_passage(log, "tokenize_datatuple", 100)
-        return self.tokenizer.tokenize(datatuple[0]), datatuple[1]
+        return self.tokenizer.tokenize_builtins(datatuple[0]), datatuple[1]
 
     def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
         """
@@ -112,7 +96,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
             raise NotTrainedError()
 
         # Tokenize the input
-        tokens = self.tokenizer.tokenize(text)
+        tokens = self.tokenizer.tokenize_builtins(text)
 
         # Run the classification method
         return self.model.classify(instance=tokens)
unimore_bda_6/analysis/tf_text.py (new file, 91 lines)
@@ -0,0 +1,91 @@
+import tensorflow
+import itertools
+import typing as t
+
+from ..database import DataSet, Text, Category
+from ..tokenizer import BaseTokenizer
+from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
+
+
+class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
+    def __init__(self, *, tokenizer: BaseTokenizer):
+        super().__init__(tokenizer=tokenizer)
+        self.trained = False
+        self.text_vectorization_layer = None
+        self.neural_network: tensorflow.keras.Sequential | None = None
+
+    @staticmethod
+    def __infinite_dataset_generator_factory(dataset: DataSet):
+        """
+        A generator of infinite copies of dataset.
+
+        .. todo:: Loads the whole dataset in memory. What a waste! Can we perform multiple MongoDB queries instead?
+        """
+        # Each dataset item is a (text, category) tuple, so starmap is used to unpack it.
+        dataset = itertools.starmap(lambda text, category: (tensorflow.convert_to_tensor(text, dtype=tensorflow.string), tensorflow.convert_to_tensor(category, dtype=tensorflow.string)), dataset)
+
+        def generator():
+            while True:
+                nonlocal dataset
+                dataset, result = itertools.tee(dataset, 2)
+                # Yield the individual (text, category) pairs, not the iterator itself.
+                yield from result
+
+        return generator
+
+    @classmethod
+    def __bda_dataset_to_tf_dataset(cls, dataset: DataSet) -> tensorflow.data.Dataset:
+        """
+        Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
+        """
+        return tensorflow.data.Dataset.from_generator(
+            cls.__infinite_dataset_generator_factory(dataset),
+            output_signature=(
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
+            )
+        )
+
+    MAX_FEATURES = 20000
+    EMBEDDING_DIM = 16
+    EPOCHS = 10
+
+    def train(self, training_set: DataSet) -> None:
+        if self.trained:
+            raise AlreadyTrainedError()
+
+        training_set = self.__bda_dataset_to_tf_dataset(training_set)
+
+        self.text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
+            max_tokens=self.MAX_FEATURES,
+            standardize=self.tokenizer.tokenize_tensorflow,
+        )
+        self.text_vectorization_layer.adapt(map(lambda t: t[0], training_set))
+
+        training_set = training_set.map(self.text_vectorization_layer)
+
+        # I have no idea of what I'm doing here
+        self.neural_network = tensorflow.keras.Sequential([
+            tensorflow.keras.layers.Embedding(self.MAX_FEATURES + 1, self.EMBEDDING_DIM),
+            tensorflow.keras.layers.Dropout(0.2),
+            tensorflow.keras.layers.GlobalAveragePooling1D(),
+            tensorflow.keras.layers.Dropout(0.2),
+            tensorflow.keras.layers.Dense(1),  # single logit output: assumes exactly two categories
+        ])
+
+        self.neural_network.compile(
+            loss=tensorflow.losses.BinaryCrossentropy(from_logits=True),  # Only works with two tags
+            metrics=tensorflow.metrics.BinaryAccuracy(threshold=0.0)
+        )
+
+        self.neural_network.fit(
+            training_set,
+            epochs=self.EPOCHS,
+        )
+
+        self.trained = True
+
+    def use(self, text: Text) -> Category:
+        if not self.trained:
+            raise NotTrainedError()
+
+        prediction = self.neural_network.predict(text)
+        breakpoint()
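The train() method above stops short of a working end-to-end pipeline, and use() still ends in a breakpoint(). As a point of reference only, the following is a minimal, self-contained sketch of the same layer stack (TextVectorization feeding Embedding → Dropout → GlobalAveragePooling1D → Dropout → Dense(1) logit) on made-up data. It is not the project's code, and it folds the vectorization layer into the model instead of mapping the dataset through it:

```python
import tensorflow

# Hedged, self-contained sketch (not project code): the same layer stack the
# commit builds, trained on two invented reviews with 0/1 labels.
texts = tensorflow.constant(["a great movie", "a terrible movie"])
labels = tensorflow.constant([1.0, 0.0])

vectorize = tensorflow.keras.layers.TextVectorization(max_tokens=20000)
vectorize.adapt(texts)  # learn the vocabulary from the example texts

model = tensorflow.keras.Sequential([
    vectorize,                                          # string -> int sequence
    tensorflow.keras.layers.Embedding(20000 + 1, 16),   # int sequence -> dense vectors
    tensorflow.keras.layers.Dropout(0.2),
    tensorflow.keras.layers.GlobalAveragePooling1D(),   # average over the sequence
    tensorflow.keras.layers.Dropout(0.2),
    tensorflow.keras.layers.Dense(1),                   # single logit: two classes only
])
model.compile(
    optimizer="adam",
    loss=tensorflow.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tensorflow.keras.metrics.BinaryAccuracy(threshold=0.0)],
)
model.fit(texts, labels, epochs=2)
print(tensorflow.sigmoid(model.predict(tensorflow.constant(["a great movie"]))))
```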
unimore_bda_6/tokenizer/__init__.py
@@ -1,10 +1,13 @@
 from .base import BaseTokenizer
 from .nltk_word_tokenize import NLTKWordTokenizer
 from .potts import PottsTokenizer, PottsTokenizerWithNegation
+from .lower import LowercaseTokenizer
 
 
 __all__ = (
     "BaseTokenizer",
     "NLTKWordTokenizer",
     "PottsTokenizer",
+    "PottsTokenizerWithNegation",
+    "LowercaseTokenizer",
 )
unimore_bda_6/tokenizer/base.py
@@ -1,7 +1,7 @@
-import abc
+import tensorflow
 
 
-class BaseTokenizer(metaclass=abc.ABCMeta):
+class BaseTokenizer:
     """
     The base for all tokenizers in this project.
     """
@@ -9,9 +9,27 @@ class BaseTokenizer(metaclass=abc.ABCMeta):
     def __repr__(self):
         return f"{self.__class__.__qualname__}()"
 
-    @abc.abstractmethod
-    def tokenize(self, text: str) -> list[str]:
+    @staticmethod
+    def __not_implemented(f):
+        f.__notimplemented__ = True
+        return f
+
+    def can_tokenize_builtins(self) -> bool:
+        # True when the subclass actually overrides tokenize_builtins
+        # (the base implementation below is flagged as not implemented).
+        return not getattr(self.tokenize_builtins, "__notimplemented__", False)
+
+    def can_tokenize_tensorflow(self) -> bool:
+        # True when the subclass actually overrides tokenize_tensorflow.
+        return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)
+
+    @__not_implemented
+    def tokenize_builtins(self, text: str) -> list[str]:
         """
         Convert a text string into a list of tokens.
         """
         raise NotImplementedError()
+
+    @__not_implemented
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
+        """
+        Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
+        """
+        raise NotImplementedError()
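The decorator trick above flags the base implementations so that can_tokenize_builtins() / can_tokenize_tensorflow() can tell whether a subclass overrides them. A minimal standalone sketch of that capability-flag pattern (not project code; the helper is a plain module-level function here for simplicity):

```python
# Standalone sketch of the capability-flag pattern (not project code).
def _not_implemented(f):
    """Mark the base method as a stub, so subclass overrides can be detected."""
    f.__notimplemented__ = True
    return f


class Base:
    def can_tokenize_builtins(self) -> bool:
        # True only when a subclass replaces the flagged stub below.
        return not getattr(self.tokenize_builtins, "__notimplemented__", False)

    @_not_implemented
    def tokenize_builtins(self, text: str) -> list[str]:
        raise NotImplementedError()


class Lower(Base):
    def tokenize_builtins(self, text: str) -> list[str]:
        return text.lower().split()


print(Base().can_tokenize_builtins())   # False — the stub is still in place
print(Lower().can_tokenize_builtins())  # True — the override carries no flag
```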
unimore_bda_6/tokenizer/lower.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+import tensorflow
+
+from .base import BaseTokenizer
+
+
+class LowercaseTokenizer(BaseTokenizer):
+    def tokenize_builtins(self, text: str) -> list[str]:
+        return text.lower().split()
+
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
+        return tensorflow.strings.lower(text)
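For example, the builtins variant simply lowercases and splits on whitespace (illustrative only; the input string is made up):

```python
from unimore_bda_6.tokenizer import LowercaseTokenizer

LowercaseTokenizer().tokenize_builtins("A GREAT Movie")  # -> ["a", "great", "movie"]
```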
unimore_bda_6/tokenizer/nltk_word_tokenize.py
@@ -10,7 +10,7 @@ class NLTKWordTokenizer(BaseTokenizer):
     Tokenizer based on `nltk.word_tokenize`.
     """
 
-    def tokenize(self, text: str) -> t.Iterable[str]:
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
         tokens = nltk.word_tokenize(text)
         nltk.sentiment.util.mark_negation(tokens, shallow=True)
         return tokens
unimore_bda_6/tokenizer/potts.py
@@ -175,7 +175,7 @@ class PottsTokenizer(BaseTokenizer):
         s = s.replace(amp, " and ")
         return s
 
-    def tokenize(self, text: str) -> t.Iterable[str]:
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
         # Fix HTML character entities:
         s = self.__html2string(text)
         # Tokenize:
@@ -187,8 +187,8 @@ class PottsTokenizer(BaseTokenizer):
 
 
 class PottsTokenizerWithNegation(PottsTokenizer):
-    def tokenize(self, text: str) -> t.Iterable[str]:
-        words = super().tokenize(text)
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
+        words = super().tokenize_builtins(text)
         nltk.sentiment.util.mark_negation(words, shallow=True)
         return words