Mirror of https://github.com/Steffo99/unimore-bda-6.git

stop here for now

Commit e6dcf6e423 (parent 6ef81c1c19)
Author: Steffo, 2023-02-04 01:36:42 +01:00
Signed by: steffo (GPG key ID: 2A24051445686895)
13 changed files with 1161 additions and 36 deletions

View file

@@ -32,6 +32,7 @@
         </list>
       </option>
     </inspection_tool>
+    <inspection_tool class="PyAbstractClassInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
     <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
       <option name="ignoredErrors">
         <list>

View file

@@ -30,16 +30,20 @@
 ### Code
-The code for this activity is included as a PEP 518-compatible Python package.
+The code for this activity is included as a PEP 518-compatible Python 3.10 package.
 To install the package, run the following commands from inside the project directory:
 ```console
-$ python -m venv .venv
+$ python3.10 -m venv .venv
 $ source venv/bin/activate
 $ pip install .
 ```
+
+> **Note:**
+>
+> Due to Tensorflow's particular requirements, Python 3.11 is not supported.
+
 #### NLTK
 NLTK requires additional dependencies in order to work, which can be downloaded by running the following command in a console:
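The hunk ends before the command itself appears. As a hedged illustration of what such a step usually looks like, NLTK data can be fetched from Python with `nltk.download`; the resource names below are assumptions, not the project's actual requirement list.

```python
import nltk

# Hypothetical example: these resource names are assumptions, not the
# project's actual requirement list.
for resource in ("punkt", "stopwords", "vader_lexicon"):
    nltk.download(resource)
```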

poetry.lock (generated, 980 changed lines)

File diff suppressed because it is too large.

View file

@@ -131,11 +131,12 @@ classifiers = [
 # nothing means "this specific release"
 # 3.10.1 → == 3.10.1
-python = "^3.10"
+python = "~3.10"
 pymongo = "^4.3.3"
 nltk = "^3.8.1"
 cfig = {extras = ["cli"], version = "^0.3.0"}
 coloredlogs = "^15.0.1"
+tensorflow = "^2.11.0"

View file

@@ -1,18 +1,33 @@
 import logging
+import tensorflow
 
 from .config import config, DATA_SET_SIZE
 from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
-from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
+from .analysis.tf_text import TensorflowSentimentAnalyzer
+from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
 from .log import install_log_handler
 
 log = logging.getLogger(__name__)
 
 
 def main():
+    if len(tensorflow.config.list_physical_devices(device_type="GPU")) == 0:
+        log.warning("Tensorflow reports no GPU acceleration available.")
+    else:
+        log.debug("Tensorflow successfully found GPU acceleration!")
+
     for dataset_func in [polar_dataset, varied_dataset]:
-        for SentimentAnalyzer in [NLTKSentimentAnalyzer]:
-            for Tokenizer in [NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation]:
+        for SentimentAnalyzer in [
+            # NLTKSentimentAnalyzer,
+            TensorflowSentimentAnalyzer,
+        ]:
+            for Tokenizer in [
+                # NLTKWordTokenizer,
+                # PottsTokenizer,
+                # PottsTokenizerWithNegation,
+                LowercaseTokenizer,
+            ]:
                 tokenizer = Tokenizer()
                 model = SentimentAnalyzer(tokenizer=tokenizer)
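The hunk stops right after each analyzer is constructed; presumably the loop goes on to train and score it. A minimal, hypothetical scoring helper (not part of this commit) could look like the following, assuming an evaluation set that iterates as `(text, category)` pairs:

```python
# Hypothetical helper, not part of this commit: scores a trained analyzer
# against an in-memory list of (text, category) pairs.
def evaluate(model, evaluation_set: list[tuple[str, str]]) -> float:
    correct = sum(1 for text, category in evaluation_set if model.use(text) == category)
    return correct / len(evaluation_set)
```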

View file

@@ -2,6 +2,7 @@ import abc
 import logging
 
 from ..database import DataSet, Text, Category
+from ..tokenizer import BaseTokenizer
 
 log = logging.getLogger(__name__)
@@ -11,6 +12,12 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
     Abstract base class for sentiment analyzers implemented in this project.
     """
 
+    def __init__(self, *, tokenizer: BaseTokenizer):
+        self.tokenizer: BaseTokenizer = tokenizer
+
+    def __repr__(self):
+        return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
+
     @abc.abstractmethod
     def train(self, training_set: DataSet) -> None:
         """
@@ -44,6 +51,20 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
         raise NotImplementedError()
 
 
+class AlreadyTrainedError(Exception):
+    """
+    This model has already been trained and cannot be trained again.
+    """
+
+
+class NotTrainedError(Exception):
+    """
+    This model has not been trained yet.
+    """
+
+
 __all__ = (
     "BaseSentimentAnalyzer",
+    "AlreadyTrainedError",
+    "NotTrainedError",
 )
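To make the new contract concrete (keyword-only tokenizer in the constructor, `train` and `use` provided by subclasses, the two exceptions signalling training state), here is a minimal hypothetical subclass; it is not part of the commit, and the absolute module paths in the imports are assumed from the relative imports shown in the hunk.

```python
import collections

# Module paths assumed from the package's relative imports.
from unimore_bda_6.analysis.base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
from unimore_bda_6.database import DataSet, Text, Category
from unimore_bda_6.tokenizer import BaseTokenizer


class MajorityCategoryAnalyzer(BaseSentimentAnalyzer):
    """Hypothetical analyzer that always predicts the most frequent training category."""

    def __init__(self, *, tokenizer: BaseTokenizer):
        super().__init__(tokenizer=tokenizer)
        self.most_common: Category | None = None

    def train(self, training_set: DataSet) -> None:
        if self.most_common is not None:
            raise AlreadyTrainedError()
        # Assumes a DataSet iterates as (text, category) pairs.
        counts = collections.Counter(category for _text, category in training_set)
        self.most_common = counts.most_common(1)[0][0]

    def use(self, text: Text) -> Category:
        if self.most_common is None:
            raise NotTrainedError()
        return self.most_common
```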

View file

@@ -7,7 +7,7 @@ import typing as t
 import itertools
 
 from ..database import Text, Category, DataTuple, DataSet
-from .base import BaseSentimentAnalyzer
+from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 from ..log import count_passage
 from ..tokenizer import BaseTokenizer
@@ -17,38 +17,22 @@ TokenBag = list[str]
 Features = dict[str, int]
 
 
-class AlreadyTrainedError(Exception):
-    """
-    This model has already been trained and cannot be trained again.
-    """
-
-
-class NotTrainedError(Exception):
-    """
-    This model has not been trained yet.
-    """
-
-
 class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
     """
     A sentiment analyzer resembling the one implemented in structure the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
     """
 
     def __init__(self, *, tokenizer: BaseTokenizer) -> None:
-        super().__init__()
+        super().__init__(tokenizer=tokenizer)
         self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
         self.trained: bool = False
-        self.tokenizer: BaseTokenizer = tokenizer
-
-    def __repr__(self):
-        return f"<{self.__class__.__qualname__} {'trained' if self.trained else 'untrained'} tokenizer={self.tokenizer!r}>"
 
     def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]:
         """
         Convert the `Text` of a `DataTuple` to a `TokenBag`.
         """
         count_passage(log, "tokenize_datatuple", 100)
-        return self.tokenizer.tokenize(datatuple[0]), datatuple[1]
+        return self.tokenizer.tokenize_builtins(datatuple[0]), datatuple[1]
 
     def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
         """
@@ -112,7 +96,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
             raise NotTrainedError()
         # Tokenize the input
-        tokens = self.tokenizer.tokenize(text)
+        tokens = self.tokenizer.tokenize_builtins(text)
         # Run the classification method
         return self.model.classify(instance=tokens)
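A hedged usage sketch of the updated class follows; the absolute module paths are assumed from the hunk's relative imports, and the two toy reviews with `positive`/`negative` labels are made up (the real project trains on review data loaded from MongoDB).

```python
# Hypothetical usage, assuming a DataSet can be a plain list of (text, category) pairs.
from unimore_bda_6.analysis.nltk_sentiment import NLTKSentimentAnalyzer
from unimore_bda_6.tokenizer import LowercaseTokenizer

analyzer = NLTKSentimentAnalyzer(tokenizer=LowercaseTokenizer())
analyzer.train([
    ("this place was amazing, loved it", "positive"),
    ("terrible service, never again", "negative"),
])
print(analyzer.use("really loved the food"))  # prints one of the training categories
```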

View file

@@ -0,0 +1,91 @@
+import tensorflow
+import itertools
+import typing as t
+
+from ..database import DataSet, Text, Category
+from ..tokenizer import BaseTokenizer
+from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
+
+
+class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
+    def __init__(self, *, tokenizer: BaseTokenizer):
+        super().__init__(tokenizer=tokenizer)
+        self.trained = False
+        self.text_vectorization_layer = None
+        self.neural_network: tensorflow.keras.Sequential | None = None
+
+    @staticmethod
+    def __infinite_dataset_generator_factory(dataset: DataSet):
+        """
+        A generator of infinite copies of dataset.
+
+        .. todo:: Loads the whole dataset in memory. What a waste! Can we perform multiple MongoDB queries instead?
+        """
+        dataset = map(lambda text, category: (tensorflow.convert_to_tensor(text, dtype=tensorflow.string), tensorflow.convert_to_tensor(category, dtype=tensorflow.string)), dataset)
+
+        def generator():
+            while True:
+                nonlocal dataset
+                dataset, result = itertools.tee(dataset, 2)
+                yield result
+
+        return generator
+
+    @classmethod
+    def __bda_dataset_to_tf_dataset(cls, dataset: DataSet) -> tensorflow.data.Dataset:
+        """
+        Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
+        """
+        return tensorflow.data.Dataset.from_generator(
+            cls.__infinite_dataset_generator_factory(dataset),
+            output_signature=(
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
+            )
+        )
+
+    MAX_FEATURES = 20000
+    EMBEDDING_DIM = 16
+    EPOCHS = 10
+
+    def train(self, training_set: DataSet) -> None:
+        if self.trained:
+            raise AlreadyTrainedError()
+
+        training_set = self.__bda_dataset_to_tf_dataset(training_set)
+
+        self.text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
+            max_tokens=self.MAX_FEATURES,
+            standardize=self.tokenizer.tokenize_tensorflow,
+        )
+        self.text_vectorization_layer.adapt(map(lambda t: t[0], training_set))
+        training_set = training_set.map(self.text_vectorization_layer)
+
+        # I have no idea of what I'm doing here
+        self.neural_network = tensorflow.keras.Sequential([
+            tensorflow.keras.layers.Embedding(self.MAX_FEATURES + 1, self.EMBEDDING_DIM),
+            tensorflow.keras.layers.Dropout(0.2),
+            tensorflow.keras.layers.GlobalAveragePooling1D(),
+            tensorflow.keras.layers.Dropout(0.2),
+            tensorflow.keras.layers.Dense(1),
+        ])
+
+        self.neural_network.compile(
+            loss=tensorflow.losses.BinaryCrossentropy(from_logits=True),  # Only works with two tags
+            metrics=tensorflow.metrics.BinaryAccuracy(threshold=0.0)
+        )
+
+        self.neural_network.fit(
+            training_set,
+            epochs=self.EPOCHS,
+        )
+
+        self.trained = True
+
+    def use(self, text: Text) -> Category:
+        if not self.trained:
+            raise NotTrainedError()
+
+        prediction = self.neural_network.predict(text)
+        breakpoint()
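The new `use` method currently stops at a `breakpoint()`, consistent with the commit message. A hedged sketch of how it might eventually map the single-logit output to a category is below; the sigmoid threshold of 0.5 and the two category labels are assumptions, motivated only by the `Dense(1)` output and the `BinaryCrossentropy(from_logits=True)` loss used above.

```python
# Hypothetical completion of use(), not part of this commit.
def use(self, text: Text) -> Category:
    if not self.trained:
        raise NotTrainedError()
    # Vectorize the input with the layer adapted during training.
    vectorized = self.text_vectorization_layer(tensorflow.convert_to_tensor([text]))
    # Dense(1) trained with from_logits=True: squash the logit into a probability.
    logit = self.neural_network.predict(vectorized)[0][0]
    probability = float(tensorflow.math.sigmoid(logit))
    # Assumed category labels; the project's actual tags may differ.
    return "positive" if probability >= 0.5 else "negative"
```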

View file

@@ -1,10 +1,13 @@
 from .base import BaseTokenizer
 from .nltk_word_tokenize import NLTKWordTokenizer
 from .potts import PottsTokenizer, PottsTokenizerWithNegation
+from .lower import LowercaseTokenizer
 
 __all__ = (
     "BaseTokenizer",
     "NLTKWordTokenizer",
     "PottsTokenizer",
+    "PottsTokenizerWithNegation",
+    "LowercaseTokenizer",
 )

View file

@@ -1,7 +1,7 @@
-import abc
+import tensorflow
 
 
-class BaseTokenizer(metaclass=abc.ABCMeta):
+class BaseTokenizer:
     """
     The base for all tokenizers in this project.
     """
@@ -9,9 +9,27 @@ class BaseTokenizer(metaclass=abc.ABCMeta):
     def __repr__(self):
         return f"{self.__class__.__qualname__}()"
 
-    @abc.abstractmethod
-    def tokenize(self, text: str) -> list[str]:
+    @staticmethod
+    def __not_implemented(f):
+        f.__notimplemented__ = True
+        return f
+
+    def can_tokenize_builtins(self) -> bool:
+        return getattr(self.tokenize_builtins, "__notimplemented__", False)
+
+    def can_tokenize_tensorflow(self) -> bool:
+        return getattr(self.tokenize_tensorflow, "__notimplemented__", False)
+
+    @__not_implemented
+    def tokenize_builtins(self, text: str) -> list[str]:
         """
         Convert a text string into a list of tokens.
         """
         raise NotImplementedError()
+
+    @__not_implemented
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
+        """
+        Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
+        """
+        raise NotImplementedError()

View file

@@ -0,0 +1,11 @@
+import tensorflow
+
+from .base import BaseTokenizer
+
+
+class LowercaseTokenizer(BaseTokenizer):
+    def tokenize_builtins(self, text: str) -> list[str]:
+        return text.lower().split()
+
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
+        return tensorflow.strings.lower(text)
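A quick illustration of the two tokenization paths the new tokenizer exposes (the import path is assumed; the commented values are what these calls return):

```python
import tensorflow

from unimore_bda_6.tokenizer import LowercaseTokenizer  # assumed module path

tokenizer = LowercaseTokenizer()
tokenizer.tokenize_builtins("Great FOOD, great price")
# ['great', 'food,', 'great', 'price']
tokenizer.tokenize_tensorflow(tensorflow.constant("Great FOOD, great price"))
# tf.Tensor(b'great food, great price', shape=(), dtype=string)
```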

View file

@@ -10,7 +10,7 @@ class NLTKWordTokenizer(BaseTokenizer):
     Tokenizer based on `nltk.word_tokenize`.
     """
 
-    def tokenize(self, text: str) -> t.Iterable[str]:
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
         tokens = nltk.word_tokenize(text)
         nltk.sentiment.util.mark_negation(tokens, shallow=True)
         return tokens

View file

@@ -175,7 +175,7 @@ class PottsTokenizer(BaseTokenizer):
         s = s.replace(amp, " and ")
         return s
 
-    def tokenize(self, text: str) -> t.Iterable[str]:
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
         # Fix HTML character entitites:
         s = self.__html2string(text)
         # Tokenize:
@@ -187,8 +187,8 @@ class PottsTokenizer(BaseTokenizer):
 
 class PottsTokenizerWithNegation(PottsTokenizer):
-    def tokenize(self, text: str) -> t.Iterable[str]:
-        words = super().tokenize(text)
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
+        words = super().tokenize_builtins(text)
         nltk.sentiment.util.mark_negation(words, shallow=True)
         return words