Mirror of https://github.com/Steffo99/unimore-bda-6.git

stop here for now

Steffo 2023-02-04 01:36:42 +01:00
parent 6ef81c1c19
commit e6dcf6e423
Signed by: steffo
GPG key ID: 2A24051445686895
13 changed files with 1161 additions and 36 deletions

.idea/inspectionProfiles/Project_Default.xml (path inferred)

@@ -32,6 +32,7 @@
       </list>
     </option>
   </inspection_tool>
+  <inspection_tool class="PyAbstractClassInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
   <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
     <option name="ignoredErrors">
       <list>

README.md

@@ -30,16 +30,20 @@
 ### Code

-The code for this activity is included as a PEP 518-compatible Python package.
+The code for this activity is included as a PEP 518-compatible Python 3.10 package.

 To install the package, run the following commands from inside the project directory:

 ```console
-$ python -m venv .venv
+$ python3.10 -m venv .venv
 $ source .venv/bin/activate
 $ pip install .
 ```

+> **Note:**
+>
+> Because of Tensorflow's particular requirements, Python 3.11 is not supported.
+
 #### NLTK

 NLTK requires additional dependencies to work, which can be downloaded by running the following console command:
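The diff is cut off before the actual command. Purely as a hypothetical sketch (these specific NLTK data packages are an assumption, not taken from the repository), downloading NLTK data typically looks like this:

```python
# Hypothetical example: the real command is in the part of the README not shown here.
import nltk

nltk.download("punkt")          # models for nltk.word_tokenize
nltk.download("vader_lexicon")  # sentiment lexicon used by NLTK's sentiment tools
```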

poetry.lock (generated): 980 changed lines; file diff suppressed because it is too large.

pyproject.toml

@@ -131,11 +131,12 @@ classifiers = [
 # nothing means "this specific release"
 # 3.10.1 → == 3.10.1
-python = "^3.10"
+python = "~3.10"
 pymongo = "^4.3.3"
 nltk = "^3.8.1"
 cfig = {extras = ["cli"], version = "^0.3.0"}
 coloredlogs = "^15.0.1"
+tensorflow = "^2.11.0"
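For context on the constraint change: in Poetry's notation `^3.10` means `>=3.10,<4.0` (which would admit Python 3.11), while `~3.10` means `>=3.10,<3.11`; the tightened bound matches the README note above that Tensorflow does not support Python 3.11.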

unimore_bda_6/__init__.py (path inferred)

@@ -1,18 +1,33 @@
 import logging
+import tensorflow

 from .config import config, DATA_SET_SIZE
 from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
-from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
+from .analysis.tf_text import TensorflowSentimentAnalyzer
+from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
 from .log import install_log_handler

 log = logging.getLogger(__name__)


 def main():
+    if len(tensorflow.config.list_physical_devices(device_type="GPU")) == 0:
+        log.warning("Tensorflow reports no GPU acceleration available.")
+    else:
+        log.debug("Tensorflow successfully found GPU acceleration!")
+
     for dataset_func in [polar_dataset, varied_dataset]:
-        for SentimentAnalyzer in [NLTKSentimentAnalyzer]:
-            for Tokenizer in [NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation]:
+        for SentimentAnalyzer in [
+            # NLTKSentimentAnalyzer,
+            TensorflowSentimentAnalyzer,
+        ]:
+            for Tokenizer in [
+                # NLTKWordTokenizer,
+                # PottsTokenizer,
+                # PottsTokenizerWithNegation,
+                LowercaseTokenizer,
+            ]:
                 tokenizer = Tokenizer()
                 model = SentimentAnalyzer(tokenizer=tokenizer)

unimore_bda_6/analysis/base.py

@@ -2,6 +2,7 @@ import abc
 import logging

 from ..database import DataSet, Text, Category
+from ..tokenizer import BaseTokenizer

 log = logging.getLogger(__name__)

@@ -11,6 +12,12 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
     Abstract base class for sentiment analyzers implemented in this project.
     """

+    def __init__(self, *, tokenizer: BaseTokenizer):
+        self.tokenizer: BaseTokenizer = tokenizer
+
+    def __repr__(self):
+        return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
+
     @abc.abstractmethod
     def train(self, training_set: DataSet) -> None:
         """

@@ -44,6 +51,20 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
         raise NotImplementedError()


+class AlreadyTrainedError(Exception):
+    """
+    This model has already been trained and cannot be trained again.
+    """
+
+
+class NotTrainedError(Exception):
+    """
+    This model has not been trained yet.
+    """
+
+
+__all__ = (
+    "BaseSentimentAnalyzer",
+    "AlreadyTrainedError",
+    "NotTrainedError",
+)
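To make the base-class contract concrete, here is a minimal hypothetical subclass; the class name, its trivial logic, and the "positive" category label are invented for illustration and are not part of the repository:

```python
# Illustration only: a toy analyzer satisfying the BaseSentimentAnalyzer contract.
from unimore_bda_6.analysis.base import BaseSentimentAnalyzer, NotTrainedError
from unimore_bda_6.database import DataSet, Text, Category
from unimore_bda_6.tokenizer import LowercaseTokenizer


class ConstantSentimentAnalyzer(BaseSentimentAnalyzer):
    """Ignores its input and always answers the same category."""

    def __init__(self, *, tokenizer):
        super().__init__(tokenizer=tokenizer)
        self.trained = False

    def train(self, training_set: DataSet) -> None:
        self.trained = True  # nothing to learn in this toy example

    def use(self, text: Text) -> Category:
        if not self.trained:
            raise NotTrainedError()
        return "positive"  # invented label


model = ConstantSentimentAnalyzer(tokenizer=LowercaseTokenizer())
```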

unimore_bda_6/analysis/nltk_sentiment.py

@@ -7,7 +7,7 @@ import typing as t
 import itertools

 from ..database import Text, Category, DataTuple, DataSet
-from .base import BaseSentimentAnalyzer
+from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 from ..log import count_passage
 from ..tokenizer import BaseTokenizer

@@ -17,38 +17,22 @@ TokenBag = list[str]
 Features = dict[str, int]


-class AlreadyTrainedError(Exception):
-    """
-    This model has already been trained and cannot be trained again.
-    """
-
-
-class NotTrainedError(Exception):
-    """
-    This model has not been trained yet.
-    """
-
-
 class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
     """
     A sentiment analyzer resembling, in structure, the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
     """

     def __init__(self, *, tokenizer: BaseTokenizer) -> None:
-        super().__init__()
+        super().__init__(tokenizer=tokenizer)
         self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
         self.trained: bool = False
-        self.tokenizer: BaseTokenizer = tokenizer

     def __repr__(self):
         return f"<{self.__class__.__qualname__} {'trained' if self.trained else 'untrained'} tokenizer={self.tokenizer!r}>"

     def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]:
         """
         Convert the `Text` of a `DataTuple` to a `TokenBag`.
         """
         count_passage(log, "tokenize_datatuple", 100)
-        return self.tokenizer.tokenize(datatuple[0]), datatuple[1]
+        return self.tokenizer.tokenize_builtins(datatuple[0]), datatuple[1]

     def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
         """

@@ -112,7 +96,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
             raise NotTrainedError()

         # Tokenize the input
-        tokens = self.tokenizer.tokenize(text)
+        tokens = self.tokenizer.tokenize_builtins(text)

         # Run the classification method
         return self.model.classify(instance=tokens)

unimore_bda_6/analysis/tf_text.py (new file)

@@ -0,0 +1,91 @@
+import tensorflow
+import itertools
+import typing as t
+
+from ..database import DataSet, Text, Category
+from ..tokenizer import BaseTokenizer
+from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
+
+
+class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
+    def __init__(self, *, tokenizer: BaseTokenizer):
+        super().__init__(tokenizer=tokenizer)
+        self.trained = False
+        self.text_vectorization_layer = None
+        self.neural_network: tensorflow.keras.Sequential | None = None
+
+    @staticmethod
+    def __infinite_dataset_generator_factory(dataset: DataSet):
+        """
+        A generator of infinite copies of dataset.
+
+        .. todo:: Loads the whole dataset in memory. What a waste! Can we perform multiple MongoDB queries instead?
+        """
+        # Each DataSet item is a single (text, category) tuple, so it must be unpacked from one argument.
+        dataset = map(lambda item: (tensorflow.convert_to_tensor(item[0], dtype=tensorflow.string), tensorflow.convert_to_tensor(item[1], dtype=tensorflow.string)), dataset)
+
+        def generator():
+            while True:
+                nonlocal dataset
+                dataset, result = itertools.tee(dataset, 2)
+                # Yield the individual (text, category) pairs, matching the output_signature below.
+                yield from result
+
+        return generator
+
+    @classmethod
+    def __bda_dataset_to_tf_dataset(cls, dataset: DataSet) -> tensorflow.data.Dataset:
+        """
+        Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
+        """
+        return tensorflow.data.Dataset.from_generator(
+            cls.__infinite_dataset_generator_factory(dataset),
+            output_signature=(
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
+            )
+        )
+
+    MAX_FEATURES = 20000
+    EMBEDDING_DIM = 16
+    EPOCHS = 10
+
+    def train(self, training_set: DataSet) -> None:
+        if self.trained:
+            raise AlreadyTrainedError()
+
+        training_set = self.__bda_dataset_to_tf_dataset(training_set)
+
+        self.text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
+            max_tokens=self.MAX_FEATURES,
+            standardize=self.tokenizer.tokenize_tensorflow,
+        )
+        self.text_vectorization_layer.adapt(map(lambda t: t[0], training_set))
+        training_set = training_set.map(self.text_vectorization_layer)
+
+        # I have no idea of what I'm doing here
+        self.neural_network = tensorflow.keras.Sequential([
+            tensorflow.keras.layers.Embedding(self.MAX_FEATURES + 1, self.EMBEDDING_DIM),
+            tensorflow.keras.layers.Dropout(0.2),
+            tensorflow.keras.layers.GlobalAveragePooling1D(),
+            tensorflow.keras.layers.Dropout(0.2),
+            tensorflow.keras.layers.Dense(1),
+        ])
+
+        self.neural_network.compile(
+            loss=tensorflow.losses.BinaryCrossentropy(from_logits=True),  # Only works with two tags
+            metrics=tensorflow.metrics.BinaryAccuracy(threshold=0.0),
+        )
+
+        self.neural_network.fit(
+            training_set,
+            epochs=self.EPOCHS,
+        )
+
+        self.trained = True
+
+    def use(self, text: Text) -> Category:
+        if not self.trained:
+            raise NotTrainedError()
+
+        prediction = self.neural_network.predict(text)
+        breakpoint()  # Work in progress; see the commit message: "stop here for now".
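The method stops at `breakpoint()`, hence the commit message. Purely as a hedged sketch of what the missing tail of `use` might look like, given the single-logit `Dense(1)` head trained with `from_logits=True` (the category labels and the re-use of the vectorization layer are assumptions, not the author's code):

```python
# Hypothetical continuation, illustration only and not part of the commit.
vectorized = self.text_vectorization_layer(tensorflow.convert_to_tensor([text]))
logit = self.neural_network.predict(vectorized)[0][0]
return "positive" if logit > 0.0 else "negative"  # invented labels
```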

unimore_bda_6/tokenizer/__init__.py

@@ -1,10 +1,13 @@
 from .base import BaseTokenizer
 from .nltk_word_tokenize import NLTKWordTokenizer
 from .potts import PottsTokenizer, PottsTokenizerWithNegation
+from .lower import LowercaseTokenizer

 __all__ = (
     "BaseTokenizer",
     "NLTKWordTokenizer",
     "PottsTokenizer",
     "PottsTokenizerWithNegation",
+    "LowercaseTokenizer",
 )

unimore_bda_6/tokenizer/base.py

@@ -1,7 +1,7 @@
 import abc
+import tensorflow


-class BaseTokenizer(metaclass=abc.ABCMeta):
+class BaseTokenizer:
     """
     The base for all tokenizers in this project.
     """

@@ -9,9 +9,27 @@ class BaseTokenizer(metaclass=abc.ABCMeta):
     def __repr__(self):
         return f"{self.__class__.__qualname__}()"

-    @abc.abstractmethod
-    def tokenize(self, text: str) -> list[str]:
+    @staticmethod
+    def __not_implemented(f):
+        f.__notimplemented__ = True
+        return f
+
+    def can_tokenize_builtins(self) -> bool:
+        # True only when the subclass overrides tokenize_builtins; the stub below is flagged __notimplemented__.
+        return not getattr(self.tokenize_builtins, "__notimplemented__", False)
+
+    def can_tokenize_tensorflow(self) -> bool:
+        # True only when the subclass overrides tokenize_tensorflow.
+        return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)
+
+    @__not_implemented
+    def tokenize_builtins(self, text: str) -> list[str]:
         """
         Convert a text string into a list of tokens.
         """
         raise NotImplementedError()
+
+    @__not_implemented
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
+        """
+        Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
+        """
+        raise NotImplementedError()
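A small usage sketch of the capability flags introduced here, assuming the corrected `not` semantics above (illustration only, not part of the commit):

```python
# Illustration only: checking which backends a tokenizer supports.
from unimore_bda_6.tokenizer import LowercaseTokenizer, NLTKWordTokenizer

lower = LowercaseTokenizer()
assert lower.can_tokenize_builtins()      # tokenize_builtins is overridden
assert lower.can_tokenize_tensorflow()    # tokenize_tensorflow is overridden

nltk_tok = NLTKWordTokenizer()
assert nltk_tok.can_tokenize_builtins()        # overridden in nltk_word_tokenize.py
assert not nltk_tok.can_tokenize_tensorflow()  # still the flagged stub from BaseTokenizer
```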

unimore_bda_6/tokenizer/lower.py (new file)

@@ -0,0 +1,11 @@
+import tensorflow
+
+from .base import BaseTokenizer
+
+
+class LowercaseTokenizer(BaseTokenizer):
+    def tokenize_builtins(self, text: str) -> list[str]:
+        return text.lower().split()
+
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
+        return tensorflow.strings.lower(text)

unimore_bda_6/tokenizer/nltk_word_tokenize.py

@@ -10,7 +10,7 @@ class NLTKWordTokenizer(BaseTokenizer):
     Tokenizer based on `nltk.word_tokenize`.
     """

-    def tokenize(self, text: str) -> t.Iterable[str]:
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
         tokens = nltk.word_tokenize(text)
         nltk.sentiment.util.mark_negation(tokens, shallow=True)
         return tokens

unimore_bda_6/tokenizer/potts.py

@@ -175,7 +175,7 @@ class PottsTokenizer(BaseTokenizer):
         s = s.replace(amp, " and ")
         return s

-    def tokenize(self, text: str) -> t.Iterable[str]:
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
         # Fix HTML character entities:
         s = self.__html2string(text)
         # Tokenize:

@@ -187,8 +187,8 @@ class PottsTokenizer(BaseTokenizer):
 class PottsTokenizerWithNegation(PottsTokenizer):
-    def tokenize(self, text: str) -> t.Iterable[str]:
-        words = super().tokenize(text)
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
+        words = super().tokenize_builtins(text)
         nltk.sentiment.util.mark_negation(words, shallow=True)
         return words