Mirror of https://github.com/Steffo99/unimore-bda-6.git (synced 2024-11-25 01:04:19 +00:00)

Commit e6dcf6e423, "stop here for now" (parent: 6ef81c1c19)

13 changed files with 1161 additions and 36 deletions
@@ -32,6 +32,7 @@
         </list>
       </option>
     </inspection_tool>
+    <inspection_tool class="PyAbstractClassInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
     <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
       <option name="ignoredErrors">
         <list>
@@ -30,16 +30,20 @@
 ### Code

-The code for this activity is included as a PEP 518-compatible Python package.
+The code for this activity is included as a PEP 518-compatible Python 3.10 package.

 To install the package, just run the following commands from inside the project directory:

 ```console
-$ python -m venv .venv
+$ python3.10 -m venv .venv
 $ source .venv/bin/activate
 $ pip install .
 ```

+> **Note:**
+>
+> Due to Tensorflow's particular requirements, Python 3.11 is not supported.
+
 #### NLTK

 NLTK needs additional dependencies in order to work; they can be downloaded by running the following console command:
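The hunk ends before showing the command itself. For context, NLTK data is normally fetched through its built-in downloader; a minimal sketch follows (the resource names are placeholders, not necessarily the ones this project actually needs):

```python
import nltk

# Hypothetical example: "punkt" and "stopwords" are common NLTK resources,
# not necessarily the ones this project's README goes on to list.
nltk.download("punkt")
nltk.download("stopwords")
```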
poetry.lock (generated, 980 lines changed): file diff suppressed because it is too large.
@@ -131,11 +131,12 @@ classifiers = [
 # nothing means "this specific release"
 # 3.10.1 → == 3.10.1

-python = "^3.10"
+python = "~3.10"
 pymongo = "^4.3.3"
 nltk = "^3.8.1"
 cfig = {extras = ["cli"], version = "^0.3.0"}
 coloredlogs = "^15.0.1"
+tensorflow = "^2.11.0"
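The switch from `^3.10` to `~3.10` is what enforces the README's new note about Python 3.11: in Poetry's constraint syntax, a caret allows any later minor release of the same major version (>=3.10,<4.0), while a tilde pins the minor version (>=3.10,<3.11). A quick check of the two ranges, using the `packaging` library as a stand-in for Poetry's resolver:

```python
from packaging.specifiers import SpecifierSet

caret = SpecifierSet(">=3.10,<4.0")   # roughly what Poetry's "^3.10" expands to
tilde = SpecifierSet(">=3.10,<3.11")  # roughly what Poetry's "~3.10" expands to

print("3.11" in caret)  # True:  a caret range would still accept Python 3.11
print("3.11" in tilde)  # False: a tilde range keeps the project on 3.10.x
```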
@@ -1,18 +1,33 @@
 import logging
+import tensorflow

 from .config import config, DATA_SET_SIZE
 from .database import mongo_reviews_collection_from_config, polar_dataset, varied_dataset
 from .analysis.nltk_sentiment import NLTKSentimentAnalyzer
-from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
+from .analysis.tf_text import TensorflowSentimentAnalyzer
+from .tokenizer import NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, LowercaseTokenizer
 from .log import install_log_handler

 log = logging.getLogger(__name__)


 def main():
+    if len(tensorflow.config.list_physical_devices(device_type="GPU")) == 0:
+        log.warning("Tensorflow reports no GPU acceleration available.")
+    else:
+        log.debug("Tensorflow successfully found GPU acceleration!")
+
     for dataset_func in [polar_dataset, varied_dataset]:
-        for SentimentAnalyzer in [NLTKSentimentAnalyzer]:
-            for Tokenizer in [NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation]:
+        for SentimentAnalyzer in [
+            # NLTKSentimentAnalyzer,
+            TensorflowSentimentAnalyzer,
+        ]:
+            for Tokenizer in [
+                # NLTKWordTokenizer,
+                # PottsTokenizer,
+                # PottsTokenizerWithNegation,
+                LowercaseTokenizer,
+            ]:
                 tokenizer = Tokenizer()
                 model = SentimentAnalyzer(tokenizer=tokenizer)
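The nested loops in `main` sweep every combination of dataset function, analyzer class and tokenizer class (most of them commented out at this point). For reference, the same sweep can be written with `itertools.product`; this sketch reuses the package's own imports and is not part of the commit:

```python
import itertools

from unimore_bda_6.analysis.tf_text import TensorflowSentimentAnalyzer
from unimore_bda_6.database import polar_dataset, varied_dataset
from unimore_bda_6.tokenizer import LowercaseTokenizer

# Hypothetical equivalent of the nested loops above: one iteration per combination.
for dataset_func, SentimentAnalyzer, Tokenizer in itertools.product(
    [polar_dataset, varied_dataset],
    [TensorflowSentimentAnalyzer],
    [LowercaseTokenizer],
):
    tokenizer = Tokenizer()
    model = SentimentAnalyzer(tokenizer=tokenizer)
    # ...training and evaluation of `model` on dataset_func's data would follow here...
```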
@@ -2,6 +2,7 @@ import abc
 import logging

 from ..database import DataSet, Text, Category
+from ..tokenizer import BaseTokenizer

 log = logging.getLogger(__name__)

@@ -11,6 +12,12 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
     Abstract base class for sentiment analyzers implemented in this project.
     """

+    def __init__(self, *, tokenizer: BaseTokenizer):
+        self.tokenizer: BaseTokenizer = tokenizer
+
+    def __repr__(self):
+        return f"<{self.__class__.__qualname__} tokenizer={self.tokenizer!r}>"
+
     @abc.abstractmethod
     def train(self, training_set: DataSet) -> None:
         """

@@ -44,6 +51,20 @@ class BaseSentimentAnalyzer(metaclass=abc.ABCMeta):
         raise NotImplementedError()


+class AlreadyTrainedError(Exception):
+    """
+    This model has already been trained and cannot be trained again.
+    """
+
+
+class NotTrainedError(Exception):
+    """
+    This model has not been trained yet.
+    """
+
+
+__all__ = (
+    "BaseSentimentAnalyzer",
+    "AlreadyTrainedError",
+    "NotTrainedError",
+)
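For reference, here is a minimal sketch of a concrete subclass honoring the contract this base class now encodes: train exactly once, refuse to retrain, refuse to classify before training. `ConstantSentimentAnalyzer` and its majority-category logic are invented for illustration; the sketch only assumes that a `DataSet` iterates as `(text, category)` pairs and that `train` and `use` are the only abstract methods, as the surrounding hunks suggest:

```python
from unimore_bda_6.analysis.base import (
    AlreadyTrainedError,
    BaseSentimentAnalyzer,
    NotTrainedError,
)
from unimore_bda_6.tokenizer import BaseTokenizer, LowercaseTokenizer


class ConstantSentimentAnalyzer(BaseSentimentAnalyzer):
    """Toy analyzer: always answers with the most frequent training category."""

    def __init__(self, *, tokenizer: BaseTokenizer):
        super().__init__(tokenizer=tokenizer)
        self.most_frequent = None

    def train(self, training_set) -> None:
        if self.most_frequent is not None:
            raise AlreadyTrainedError()
        categories = [category for _text, category in training_set]
        self.most_frequent = max(set(categories), key=categories.count)

    def use(self, text):
        if self.most_frequent is None:
            raise NotTrainedError()
        return self.most_frequent


analyzer = ConstantSentimentAnalyzer(tokenizer=LowercaseTokenizer())
analyzer.train([("nice", "positive"), ("awful", "negative"), ("great", "positive")])
print(analyzer.use("anything"))  # "positive": two of the three training examples were positive
```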
@@ -7,7 +7,7 @@ import typing as t
 import itertools

 from ..database import Text, Category, DataTuple, DataSet
-from .base import BaseSentimentAnalyzer
+from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
 from ..log import count_passage
 from ..tokenizer import BaseTokenizer

@@ -17,38 +17,22 @@ TokenBag = list[str]
 Features = dict[str, int]


-class AlreadyTrainedError(Exception):
-    """
-    This model has already been trained and cannot be trained again.
-    """
-
-
-class NotTrainedError(Exception):
-    """
-    This model has not been trained yet.
-    """
-
-
 class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
     """
     A sentiment analyzer resembling, in structure, the one implemented in the classroom, using the basic sentiment analyzer of NLTK.
     """

     def __init__(self, *, tokenizer: BaseTokenizer) -> None:
-        super().__init__()
+        super().__init__(tokenizer=tokenizer)
         self.model: nltk.sentiment.SentimentAnalyzer = nltk.sentiment.SentimentAnalyzer()
         self.trained: bool = False
-        self.tokenizer: BaseTokenizer = tokenizer

     def __repr__(self):
         return f"<{self.__class__.__qualname__} {'trained' if self.trained else 'untrained'} tokenizer={self.tokenizer!r}>"

     def __tokenize_datatuple(self, datatuple: DataTuple) -> tuple[TokenBag, Category]:
         """
         Convert the `Text` of a `DataTuple` to a `TokenBag`.
         """
         count_passage(log, "tokenize_datatuple", 100)
-        return self.tokenizer.tokenize(datatuple[0]), datatuple[1]
+        return self.tokenizer.tokenize_builtins(datatuple[0]), datatuple[1]

     def _add_feature_unigrams(self, dataset: t.Iterator[tuple[TokenBag, Category]]) -> None:
         """

@@ -112,7 +96,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
             raise NotTrainedError()

         # Tokenize the input
-        tokens = self.tokenizer.tokenize(text)
+        tokens = self.tokenizer.tokenize_builtins(text)

         # Run the classification method
         return self.model.classify(instance=tokens)
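A hedged usage sketch of the refactored NLTK analyzer, assuming a `DataSet` can be any iterable of `(text, category)` pairs and that `use` is the public classification entry point; the toy reviews below are stand-ins for the MongoDB data the project actually uses:

```python
from unimore_bda_6.analysis.nltk_sentiment import NLTKSentimentAnalyzer
from unimore_bda_6.tokenizer import PottsTokenizer

analyzer = NLTKSentimentAnalyzer(tokenizer=PottsTokenizer())
analyzer.train([
    ("Loved this film, great acting and a great story.", "positive"),
    ("Terrible plot, terrible pacing, do not watch.", "negative"),
])
print(analyzer.use("What a great film!"))  # expected: "positive"
```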
unimore_bda_6/analysis/tf_text.py (new file, 91 lines)
@@ -0,0 +1,91 @@
+import tensorflow
+import itertools
+import typing as t
+
+from ..database import DataSet, Text, Category
+from ..tokenizer import BaseTokenizer
+from .base import BaseSentimentAnalyzer, AlreadyTrainedError, NotTrainedError
+
+
+class TensorflowSentimentAnalyzer(BaseSentimentAnalyzer):
+    def __init__(self, *, tokenizer: BaseTokenizer):
+        super().__init__(tokenizer=tokenizer)
+        self.trained = False
+        self.text_vectorization_layer = None
+        self.neural_network: tensorflow.keras.Sequential | None = None
+
+    @staticmethod
+    def __infinite_dataset_generator_factory(dataset: DataSet):
+        """
+        A generator of infinite copies of dataset.
+
+        .. todo:: Loads the whole dataset in memory. What a waste! Can we perform multiple MongoDB queries instead?
+        """
+        dataset = map(lambda text, category: (tensorflow.convert_to_tensor(text, dtype=tensorflow.string), tensorflow.convert_to_tensor(category, dtype=tensorflow.string)), dataset)
+
+        def generator():
+            while True:
+                nonlocal dataset
+                dataset, result = itertools.tee(dataset, 2)
+                yield result
+
+        return generator
+
+    @classmethod
+    def __bda_dataset_to_tf_dataset(cls, dataset: DataSet) -> tensorflow.data.Dataset:
+        """
+        Convert a `unimore_bda_6.database.DataSet` to a "real" `tensorflow.data.Dataset`.
+        """
+        return tensorflow.data.Dataset.from_generator(
+            cls.__infinite_dataset_generator_factory(dataset),
+            output_signature=(
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
+                tensorflow.TensorSpec(shape=(), dtype=tensorflow.string),
+            )
+        )
+
+    MAX_FEATURES = 20000
+    EMBEDDING_DIM = 16
+    EPOCHS = 10
+
+    def train(self, training_set: DataSet) -> None:
+        if self.trained:
+            raise AlreadyTrainedError()
+
+        training_set = self.__bda_dataset_to_tf_dataset(training_set)
+
+        self.text_vectorization_layer = tensorflow.keras.layers.TextVectorization(
+            max_tokens=self.MAX_FEATURES,
+            standardize=self.tokenizer.tokenize_tensorflow,
+        )
+        self.text_vectorization_layer.adapt(map(lambda t: t[0], training_set))
+
+        training_set = training_set.map(self.text_vectorization_layer)
+
+        # I have no idea of what I'm doing here
+        self.neural_network = tensorflow.keras.Sequential([
+            tensorflow.keras.layers.Embedding(self.MAX_FEATURES + 1, self.EMBEDDING_DIM),
+            tensorflow.keras.layers.Dropout(0.2),
+            tensorflow.keras.layers.GlobalAveragePooling1D(),
+            tensorflow.keras.layers.Dropout(0.2),
+            tensorflow.keras.layers.Dense(1),
+        ])
+
+        self.neural_network.compile(
+            loss=tensorflow.losses.BinaryCrossentropy(from_logits=True),  # Only works with two tags
+            metrics=tensorflow.metrics.BinaryAccuracy(threshold=0.0)
+        )
+
+        self.neural_network.fit(
+            training_set,
+            epochs=self.EPOCHS,
+        )
+
+        self.trained = True
+
+    def use(self, text: Text) -> Category:
+        if not self.trained:
+            raise NotTrainedError()
+
+        prediction = self.neural_network.predict(text)
+        breakpoint()
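The new class follows the standard Keras text-classification recipe: TextVectorization, then Embedding, GlobalAveragePooling1D and a single-logit Dense layer trained with binary cross-entropy on the logits. Below is a self-contained sketch of that recipe with toy data, numeric 0/1 labels and an explicit optimizer; these are assumptions of mine, not values taken from this repository:

```python
import tensorflow

# Toy training data: texts plus numeric binary labels (1 = positive, 0 = negative).
texts = tensorflow.constant(["great movie", "awful plot", "loved it", "boring and bad"])
labels = tensorflow.constant([1, 0, 1, 0])
dataset = tensorflow.data.Dataset.from_tensor_slices((texts, labels)).batch(2)

MAX_FEATURES = 20000
EMBEDDING_DIM = 16

vectorize = tensorflow.keras.layers.TextVectorization(max_tokens=MAX_FEATURES)
vectorize.adapt(dataset.map(lambda text, label: text))  # fit the vocabulary on the text only

model = tensorflow.keras.Sequential([
    vectorize,                                                      # strings -> token ids
    tensorflow.keras.layers.Embedding(MAX_FEATURES + 1, EMBEDDING_DIM),
    tensorflow.keras.layers.Dropout(0.2),
    tensorflow.keras.layers.GlobalAveragePooling1D(),
    tensorflow.keras.layers.Dropout(0.2),
    tensorflow.keras.layers.Dense(1),                               # one logit: binary output
])
model.compile(
    optimizer="adam",
    loss=tensorflow.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tensorflow.keras.metrics.BinaryAccuracy(threshold=0.0)],
)
model.fit(dataset, epochs=2)

# Because the vectorization layer is part of the model, prediction works on raw strings.
print(model.predict(tensorflow.constant(["great plot"])))
```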
@@ -1,10 +1,13 @@
 from .base import BaseTokenizer
 from .nltk_word_tokenize import NLTKWordTokenizer
 from .potts import PottsTokenizer, PottsTokenizerWithNegation
+from .lower import LowercaseTokenizer


 __all__ = (
     "BaseTokenizer",
     "NLTKWordTokenizer",
     "PottsTokenizer",
     "PottsTokenizerWithNegation",
+    "LowercaseTokenizer",
 )
@@ -1,7 +1,7 @@
 import abc
+import tensorflow


-class BaseTokenizer(metaclass=abc.ABCMeta):
+class BaseTokenizer:
     """
     The base for all tokenizers in this project.
     """

@@ -9,9 +9,27 @@ class BaseTokenizer(metaclass=abc.ABCMeta):
     def __repr__(self):
         return f"{self.__class__.__qualname__}()"

-    @abc.abstractmethod
-    def tokenize(self, text: str) -> list[str]:
+    @staticmethod
+    def __not_implemented(f):
+        f.__notimplemented__ = True
+        return f
+
+    def can_tokenize_builtins(self) -> bool:
+        return getattr(self.tokenize_builtins, "__notimplemented__", False)
+
+    def can_tokenize_tensorflow(self) -> bool:
+        return getattr(self.tokenize_tensorflow, "__notimplemented__", False)
+
+    @__not_implemented
+    def tokenize_builtins(self, text: str) -> list[str]:
         """
         Convert a text string into a list of tokens.
         """
         raise NotImplementedError()
+
+    @__not_implemented
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
+        """
+        Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
+        """
+        raise NotImplementedError()
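The `__not_implemented` decorator marks the base class's stub methods; a subclass that overrides a method replaces the marked function, so the `__notimplemented__` attribute disappears on the override. A sketch of how such a capability check is typically consumed (note the negation, since the flag marks missing implementations; `BuiltinsOnlyTokenizer` and `supports_tensorflow` are hypothetical names):

```python
from unimore_bda_6.tokenizer import BaseTokenizer, LowercaseTokenizer


class BuiltinsOnlyTokenizer(BaseTokenizer):
    """Hypothetical tokenizer that only overrides the plain-Python entry point."""

    def tokenize_builtins(self, text: str) -> list[str]:
        return text.split()


def supports_tensorflow(tokenizer: BaseTokenizer) -> bool:
    # Overriding a stub removes the __notimplemented__ marker set by the decorator,
    # so the capability check is the negation of that flag.
    return not getattr(tokenizer.tokenize_tensorflow, "__notimplemented__", False)


print(supports_tensorflow(BuiltinsOnlyTokenizer()))  # False: the tensorflow path is still the stub
print(supports_tensorflow(LowercaseTokenizer()))     # True: both paths are overridden
```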
unimore_bda_6/tokenizer/lower.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+import tensorflow
+
+from .base import BaseTokenizer
+
+
+class LowercaseTokenizer(BaseTokenizer):
+    def tokenize_builtins(self, text: str) -> list[str]:
+        return text.lower().split()
+
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
+        return tensorflow.strings.lower(text)
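A quick look at what the two entry points of `LowercaseTokenizer` return, with the expected outputs shown as comments:

```python
import tensorflow

from unimore_bda_6.tokenizer import LowercaseTokenizer

tokenizer = LowercaseTokenizer()

print(tokenizer.tokenize_builtins("Great Movie, LOVED it"))
# ['great', 'movie,', 'loved', 'it']: plain lowercasing plus whitespace split, punctuation kept

print(tokenizer.tokenize_tensorflow(tensorflow.constant("Great Movie, LOVED it")))
# tf.Tensor(b'great movie, loved it', shape=(), dtype=string): lowercased, still a single string
```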
@@ -10,7 +10,7 @@ class NLTKWordTokenizer(BaseTokenizer):
     Tokenizer based on `nltk.word_tokenize`.
     """

-    def tokenize(self, text: str) -> t.Iterable[str]:
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
         tokens = nltk.word_tokenize(text)
         nltk.sentiment.util.mark_negation(tokens, shallow=True)
         return tokens
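Both this tokenizer and `PottsTokenizerWithNegation` below rely on `nltk.sentiment.util.mark_negation`, which appends `_NEG` to every token between a negation word and the next clause-ending punctuation mark. A small illustration, with the expected output shown as a comment:

```python
import nltk.sentiment.util

tokens = ["i", "did", "not", "like", "this", "movie", ".", "great", "cast", "though"]
nltk.sentiment.util.mark_negation(tokens, shallow=True)  # modifies the list in place
print(tokens)
# ['i', 'did', 'not', 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'great', 'cast', 'though']
```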
@@ -175,7 +175,7 @@ class PottsTokenizer(BaseTokenizer):
         s = s.replace(amp, " and ")
         return s

-    def tokenize(self, text: str) -> t.Iterable[str]:
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
         # Fix HTML character entities:
         s = self.__html2string(text)
         # Tokenize:

@@ -187,8 +187,8 @@ class PottsTokenizer(BaseTokenizer):


 class PottsTokenizerWithNegation(PottsTokenizer):
-    def tokenize(self, text: str) -> t.Iterable[str]:
-        words = super().tokenize(text)
+    def tokenize_builtins(self, text: str) -> t.Iterable[str]:
+        words = super().tokenize_builtins(text)
         nltk.sentiment.util.mark_negation(words, shallow=True)
         return words