Mirror of https://github.com/Steffo99/unimore-bda-6.git

fix and patch things

Author: Steffo, 2023-02-11 04:32:17 +01:00
Commit: 7778c648c1 (parent: ade3a6bdc7)
Signed by: steffo (GPG key ID: 2A24051445686895)
10 changed files with 87 additions and 67 deletions

View file

@@ -1,16 +1,18 @@
 <component name="ProjectRunConfigurationManager">
   <configuration default="false" name="unimore_bda_6" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
     <module name="unimore-bda-6" />
-    <option name="INTERPRETER_OPTIONS" value="-O" />
+    <option name="INTERPRETER_OPTIONS" value="" />
     <option name="PARENT_ENVS" value="true" />
     <envs>
       <env name="CONFIRM_OVERWRITE" value="False" />
       <env name="NLTK_DATA" value="./data/nltk" />
       <env name="PYTHONUNBUFFERED" value="1" />
-      <env name="TENSORFLOW_EPOCHS" value="4" />
       <env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
-      <env name="WORKING_SET_SIZE" value="1000000" />
+      <env name="WORKING_SET_SIZE" value="100000" />
       <env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
+      <env name="TRAINING_SET_SIZE" value="100" />
+      <env name="VALIDATION_SET_SIZE" value="25" />
+      <env name="EVALUATION_SET_SIZE" value="100" />
     </envs>
     <option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
     <option name="SDK_NAME" value="Poetry (unimore-bda-6)" />

View file

@@ -8,7 +8,7 @@ from .config import config
 from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
 from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
 from .analysis.base import TrainingFailedError
-from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
+from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
 from .gathering import Caches

 log = logging.getLogger(__name__)
@@ -38,20 +38,21 @@ def main():
     slog.debug("Selected sample_func: %s", sample_func.__name__)

     for SentimentAnalyzer in [
-        NLTKSentimentAnalyzer,
         TensorflowPolarSentimentAnalyzer,
         TensorflowCategorySentimentAnalyzer,
+        NLTKSentimentAnalyzer,
     ]:
         slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
         slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)

         for Tokenizer in [
+            PottsTokenizerWithNegation,
+            PottsTokenizer,
+            HuggingBertTokenizer,
             PlainTokenizer,
             LowercaseTokenizer,
             NLTKWordTokenizer,
-            PottsTokenizer,
-            PottsTokenizerWithNegation,
         ]:
             slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")

View file

@@ -107,7 +107,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
             raise NotTrainedError()

         # Tokenize the input
-        tokens = self.tokenizer.tokenize_plain(text)
+        tokens = self.tokenizer.tokenize_and_split_plain(text)
         # Run the classification method
         return self.model.classify(instance=tokens)
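This pairs with the tokenizer changes further down: every `tokenize_plain` now returns a single space-separated string, so the NLTK classifier gets its token list back through the new `tokenize_and_split_plain`. A minimal sketch of the distinction between the two calls, assuming the package is importable as `unimore_bda_6` and that `PlainTokenizer` passes text through unchanged:

    from unimore_bda_6.tokenizer import PlainTokenizer

    tokenizer = PlainTokenizer()
    tokenizer.tokenize_plain("good but pricey")            # -> "good but pricey" (one string)
    tokenizer.tokenize_and_split_plain("good but pricey")  # -> ["good", "but", "pricey"] (token list)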

View file

@@ -189,9 +189,11 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
                 input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
                 output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
             ),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.GlobalAveragePooling1D(),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(8),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.Dense(5, activation="softmax"),
         ])
@@ -240,18 +242,20 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
                 input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
                 output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
             ),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.GlobalAveragePooling1D(),
-            tensorflow.keras.layers.Dropout(0.10),
-            tensorflow.keras.layers.Dense(1),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(8),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(1, activation="relu"),
         ])

         log.debug("Compiling model: %s", model)
         model.compile(
-            optimizer=tensorflow.keras.optimizers.Adadelta(global_clipnorm=1.0),
-            loss=tensorflow.keras.losses.MeanSquaredError(),
+            optimizer=tensorflow.keras.optimizers.Adam(clipnorm=2.0),
+            loss=tensorflow.keras.losses.MeanAbsoluteError(),
             metrics=[
-                tensorflow.keras.metrics.MeanAbsoluteError(),
+                # tensorflow.keras.metrics.MeanAbsoluteError(),
             ]
         )
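Taken together, the polar model now mirrors the category one: heavier dropout (0.25), an extra `Dense(8)` hidden layer, an explicit `relu` activation on the single output unit, `Adam` with gradient-norm clipping instead of `Adadelta`, and mean absolute error as the loss. A consolidated sketch of the resulting network, with illustrative dimensions standing in for the `TENSORFLOW_MAX_FEATURES` and `TENSORFLOW_EMBEDDING_SIZE` config values:

    import tensorflow

    model = tensorflow.keras.Sequential([
        tensorflow.keras.layers.Embedding(input_dim=20001, output_dim=12),  # illustrative sizes
        tensorflow.keras.layers.Dropout(0.25),
        tensorflow.keras.layers.GlobalAveragePooling1D(),
        tensorflow.keras.layers.Dropout(0.25),
        tensorflow.keras.layers.Dense(8),                     # new hidden layer
        tensorflow.keras.layers.Dropout(0.25),
        tensorflow.keras.layers.Dense(1, activation="relu"),  # single regression output
    ])
    model.compile(
        optimizer=tensorflow.keras.optimizers.Adam(clipnorm=2.0),  # clip each gradient tensor's norm at 2.0
        loss=tensorflow.keras.losses.MeanAbsoluteError(),
    )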
@@ -259,7 +263,7 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
         return model

     def _translate_prediction(self, a: numpy.array) -> Category:
-        return (a[0, 0] + 0.5) * 5
+        return 1 + (a[0, 0] + 0.5) * 4

 __all__ = (

View file

@@ -109,10 +109,10 @@ def TENSORFLOW_EMBEDDING_SIZE(val: str | None) -> int:
     """
     The size of the embeddings tensor to use in Tensorflow models.
-    Defaults to `6`.
+    Defaults to `12`.
     """
     if val is None:
-        return 6
+        return 12
     try:
         return int(val)
     except ValueError:
@@ -124,10 +124,10 @@ def TENSORFLOW_EPOCHS(val: str | None) -> int:
     """
     The number of epochs to train Tensorflow models for.
-    Defaults to `12`.
+    Defaults to `5`.
     """
     if val is None:
-        return 12
+        return 5
     try:
         return int(val)
     except ValueError:

View file

@@ -41,7 +41,7 @@ class Review:
         return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)

     def to_tensor_normvalue(self) -> tensorflow.Tensor:
-        return tensorflow.convert_to_tensor([self.category / 5 - 0.5], dtype=tensorflow.float32)
+        return tensorflow.convert_to_tensor([(self.category - 1) / 4 - 0.5], dtype=tensorflow.float32)

     def to_tensor_tuple_normvalue(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
         return (
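This is the counterpart of the `_translate_prediction` change above: a 1-to-5 star rating is now normalized onto the full, zero-centered interval [-0.5, 0.5] (the old `self.category / 5 - 0.5` only covered [-0.3, 0.5]), and the two formulas stay exact inverses of each other. A quick round-trip check of the new pair:

    def normvalue(category: float) -> float:
        # Review.to_tensor_normvalue: 1..5 stars -> [-0.5, 0.5]
        return (category - 1) / 4 - 0.5

    def translate_prediction(a: float) -> float:
        # TensorflowPolarSentimentAnalyzer._translate_prediction: [-0.5, 0.5] -> 1..5 stars
        return 1 + (a + 0.5) * 4

    assert normvalue(3.0) == 0.0  # a neutral review now sits exactly at the center
    assert all(translate_prediction(normvalue(c)) == c for c in (1.0, 2.0, 3.0, 4.0, 5.0))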

View file

@@ -7,3 +7,4 @@ from .nltk_word_tokenize import *
 from .potts import *
 from .plain import *
 from .lower import *
+from .hugging import *

View file

@@ -31,16 +31,18 @@ class BaseTokenizer:
         """
         Run `.tokenize_plain`, then split the result using `str.split`.
         """
-        return self.tokenize_plain(text).split()
+        text = self.tokenize_plain(text)
+        text = text.split()
+        return text

     @__not_implemented
-    def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
         """
         Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
         """
         raise NotImplementedError()

-    def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+    def tokenize_tensorflow_and_expand_dims(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
         """
         Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
         """

View file

@@ -0,0 +1,26 @@
+import abc
+import tokenizers
+
+from .base import BaseTokenizer
+
+
+class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
+    def __init__(self):
+        super().__init__()
+        self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()
+
+    def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
+        raise NotImplementedError()
+
+    def tokenize_plain(self, text: str) -> str:
+        return " ".join(self.hug.encode(text).tokens)
+
+
+class HuggingBertTokenizer(HuggingTokenizer):
+    def _build_hugging_tokenizer(self):
+        return tokenizers.Tokenizer.from_pretrained("bert-base-cased")
+
+
+__all__ = (
+    "HuggingBertTokenizer",
+)
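For reference, a usage sketch of the new tokenizer. It needs the `tokenizers` package and downloads the `bert-base-cased` vocabulary on first use; the exact WordPiece output in the comment is illustrative, not guaranteed:

    from unimore_bda_6.tokenizer import HuggingBertTokenizer

    tokenizer = HuggingBertTokenizer()
    # BERT WordPiece subwords joined back into one space-separated string,
    # e.g. something like: "[CLS] This token ##izer handles sub ##words [SEP]"
    print(tokenizer.tokenize_plain("This tokenizer handles subwords"))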

View file

@@ -16,78 +16,62 @@ class PottsTokenizer(BaseTokenizer):
     # noinspection RegExpRepeatedSpace
     # language=pythonregexp
-    emoticon_re_string = r"""
-        [<>]?
-        [:;=8]                # eyes
-        [\-o*']?              # optional nose
-        [)\](\[dDpP/:}{@|\\]  # mouth
-        |
-        [)\](\[dDpP/:}{@|\\]  # mouth
-        [\-o*']?              # optional nose
-        [:;=8]                # eyes
-        [<>]?
-    """
-    emoticon_re = re.compile(emoticon_re_string, re.VERBOSE | re.I)
+    emoticon_re_string = r"""[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@|\\]"""
+    emoticon_re = re.compile(emoticon_re_string)

-    # noinspection RegExpRepeatedSpace,RegExpUnnecessaryNonCapturingGroup
-    # language=pythonregexp
-    words_re_string = (
+    words_re_string = "(" + "|".join([
         # Emoticons:
         emoticon_re_string
         ,
         # Phone numbers:
-        r"""
-        (?:  # (international)
-            \+?[01]
-            [\-\s.]*
-        )?
-        (?:  # (area code)
-            [(]?
-            \d{3}
-            [\-\s.)]*
-        )?
-        \d{3}  # exchange
-        [\-\s.]*
-        \d{4}  # base
-        """
+        # language=pythonregexp
+        r"""(?:[+]?[01][\s.-]*)?(?:[(]?\d{3}[\s.)-]*)?\d{3}[\-\s.]*\d{4}"""
         ,
         # HTML tags:
+        # language=pythonregexp
         r"""<[^>]+>"""
         ,
         # Twitter username:
+        # language=pythonregexp
         r"""@[\w_]+"""
         ,
         # Twitter hashtags:
-        r"""#+[\w_]+[\w'_\-]*[\w_]+"""
+        # language=pythonregexp
+        r"""#+[\w_]+[\w'_-]*[\w_]+"""
         ,
         # Words with apostrophes or dashes
-        r"""[a-z][a-z'\-_]+[a-z]"""
+        # language=pythonregexp
+        r"""[a-z][a-z'_-]+[a-z]"""
         ,
         # Numbers, including fractions, decimals
-        r"""[+\-]?\d+[,/.:-]\d+[+\-]?"""
+        # language=pythonregexp
+        r"""[+-]?\d+(?:[,/.:-]\d+)?"""
         ,
         # Words without apostrophes or dashes
+        # language=pythonregexp
         r"""[\w_]+"""
         ,
         # Ellipsis dots
-        r"""\.(?:\s*\.)+"""
+        # language=pythonregexp
+        r"""[.](?:\s*[.])+"""
         ,
         # Everything else that isn't whitespace
-        r"""(?:\S)"""
-    )
-    words_re = re.compile("|".join(words_re_string), re.VERBOSE | re.I)
+        # language=pythonregexp
+        r"""\S+"""
+    ]) + ")"
+    words_re = re.compile(words_re_string, re.I)

     # language=pythonregexp
     digit_re_string = r"&#\d+;"
-    digit_re = re.compile(digit_re_string, re.VERBOSE)
+    digit_re = re.compile(digit_re_string)

     # language=pythonregexp
     alpha_re_string = r"&\w+;"
-    alpha_re = re.compile(alpha_re_string, re.VERBOSE)
+    alpha_re = re.compile(alpha_re_string)

     amp = "&amp;"
@@ -118,7 +102,7 @@ class PottsTokenizer(BaseTokenizer):
         s = s.replace(cls.amp, " and ")
         return s

-    def tokenize_plain(self, text: str) -> t.Iterable[str]:
+    def tokenize_plain(self, text: str) -> str:
         # Fix HTML character entitites
         s = self.__html2string(text)
         # Tokenize
@@ -132,10 +116,10 @@ class PottsTokenizer(BaseTokenizer):

 class PottsTokenizerWithNegation(PottsTokenizer):
-    def tokenize_plain(self, text: str) -> t.Iterable[str]:
-        words = super().tokenize_plain(text)
+    def tokenize_plain(self, text: str) -> str:
+        words = super().tokenize_plain(text).split()
         nltk.sentiment.util.mark_negation(words, shallow=True)
-        return words
+        return " ".join(words)

 __all__ = (
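The negation variant now honors the same string-in, string-out contract: it splits the parent tokenizer's space-separated output, has `nltk.sentiment.util.mark_negation` append a `_NEG` suffix in place (with `shallow=True` the list is mutated directly) to tokens that follow a negation word until the next clause punctuation, and joins the result back into one string. Roughly:

    import nltk.sentiment.util

    words = ["i", "do", "n't", "like", "this", "."]
    nltk.sentiment.util.mark_negation(words, shallow=True)
    print(" ".join(words))  # expected: i do n't like_NEG this_NEG .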