mirror of https://github.com/Steffo99/unimore-bda-6.git

fix and patch things

Steffo 2023-02-11 04:32:17 +01:00
parent ade3a6bdc7
commit 7778c648c1
Signed by: steffo
GPG key ID: 2A24051445686895
10 changed files with 87 additions and 67 deletions

View file

@@ -1,16 +1,18 @@
 <component name="ProjectRunConfigurationManager">
   <configuration default="false" name="unimore_bda_6" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
     <module name="unimore-bda-6" />
-    <option name="INTERPRETER_OPTIONS" value="-O" />
+    <option name="INTERPRETER_OPTIONS" value="" />
     <option name="PARENT_ENVS" value="true" />
     <envs>
       <env name="CONFIRM_OVERWRITE" value="False" />
       <env name="NLTK_DATA" value="./data/nltk" />
       <env name="PYTHONUNBUFFERED" value="1" />
-      <env name="TENSORFLOW_EPOCHS" value="4" />
       <env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
-      <env name="WORKING_SET_SIZE" value="1000000" />
+      <env name="WORKING_SET_SIZE" value="100000" />
       <env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
+      <env name="TRAINING_SET_SIZE" value="100" />
+      <env name="VALIDATION_SET_SIZE" value="25" />
+      <env name="EVALUATION_SET_SIZE" value="100" />
     </envs>
     <option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
     <option name="SDK_NAME" value="Poetry (unimore-bda-6)" />

View file

@@ -8,7 +8,7 @@ from .config import config
 from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
 from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
 from .analysis.base import TrainingFailedError
-from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
+from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
 from .gathering import Caches
 
 log = logging.getLogger(__name__)
@@ -38,20 +38,21 @@ def main():
         slog.debug("Selected sample_func: %s", sample_func.__name__)
 
         for SentimentAnalyzer in [
-            NLTKSentimentAnalyzer,
             TensorflowPolarSentimentAnalyzer,
             TensorflowCategorySentimentAnalyzer,
+            NLTKSentimentAnalyzer,
         ]:
 
             slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
             slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
 
             for Tokenizer in [
-                PottsTokenizerWithNegation,
-                PottsTokenizer,
+                HuggingBertTokenizer,
                 PlainTokenizer,
                 LowercaseTokenizer,
                 NLTKWordTokenizer,
+                PottsTokenizer,
+                PottsTokenizerWithNegation,
             ]:
 
                 slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")

View file

@@ -107,7 +107,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
             raise NotTrainedError()
 
         # Tokenize the input
-        tokens = self.tokenizer.tokenize_plain(text)
+        tokens = self.tokenizer.tokenize_and_split_plain(text)
 
         # Run the classification method
         return self.model.classify(instance=tokens)
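Note: this swap matters because NLTK's classifier consumes a sequence of tokens, while `tokenize_plain` now returns a single space-separated string (see the tokenizer changes below). A minimal sketch of the difference, assuming PlainTokenizer passes text through unchanged:

    tok = PlainTokenizer()
    tok.tokenize_plain("I love it :)")            # "I love it :)", a single string
    tok.tokenize_and_split_plain("I love it :)")  # ["I", "love", "it", ":)"], a token list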

View file

@@ -189,9 +189,11 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
                 input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
                 output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
             ),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.GlobalAveragePooling1D(),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(8),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.Dense(5, activation="softmax"),
         ])
@@ -240,18 +242,20 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
                 input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
                 output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
             ),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.GlobalAveragePooling1D(),
-            tensorflow.keras.layers.Dropout(0.10),
-            tensorflow.keras.layers.Dense(1),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(8),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(1, activation="relu"),
         ])
 
         log.debug("Compiling model: %s", model)
         model.compile(
-            optimizer=tensorflow.keras.optimizers.Adadelta(global_clipnorm=1.0),
-            loss=tensorflow.keras.losses.MeanSquaredError(),
+            optimizer=tensorflow.keras.optimizers.Adam(clipnorm=2.0),
+            loss=tensorflow.keras.losses.MeanAbsoluteError(),
             metrics=[
-                tensorflow.keras.metrics.MeanAbsoluteError(),
+                # tensorflow.keras.metrics.MeanAbsoluteError(),
             ]
         )
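The compile step swaps Adadelta with global-norm clipping for Adam with per-variable clipping, and mean squared error for mean absolute error. A standalone sketch of the new compile call; the one-layer model here is a placeholder, not the repo's architecture:

    import tensorflow

    model = tensorflow.keras.Sequential([tensorflow.keras.layers.Dense(1, activation="relu")])
    model.compile(
        optimizer=tensorflow.keras.optimizers.Adam(clipnorm=2.0),  # clipnorm caps each variable's gradient norm separately
        loss=tensorflow.keras.losses.MeanAbsoluteError(),
    )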
@@ -259,7 +263,7 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
         return model
 
     def _translate_prediction(self, a: numpy.array) -> Category:
-        return (a[0, 0] + 0.5) * 5
+        return 1 + (a[0, 0] + 0.5) * 4
 
 
 __all__ = (

View file

@@ -109,10 +109,10 @@ def TENSORFLOW_EMBEDDING_SIZE(val: str | None) -> int:
     """
     The size of the embeddings tensor to use in Tensorflow models.
 
-    Defaults to `6`.
+    Defaults to `12`.
     """
     if val is None:
-        return 6
+        return 12
     try:
         return int(val)
     except ValueError:
@@ -124,10 +124,10 @@ def TENSORFLOW_EPOCHS(val: str | None) -> int:
     """
     The number of epochs to train Tensorflow models for.
 
-    Defaults to `12`.
+    Defaults to `5`.
    """
     if val is None:
-        return 12
+        return 5
     try:
         return int(val)
     except ValueError:
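Both defaults follow the same option pattern: a function that receives the raw environment value, or None when the variable is unset, and returns the parsed setting. A sketch of the full shape of such an option; the body of the `except` branch is cut off by the diff, so the error raised below is an assumption:

    def TENSORFLOW_EPOCHS(val: str | None) -> int:
        """
        The number of epochs to train Tensorflow models for.

        Defaults to `5`.
        """
        if val is None:
            return 5
        try:
            return int(val)
        except ValueError:
            raise ValueError("Not an integer.")  # assumed; the diff does not show this line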

View file

@@ -41,7 +41,7 @@ class Review:
         return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
 
     def to_tensor_normvalue(self) -> tensorflow.Tensor:
-        return tensorflow.convert_to_tensor([self.category / 5 - 0.5], dtype=tensorflow.float32)
+        return tensorflow.convert_to_tensor([(self.category - 1) / 4 - 0.5], dtype=tensorflow.float32)
 
     def to_tensor_tuple_normvalue(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
         return (
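This pairs with the new `_translate_prediction` above: stars 1 through 5 now map onto the symmetric interval [-0.5, +0.5] and decode back exactly, where the old `category / 5 - 0.5` encoding covered the asymmetric range [-0.3, +0.5]. A quick self-contained check:

    def encode(stars: int) -> float:
        return (stars - 1) / 4 - 0.5  # new to_tensor_normvalue mapping, 1..5 -> -0.5..+0.5

    def decode(norm: float) -> float:
        return 1 + (norm + 0.5) * 4   # new _translate_prediction mapping, the exact inverse

    assert all(decode(encode(s)) == s for s in range(1, 6))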

View file

@@ -7,3 +7,4 @@ from .nltk_word_tokenize import *
 from .potts import *
 from .plain import *
 from .lower import *
+from .hugging import *

View file

@@ -31,16 +31,18 @@ class BaseTokenizer:
         """
         Run `.tokenize_plain`, then split the result using `str.split`.
         """
-        return self.tokenize_plain(text).split()
+        text = self.tokenize_plain(text)
+        text = text.split()
+        return text
 
     @__not_implemented
-    def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
         """
         Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
         """
         raise NotImplementedError()
 
-    def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+    def tokenize_tensorflow_and_expand_dims(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
         """
         Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
         """

View file

@@ -0,0 +1,26 @@
+import abc
+import tokenizers
+
+from .base import BaseTokenizer
+
+
+class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
+    def __init__(self):
+        super().__init__()
+        self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()
+
+    def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
+        raise NotImplementedError()
+
+    def tokenize_plain(self, text: str) -> str:
+        return " ".join(self.hug.encode(text).tokens)
+
+
+class HuggingBertTokenizer(HuggingTokenizer):
+    def _build_hugging_tokenizer(self):
+        return tokenizers.Tokenizer.from_pretrained("bert-base-cased")
+
+
+__all__ = (
+    "HuggingBertTokenizer",
+)
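A usage sketch for the new tokenizer; instantiating it downloads the `bert-base-cased` vocabulary from the Hugging Face Hub, and the WordPiece output shown is illustrative rather than verified:

    tok = HuggingBertTokenizer()
    tok.tokenize_plain("Reviewing playthings")
    # e.g. "[CLS] Reviewing play ##things [SEP]", special tokens and subword pieces joined by spaces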

View file

@@ -16,78 +16,62 @@ class PottsTokenizer(BaseTokenizer):
-    # noinspection RegExpRepeatedSpace
     # language=pythonregexp
-    emoticon_re_string = r"""
-        [<>]?
-        [:;=8]                     # eyes
-        [\-o*']?                   # optional nose
-        [)\](\[dDpP/:}{@|\\]       # mouth
-        |
-        [)\](\[dDpP/:}{@|\\]       # mouth
-        [\-o*']?                   # optional nose
-        [:;=8]                     # eyes
-        [<>]?
-    """
+    emoticon_re_string = r"""[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@|\\]"""
 
-    emoticon_re = re.compile(emoticon_re_string, re.VERBOSE | re.I)
+    emoticon_re = re.compile(emoticon_re_string)
 
     # noinspection RegExpRepeatedSpace,RegExpUnnecessaryNonCapturingGroup
-    # language=pythonregexp
-    words_re_string = (
+    words_re_string = "(" + "|".join([
         # Emoticons:
         emoticon_re_string
         ,
         # Phone numbers:
-        r"""
-        (?:            # (international)
-            \+?[01]
-            [\-\s.]*
-        )?
-        (?:            # (area code)
-            [(]?
-            \d{3}
-            [\-\s.)]*
-        )?
-        \d{3}          # exchange
-        [\-\s.]*
-        \d{4}          # base
-        """
+        # language=pythonregexp
+        r"""(?:[+]?[01][\s.-]*)?(?:[(]?\d{3}[\s.)-]*)?\d{3}[\-\s.]*\d{4}"""
         ,
         # HTML tags:
+        # language=pythonregexp
        r"""<[^>]+>"""
         ,
         # Twitter username:
+        # language=pythonregexp
         r"""@[\w_]+"""
         ,
         # Twitter hashtags:
-        r"""#+[\w_]+[\w'_\-]*[\w_]+"""
+        # language=pythonregexp
+        r"""#+[\w_]+[\w'_-]*[\w_]+"""
         ,
         # Words with apostrophes or dashes
-        r"""[a-z][a-z'\-_]+[a-z]"""
+        # language=pythonregexp
+        r"""[a-z][a-z'_-]+[a-z]"""
         ,
         # Numbers, including fractions, decimals
-        r"""[+\-]?\d+[,/.:-]\d+[+\-]?"""
+        # language=pythonregexp
+        r"""[+-]?\d+(?:[,/.:-]\d+)?"""
         ,
         # Words without apostrophes or dashes
+        # language=pythonregexp
         r"""[\w_]+"""
         ,
         # Ellipsis dots
-        r"""\.(?:\s*\.)+"""
+        # language=pythonregexp
+        r"""[.](?:\s*[.])+"""
         ,
         # Everything else that isn't whitespace
-        r"""(?:\S)"""
-    )
+        # language=pythonregexp
+        r"""\S+"""
+    ]) + ")"
 
-    words_re = re.compile("|".join(words_re_string), re.VERBOSE | re.I)
+    words_re = re.compile(words_re_string, re.I)
 
     # language=pythonregexp
     digit_re_string = r"&#\d+;"
 
-    digit_re = re.compile(digit_re_string, re.VERBOSE)
+    digit_re = re.compile(digit_re_string)
 
     # language=pythonregexp
     alpha_re_string = r"&\w+;"
 
-    alpha_re = re.compile(alpha_re_string, re.VERBOSE)
+    alpha_re = re.compile(alpha_re_string)
 
     amp = "&amp;"
@@ -118,7 +102,7 @@ class PottsTokenizer(BaseTokenizer):
         s = s.replace(cls.amp, " and ")
         return s
 
-    def tokenize_plain(self, text: str) -> t.Iterable[str]:
+    def tokenize_plain(self, text: str) -> str:
         # Fix HTML character entities
         s = self.__html2string(text)
         # Tokenize
@@ -132,10 +116,10 @@ class PottsTokenizer(BaseTokenizer):
 
 
 class PottsTokenizerWithNegation(PottsTokenizer):
-    def tokenize_plain(self, text: str) -> t.Iterable[str]:
-        words = super().tokenize_plain(text)
+    def tokenize_plain(self, text: str) -> str:
+        words = super().tokenize_plain(text).split()
         nltk.sentiment.util.mark_negation(words, shallow=True)
-        return words
+        return " ".join(words)
 
 
 __all__ = (
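Two behavioural consequences of the regex rewrite, sketched below: the compact emoticon pattern drops the old mouth-first alternative, and the Numbers branch now also accepts bare, optionally signed integers, where the old pattern required a separator and a second digit group:

    import re

    emoticon_re = re.compile(r"""[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@|\\]""")
    assert emoticon_re.fullmatch(":-)")
    assert emoticon_re.fullmatch("(-:") is None  # the old verbose pattern also matched this reversed form

    number_re = re.compile(r"""[+-]?\d+(?:[,/.:-]\d+)?""")
    assert number_re.fullmatch("3.14")
    assert number_re.fullmatch("+42")            # no match under the old r"""[+\-]?\d+[,/.:-]\d+[+\-]?"""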