Mirror of https://github.com/Steffo99/unimore-bda-6.git
Commit 7778c648c1 ("fix and patch things")
Parent: ade3a6bdc7
10 changed files with 87 additions and 67 deletions
@@ -1,16 +1,18 @@
 <component name="ProjectRunConfigurationManager">
   <configuration default="false" name="unimore_bda_6" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
     <module name="unimore-bda-6" />
-    <option name="INTERPRETER_OPTIONS" value="-O" />
+    <option name="INTERPRETER_OPTIONS" value="" />
     <option name="PARENT_ENVS" value="true" />
     <envs>
       <env name="CONFIRM_OVERWRITE" value="False" />
       <env name="NLTK_DATA" value="./data/nltk" />
       <env name="PYTHONUNBUFFERED" value="1" />
-      <env name="TENSORFLOW_EPOCHS" value="4" />
       <env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
-      <env name="WORKING_SET_SIZE" value="1000000" />
+      <env name="WORKING_SET_SIZE" value="100000" />
       <env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
+      <env name="TRAINING_SET_SIZE" value="100" />
+      <env name="VALIDATION_SET_SIZE" value="25" />
+      <env name="EVALUATION_SET_SIZE" value="100" />
     </envs>
     <option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
     <option name="SDK_NAME" value="Poetry (unimore-bda-6)" />
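Note: besides re-tuning sizes (WORKING_SET_SIZE shrinks tenfold), this hunk drops the `-O` interpreter flag, so `assert` statements run again, removes the TENSORFLOW_EPOCHS override in favor of the new config default, and introduces explicit per-split sizes. A minimal sketch of reading these variables, assuming plain `os.environ` access (the project's real config layer may differ):

    import os

    # Hedged sketch: read the run configuration's variables with the
    # defaults set above; the project's actual config machinery may differ.
    WORKING_SET_SIZE = int(os.environ.get("WORKING_SET_SIZE", "100000"))
    TRAINING_SET_SIZE = int(os.environ.get("TRAINING_SET_SIZE", "100"))
    VALIDATION_SET_SIZE = int(os.environ.get("VALIDATION_SET_SIZE", "25"))
    EVALUATION_SET_SIZE = int(os.environ.get("EVALUATION_SET_SIZE", "100"))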
@@ -8,7 +8,7 @@ from .config import config
 from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
 from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
 from .analysis.base import TrainingFailedError
-from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
+from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
 from .gathering import Caches
 
 log = logging.getLogger(__name__)
@@ -38,20 +38,21 @@ def main():
     slog.debug("Selected sample_func: %s", sample_func.__name__)
 
     for SentimentAnalyzer in [
+        NLTKSentimentAnalyzer,
         TensorflowPolarSentimentAnalyzer,
         TensorflowCategorySentimentAnalyzer,
-        NLTKSentimentAnalyzer,
     ]:
 
         slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
         slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
 
         for Tokenizer in [
+            PottsTokenizerWithNegation,
+            PottsTokenizer,
+            HuggingBertTokenizer,
             PlainTokenizer,
             LowercaseTokenizer,
             NLTKWordTokenizer,
-            PottsTokenizer,
-            PottsTokenizerWithNegation,
         ]:
 
             slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
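Note: the commit reorders both grids: NLTKSentimentAnalyzer now runs first, and the Potts and new HuggingBert tokenizers are tried before the plain ones. The nested loops walk the cartesian product of the two lists; an equivalent formulation, as a sketch only (class names taken from the imports above):

    from itertools import product

    from unimore_bda_6.analysis import NLTKSentimentAnalyzer, TensorflowPolarSentimentAnalyzer, TensorflowCategorySentimentAnalyzer
    from unimore_bda_6.tokenizer import (PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer,
                                         PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer)

    # Same traversal as the two nested loops, in the order set by this commit.
    analyzers = [NLTKSentimentAnalyzer, TensorflowPolarSentimentAnalyzer, TensorflowCategorySentimentAnalyzer]
    tokenizers = [PottsTokenizerWithNegation, PottsTokenizer, HuggingBertTokenizer,
                  PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer]
    for SentimentAnalyzer, Tokenizer in product(analyzers, tokenizers):
        ...  # train and evaluate each (analyzer, tokenizer) combination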
@@ -107,7 +107,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
             raise NotTrainedError()
 
         # Tokenize the input
-        tokens = self.tokenizer.tokenize_plain(text)
+        tokens = self.tokenizer.tokenize_and_split_plain(text)
 
         # Run the classification method
         return self.model.classify(instance=tokens)
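Note: every tokenizer's tokenize_plain now returns a single space-separated string, so the NLTK classifier switches to the new tokenize_and_split_plain helper to get a token list back. An illustrative sketch of the contract (the exact tokens are an assumption, not taken from the repository):

    from unimore_bda_6.tokenizer import PottsTokenizer

    tokenizer = PottsTokenizer()
    s = tokenizer.tokenize_plain("I loved it!")                  # e.g. "I loved it !"  (illustrative)
    tokens = tokenizer.tokenize_and_split_plain("I loved it!")   # e.g. ["I", "loved", "it", "!"]
    assert tokens == s.split()                                   # holds by construction (see base.py below)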
@@ -189,9 +189,11 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
                 input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
                 output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
             ),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.GlobalAveragePooling1D(),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(8),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.Dense(5, activation="softmax"),
         ])
 
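Note: the category model gains a Dense(8) bottleneck before the 5-way softmax and raises every dropout from 0.10 to 0.25. Assembled as a standalone sketch; MAX_FEATURES is a placeholder for the TENSORFLOW_MAX_FEATURES config value, which this diff does not show:

    import tensorflow

    MAX_FEATURES = 300_000   # placeholder only; the real value comes from TENSORFLOW_MAX_FEATURES
    EMBEDDING_SIZE = 12      # the new TENSORFLOW_EMBEDDING_SIZE default (see the config hunk below)

    # Sketch of the category model after this commit: embed, pool, dense
    # bottleneck, then a softmax over the 5 star-rating classes.
    model = tensorflow.keras.Sequential([
        tensorflow.keras.layers.Embedding(input_dim=MAX_FEATURES + 1, output_dim=EMBEDDING_SIZE),
        tensorflow.keras.layers.Dropout(0.25),
        tensorflow.keras.layers.GlobalAveragePooling1D(),
        tensorflow.keras.layers.Dropout(0.25),
        tensorflow.keras.layers.Dense(8),
        tensorflow.keras.layers.Dropout(0.25),
        tensorflow.keras.layers.Dense(5, activation="softmax"),
    ])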
@@ -240,18 +242,20 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
                 input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
                 output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
             ),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.GlobalAveragePooling1D(),
-            tensorflow.keras.layers.Dropout(0.10),
-            tensorflow.keras.layers.Dense(1),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(8),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(1, activation="relu"),
         ])
 
         log.debug("Compiling model: %s", model)
         model.compile(
-            optimizer=tensorflow.keras.optimizers.Adadelta(global_clipnorm=1.0),
-            loss=tensorflow.keras.losses.MeanSquaredError(),
+            optimizer=tensorflow.keras.optimizers.Adam(clipnorm=2.0),
+            loss=tensorflow.keras.losses.MeanAbsoluteError(),
             metrics=[
-                tensorflow.keras.metrics.MeanAbsoluteError(),
+                # tensorflow.keras.metrics.MeanAbsoluteError(),
             ]
         )
 
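Note: the polar model mirrors the category architecture but ends in a single relu unit, and training switches from Adadelta with MSE to Adam with MAE; the MeanAbsoluteError metric is commented out because it would just repeat the loss. The new compile step as a sketch (`model` stands in for the polar model above):

    import tensorflow

    model = tensorflow.keras.Sequential([tensorflow.keras.layers.Dense(1, activation="relu")])
    model.compile(
        optimizer=tensorflow.keras.optimizers.Adam(clipnorm=2.0),  # was Adadelta(global_clipnorm=1.0)
        loss=tensorflow.keras.losses.MeanAbsoluteError(),          # was MeanSquaredError
    )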
@@ -259,7 +263,7 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
         return model
 
     def _translate_prediction(self, a: numpy.array) -> Category:
-        return (a[0, 0] + 0.5) * 5
+        return 1 + (a[0, 0] + 0.5) * 4
 
 
 __all__ = (
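Note: with targets normalized to [-0.5, 0.5], the old translation (a + 0.5) * 5 spans [0, 5]; the new 1 + (a + 0.5) * 4 spans [1, 5], matching the 1-to-5 star categories and exactly inverting the new encoding in the Review hunk below. Endpoint check (pure arithmetic):

    # Check the new output mapping at both ends of the normalized range.
    translate = lambda a: 1 + (a + 0.5) * 4
    assert translate(-0.5) == 1.0   # most negative normalized output -> 1 star
    assert translate(0.5) == 5.0    # most positive -> 5 stars
    # Observation, not documented intent: the relu output above is >= 0,
    # so translated predictions cannot actually fall below 3.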
@@ -109,10 +109,10 @@ def TENSORFLOW_EMBEDDING_SIZE(val: str | None) -> int:
     """
     The size of the embeddings tensor to use in Tensorflow models.
 
-    Defaults to `6`.
+    Defaults to `12`.
     """
     if val is None:
-        return 6
+        return 12
     try:
         return int(val)
     except ValueError:
@@ -124,10 +124,10 @@ def TENSORFLOW_EPOCHS(val: str | None) -> int:
     """
     The number of epochs to train Tensorflow models for.
 
-    Defaults to `12`.
+    Defaults to `5`.
     """
     if val is None:
-        return 12
+        return 5
     try:
         return int(val)
     except ValueError:
@@ -41,7 +41,7 @@ class Review:
         return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)
 
     def to_tensor_normvalue(self) -> tensorflow.Tensor:
-        return tensorflow.convert_to_tensor([self.category / 5 - 0.5], dtype=tensorflow.float32)
+        return tensorflow.convert_to_tensor([(self.category - 1) / 4 - 0.5], dtype=tensorflow.float32)
 
     def to_tensor_tuple_normvalue(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
         return (
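Note: to_tensor_normvalue is the encoding half of the same fix. The old self.category / 5 - 0.5 maps stars 1..5 onto [-0.3, 0.5], which is asymmetric and never reaches -0.5; the new (self.category - 1) / 4 - 0.5 maps 1..5 onto [-0.5, 0.5] exactly. Round-trip check against _translate_prediction above (pure arithmetic, exact in binary floating point):

    encode = lambda category: (category - 1) / 4 - 0.5   # Review.to_tensor_normvalue
    translate = lambda a: 1 + (a + 0.5) * 4              # _translate_prediction
    for stars in (1, 2, 3, 4, 5):
        assert translate(encode(stars)) == stars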
@@ -7,3 +7,4 @@ from .nltk_word_tokenize import *
 from .potts import *
 from .plain import *
 from .lower import *
+from .hugging import *
@@ -31,16 +31,18 @@ class BaseTokenizer:
         """
         Run `.tokenize_plain`, then split the result using `str.split`.
         """
-        return self.tokenize_plain(text).split()
+        text = self.tokenize_plain(text)
+        text = text.split()
+        return text
 
     @__not_implemented
-    def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
         """
         Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
         """
         raise NotImplementedError()
 
-    def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+    def tokenize_tensorflow_and_expand_dims(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
         """
         Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
         """
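Note: tokenize_and_split_plain is the old one-liner unrolled into three statements (easier to step through in a debugger), and the quoted "tensorflow.Tensor" annotations become real ones, which assumes tensorflow is imported when the module loads rather than only during type checking. Equivalence sketch (any BaseTokenizer subclass would do; PlainTokenizer is used for illustration):

    from unimore_bda_6.tokenizer import PlainTokenizer

    tok = PlainTokenizer()
    assert tok.tokenize_and_split_plain("some text") == tok.tokenize_plain("some text").split()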
unimore_bda_6/tokenizer/hugging.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+import abc
+import tokenizers
+
+from .base import BaseTokenizer
+
+
+class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
+    def __init__(self):
+        super().__init__()
+        self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()
+
+    def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
+        raise NotImplementedError()
+
+    def tokenize_plain(self, text: str) -> str:
+        return " ".join(self.hug.encode(text).tokens)
+
+
+class HuggingBertTokenizer(HuggingTokenizer):
+    def _build_hugging_tokenizer(self):
+        return tokenizers.Tokenizer.from_pretrained("bert-base-cased")
+
+
+__all__ = (
+    "HuggingBertTokenizer",
+)
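Note: the new HuggingTokenizer adapts a HuggingFace tokenizers.Tokenizer to the project's string-in/string-out interface by re-joining the encoded tokens with spaces. Usage sketch (instantiating it downloads the bert-base-cased vocabulary on first use; the exact WordPiece output is illustrative):

    from unimore_bda_6.tokenizer import HuggingBertTokenizer

    tok = HuggingBertTokenizer()
    print(tok.tokenize_plain("Great film, would rewatch!"))
    # e.g. "Great film , would re ##watch !"  (illustrative WordPiece tokens)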
@@ -16,78 +16,62 @@ class PottsTokenizer(BaseTokenizer):
 
     # noinspection RegExpRepeatedSpace
     # language=pythonregexp
-    emoticon_re_string = r"""
-        [<>]?
-        [:;=8]  # eyes
-        [\-o*']?  # optional nose
-        [)\](\[dDpP/:}{@|\\]  # mouth
-        |
-        [)\](\[dDpP/:}{@|\\]  # mouth
-        [\-o*']?  # optional nose
-        [:;=8]  # eyes
-        [<>]?
-    """
+    emoticon_re_string = r"""[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@|\\]"""
 
-    emoticon_re = re.compile(emoticon_re_string, re.VERBOSE | re.I)
+    emoticon_re = re.compile(emoticon_re_string)
 
-    # noinspection RegExpRepeatedSpace,RegExpUnnecessaryNonCapturingGroup
-    # language=pythonregexp
-    words_re_string = (
+    words_re_string = "(" + "|".join([
         # Emoticons:
         emoticon_re_string
         ,
         # Phone numbers:
-        r"""
-        (?:  # (international)
-            \+?[01]
-            [\-\s.]*
-        )?
-        (?:  # (area code)
-            [(]?
-            \d{3}
-            [\-\s.)]*
-        )?
-        \d{3}  # exchange
-        [\-\s.]*
-        \d{4}  # base
-        """
+        # language=pythonregexp
+        r"""(?:[+]?[01][\s.-]*)?(?:[(]?\d{3}[\s.)-]*)?\d{3}[\-\s.]*\d{4}"""
         ,
         # HTML tags:
+        # language=pythonregexp
         r"""<[^>]+>"""
         ,
         # Twitter username:
+        # language=pythonregexp
         r"""@[\w_]+"""
         ,
         # Twitter hashtags:
-        r"""#+[\w_]+[\w'_\-]*[\w_]+"""
+        # language=pythonregexp
+        r"""#+[\w_]+[\w'_-]*[\w_]+"""
         ,
         # Words with apostrophes or dashes
-        r"""[a-z][a-z'\-_]+[a-z]"""
+        # language=pythonregexp
+        r"""[a-z][a-z'_-]+[a-z]"""
         ,
         # Numbers, including fractions, decimals
-        r"""[+\-]?\d+[,/.:-]\d+[+\-]?"""
+        # language=pythonregexp
+        r"""[+-]?\d+(?:[,/.:-]\d+)?"""
         ,
         # Words without apostrophes or dashes
+        # language=pythonregexp
        r"""[\w_]+"""
         ,
         # Ellipsis dots
-        r"""\.(?:\s*\.)+"""
+        # language=pythonregexp
+        r"""[.](?:\s*[.])+"""
         ,
         # Everything else that isn't whitespace
-        r"""(?:\S)"""
-    )
+        # language=pythonregexp
+        r"""\S+"""
+    ]) + ")"
 
-    words_re = re.compile("|".join(words_re_string), re.VERBOSE | re.I)
+    words_re = re.compile(words_re_string, re.I)
 
     # language=pythonregexp
     digit_re_string = r"&#\d+;"
 
-    digit_re = re.compile(digit_re_string, re.VERBOSE)
+    digit_re = re.compile(digit_re_string)
 
     # language=pythonregexp
     alpha_re_string = r"&\w+;"
 
-    alpha_re = re.compile(alpha_re_string, re.VERBOSE)
+    alpha_re = re.compile(alpha_re_string)
 
     amp = "&"
 
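Note: every Potts regex collapses from a re.VERBOSE block into a single-line raw string, the word alternatives are now joined with "|".join([...]) at class-definition time instead of inside re.compile, and the VERBOSE flag is dropped everywhere. Two behavioral differences worth flagging, as observations on the diff: the one-line emoticon pattern keeps only the eyes-nose-mouth orientation (the reversed branch from the verbose version is gone), and the number alternative now matches plain integers itself (they previously fell through to the word pattern). Quick check:

    import re

    # The collapsed emoticon pattern, verbatim from the diff:
    emoticon_re = re.compile(r"""[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@|\\]""")
    assert emoticon_re.match(":-)")       # eyes-nose-mouth still matches
    assert not emoticon_re.match("(-:")   # the reversed orientation no longer does

    # The new number alternative makes the fractional/ratio part optional:
    number_re = re.compile(r"""[+-]?\d+(?:[,/.:-]\d+)?""")
    assert number_re.fullmatch("3.5") and number_re.fullmatch("42")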
@ -118,7 +102,7 @@ class PottsTokenizer(BaseTokenizer):
|
||||||
s = s.replace(cls.amp, " and ")
|
s = s.replace(cls.amp, " and ")
|
||||||
return s
|
return s
|
||||||
|
|
||||||
def tokenize_plain(self, text: str) -> t.Iterable[str]:
|
def tokenize_plain(self, text: str) -> str:
|
||||||
# Fix HTML character entitites
|
# Fix HTML character entitites
|
||||||
s = self.__html2string(text)
|
s = self.__html2string(text)
|
||||||
# Tokenize
|
# Tokenize
|
||||||
@@ -132,10 +116,10 @@ class PottsTokenizer(BaseTokenizer):
 
 
 class PottsTokenizerWithNegation(PottsTokenizer):
-    def tokenize_plain(self, text: str) -> t.Iterable[str]:
-        words = super().tokenize_plain(text)
+    def tokenize_plain(self, text: str) -> str:
+        words = super().tokenize_plain(text).split()
         nltk.sentiment.util.mark_negation(words, shallow=True)
-        return words
+        return " ".join(words)
 
 
 __all__ = (
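Note: PottsTokenizerWithNegation keeps the new string-in/string-out contract: it splits the parent's output, lets NLTK mark tokens inside a negation scope, and joins the result back. Illustrative run (the exact scope is decided by nltk.sentiment.util.mark_negation, so the output shown is an assumption):

    import nltk.sentiment.util

    words = ["i", "did", "not", "like", "this", "movie", "."]
    nltk.sentiment.util.mark_negation(words, shallow=True)  # appends _NEG in place
    print(" ".join(words))
    # e.g. "i did not like_NEG this_NEG movie_NEG ."  (illustrative)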