Mirror of https://github.com/Steffo99/unimore-bda-6.git (synced 2024-11-25)
fix and patch things
parent ade3a6bdc7
commit 7778c648c1
10 changed files with 87 additions and 67 deletions
@@ -1,16 +1,18 @@
 <component name="ProjectRunConfigurationManager">
   <configuration default="false" name="unimore_bda_6" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
     <module name="unimore-bda-6" />
-    <option name="INTERPRETER_OPTIONS" value="-O" />
+    <option name="INTERPRETER_OPTIONS" value="" />
     <option name="PARENT_ENVS" value="true" />
     <envs>
       <env name="CONFIRM_OVERWRITE" value="False" />
       <env name="NLTK_DATA" value="./data/nltk" />
       <env name="PYTHONUNBUFFERED" value="1" />
-      <env name="TENSORFLOW_EPOCHS" value="4" />
       <env name="TF_CPP_MIN_LOG_LEVEL" value="2" />
-      <env name="WORKING_SET_SIZE" value="1000000" />
+      <env name="WORKING_SET_SIZE" value="100000" />
       <env name="XLA_FLAGS" value="--xla_gpu_cuda_data_dir=/opt/cuda" />
+      <env name="TRAINING_SET_SIZE" value="100" />
+      <env name="VALIDATION_SET_SIZE" value="25" />
+      <env name="EVALUATION_SET_SIZE" value="100" />
     </envs>
     <option name="SDK_HOME" value="$PROJECT_DIR$/.venv/bin/python" />
     <option name="SDK_NAME" value="Poetry (unimore-bda-6)" />

@@ -8,7 +8,7 @@ from .config import config
 from .database import mongo_client_from_config, reviews_collection, sample_reviews_polar, sample_reviews_varied
 from .analysis import NLTKSentimentAnalyzer, TensorflowCategorySentimentAnalyzer, TensorflowPolarSentimentAnalyzer
 from .analysis.base import TrainingFailedError
-from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation
+from .tokenizer import PlainTokenizer, LowercaseTokenizer, NLTKWordTokenizer, PottsTokenizer, PottsTokenizerWithNegation, HuggingBertTokenizer
 from .gathering import Caches

 log = logging.getLogger(__name__)

@@ -38,20 +38,21 @@ def main():
     slog.debug("Selected sample_func: %s", sample_func.__name__)

     for SentimentAnalyzer in [
-        NLTKSentimentAnalyzer,
         TensorflowPolarSentimentAnalyzer,
         TensorflowCategorySentimentAnalyzer,
+        NLTKSentimentAnalyzer,
     ]:

         slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
         slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)

         for Tokenizer in [
-            PottsTokenizerWithNegation,
-            PottsTokenizer,
+            HuggingBertTokenizer,
             PlainTokenizer,
             LowercaseTokenizer,
             NLTKWordTokenizer,
+            PottsTokenizer,
+            PottsTokenizerWithNegation,
         ]:

             slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")

@@ -107,7 +107,7 @@ class NLTKSentimentAnalyzer(BaseSentimentAnalyzer):
             raise NotTrainedError()

         # Tokenize the input
-        tokens = self.tokenizer.tokenize_plain(text)
+        tokens = self.tokenizer.tokenize_and_split_plain(text)

         # Run the classification method
         return self.model.classify(instance=tokens)

@@ -189,9 +189,11 @@ class TensorflowCategorySentimentAnalyzer(TensorflowSentimentAnalyzer):
                 input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
                 output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
             ),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.GlobalAveragePooling1D(),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(8),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.Dense(5, activation="softmax"),
         ])

@@ -240,18 +242,20 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
                 input_dim=TENSORFLOW_MAX_FEATURES.__wrapped__ + 1,
                 output_dim=TENSORFLOW_EMBEDDING_SIZE.__wrapped__,
             ),
-            tensorflow.keras.layers.Dropout(0.10),
+            tensorflow.keras.layers.Dropout(0.25),
             tensorflow.keras.layers.GlobalAveragePooling1D(),
-            tensorflow.keras.layers.Dropout(0.10),
-            tensorflow.keras.layers.Dense(1),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(8),
+            tensorflow.keras.layers.Dropout(0.25),
+            tensorflow.keras.layers.Dense(1, activation="relu"),
         ])

         log.debug("Compiling model: %s", model)
         model.compile(
-            optimizer=tensorflow.keras.optimizers.Adadelta(global_clipnorm=1.0),
-            loss=tensorflow.keras.losses.MeanSquaredError(),
+            optimizer=tensorflow.keras.optimizers.Adam(clipnorm=2.0),
+            loss=tensorflow.keras.losses.MeanAbsoluteError(),
             metrics=[
-                tensorflow.keras.metrics.MeanAbsoluteError(),
+                # tensorflow.keras.metrics.MeanAbsoluteError(),
             ]
         )

@@ -259,7 +263,7 @@ class TensorflowPolarSentimentAnalyzer(TensorflowSentimentAnalyzer):
         return model

     def _translate_prediction(self, a: numpy.array) -> Category:
-        return (a[0, 0] + 0.5) * 5
+        return 1 + (a[0, 0] + 0.5) * 4


 __all__ = (

@@ -109,10 +109,10 @@ def TENSORFLOW_EMBEDDING_SIZE(val: str | None) -> int:
     """
     The size of the embeddings tensor to use in Tensorflow models.

-    Defaults to `6`.
+    Defaults to `12`.
     """
     if val is None:
-        return 6
+        return 12
     try:
         return int(val)
     except ValueError:

@@ -124,10 +124,10 @@ def TENSORFLOW_EPOCHS(val: str | None) -> int:
     """
     The number of epochs to train Tensorflow models for.

-    Defaults to `12`.
+    Defaults to `5`.
     """
     if val is None:
-        return 12
+        return 5
     try:
         return int(val)
     except ValueError:

@@ -41,7 +41,7 @@ class Review:
         return tensorflow.convert_to_tensor(self.text, dtype=tensorflow.string)

     def to_tensor_normvalue(self) -> tensorflow.Tensor:
-        return tensorflow.convert_to_tensor([self.category / 5 - 0.5], dtype=tensorflow.float32)
+        return tensorflow.convert_to_tensor([(self.category - 1) / 4 - 0.5], dtype=tensorflow.float32)

     def to_tensor_tuple_normvalue(self) -> tuple[tensorflow.Tensor, tensorflow.Tensor]:
         return (

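Together with the `_translate_prediction` change above, the commit replaces the old star-to-target mapping `category / 5 - 0.5` with `(category - 1) / 4 - 0.5`. A minimal sketch of the round trip, using hypothetical helper names that are not part of the repository:

def encode_normvalue(category: int) -> float:
    # New encoding from Review.to_tensor_normvalue: 1 -> -0.5, 3 -> 0.0, 5 -> +0.5
    return (category - 1) / 4 - 0.5

def decode_prediction(normvalue: float) -> float:
    # New decoding from TensorflowPolarSentimentAnalyzer._translate_prediction
    return 1 + (normvalue + 0.5) * 4

# The round trip recovers the original star rating exactly.
for stars in (1, 2, 3, 4, 5):
    assert decode_prediction(encode_normvalue(stars)) == stars

# The previous pair (category / 5 - 0.5 and (a + 0.5) * 5) was also self-consistent,
# but it mapped 1..5 stars onto the asymmetric range [-0.3, +0.5], putting 3 stars
# at +0.1; the new pair centres 3 stars at 0.0 on the symmetric range [-0.5, +0.5].
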
@@ -7,3 +7,4 @@ from .nltk_word_tokenize import *
 from .potts import *
 from .plain import *
 from .lower import *
+from .hugging import *

@@ -31,16 +31,18 @@ class BaseTokenizer:
         """
         Run `.tokenize_plain`, then split the result using `str.split`.
         """
-        return self.tokenize_plain(text).split()
+        text = self.tokenize_plain(text)
+        text = text.split()
+        return text

     @__not_implemented
-    def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
         """
         Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
         """
         raise NotImplementedError()

-    def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+    def tokenize_tensorflow_and_expand_dims(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
         """
         Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
         """

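After this change the tokenizer contract is: `tokenize_plain` returns one space-separated string, and `tokenize_and_split_plain` turns it into a token list via `str.split`. A rough sketch with a hypothetical subclass, not part of the repository:

class WhitespaceTokenizer(BaseTokenizer):
    # Illustrative subclass only: collapse runs of whitespace to single spaces.
    def tokenize_plain(self, text: str) -> str:
        return " ".join(text.split())

tok = WhitespaceTokenizer()
tok.tokenize_plain("good   movie")            # -> "good movie"
tok.tokenize_and_split_plain("good   movie")  # -> ["good", "movie"]
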
unimore_bda_6/tokenizer/hugging.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+import abc
+import tokenizers
+
+from .base import BaseTokenizer
+
+
+class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
+    def __init__(self):
+        super().__init__()
+        self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()
+
+    def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
+        raise NotImplementedError()
+
+    def tokenize_plain(self, text: str) -> str:
+        return " ".join(self.hug.encode(text).tokens)
+
+
+class HuggingBertTokenizer(HuggingTokenizer):
+    def _build_hugging_tokenizer(self):
+        return tokenizers.Tokenizer.from_pretrained("bert-base-cased")
+
+
+__all__ = (
+    "HuggingBertTokenizer",
+)

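For a rough idea of what the new tokenizer produces: `tokenizers.Tokenizer.from_pretrained("bert-base-cased")` loads BERT's WordPiece tokenizer, so `tokenize_plain` should return the word pieces (sub-words prefixed with `##`, usually wrapped in `[CLS]`/`[SEP]`) joined by spaces. The sample output below is indicative only, not verified against the repository:

from unimore_bda_6.tokenizer import HuggingBertTokenizer

tok = HuggingBertTokenizer()
tok.tokenize_plain("Absolutely unwatchable!")
# indicatively something like: "[CLS] Absolutely un ##watch ##able ! [SEP]"
tok.tokenize_and_split_plain("Absolutely unwatchable!")
# the same pieces as a list, through BaseTokenizer.tokenize_and_split_plain
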
@@ -16,78 +16,62 @@ class PottsTokenizer(BaseTokenizer):

     # noinspection RegExpRepeatedSpace
     # language=pythonregexp
-    emoticon_re_string = r"""
-        [<>]?
-        [:;=8] # eyes
-        [\-o*']? # optional nose
-        [)\](\[dDpP/:}{@|\\] # mouth
-        |
-        [)\](\[dDpP/:}{@|\\] # mouth
-        [\-o*']? # optional nose
-        [:;=8] # eyes
-        [<>]?
-    """
+    emoticon_re_string = r"""[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@|\\]"""

-    emoticon_re = re.compile(emoticon_re_string, re.VERBOSE | re.I)
+    emoticon_re = re.compile(emoticon_re_string)

     # noinspection RegExpRepeatedSpace,RegExpUnnecessaryNonCapturingGroup
     # language=pythonregexp
-    words_re_string = (
+    words_re_string = "(" + "|".join([
         # Emoticons:
         emoticon_re_string
         ,
         # Phone numbers:
-        r"""
-        (?: # (international)
-            \+?[01]
-            [\-\s.]*
-        )?
-        (?: # (area code)
-            [(]?
-            \d{3}
-            [\-\s.)]*
-        )?
-        \d{3} # exchange
-        [\-\s.]*
-        \d{4} # base
-        """
+        # language=pythonregexp
+        r"""(?:[+]?[01][\s.-]*)?(?:[(]?\d{3}[\s.)-]*)?\d{3}[\-\s.]*\d{4}"""
         ,
         # HTML tags:
+        # language=pythonregexp
         r"""<[^>]+>"""
         ,
         # Twitter username:
+        # language=pythonregexp
         r"""@[\w_]+"""
         ,
         # Twitter hashtags:
-        r"""#+[\w_]+[\w'_\-]*[\w_]+"""
+        # language=pythonregexp
+        r"""#+[\w_]+[\w'_-]*[\w_]+"""
         ,
         # Words with apostrophes or dashes
-        r"""[a-z][a-z'\-_]+[a-z]"""
+        # language=pythonregexp
+        r"""[a-z][a-z'_-]+[a-z]"""
         ,
         # Numbers, including fractions, decimals
-        r"""[+\-]?\d+[,/.:-]\d+[+\-]?"""
+        # language=pythonregexp
+        r"""[+-]?\d+(?:[,/.:-]\d+)?"""
         ,
         # Words without apostrophes or dashes
+        # language=pythonregexp
         r"""[\w_]+"""
         ,
         # Ellipsis dots
-        r"""\.(?:\s*\.)+"""
+        # language=pythonregexp
+        r"""[.](?:\s*[.])+"""
        ,
         # Everything else that isn't whitespace
-        r"""(?:\S)"""
-    )
+        # language=pythonregexp
+        r"""\S+"""
+    ]) + ")"

-    words_re = re.compile("|".join(words_re_string), re.VERBOSE | re.I)
+    words_re = re.compile(words_re_string, re.I)

     # language=pythonregexp
     digit_re_string = r"&#\d+;"

-    digit_re = re.compile(digit_re_string, re.VERBOSE)
+    digit_re = re.compile(digit_re_string)

     # language=pythonregexp
     alpha_re_string = r"&\w+;"

-    alpha_re = re.compile(alpha_re_string, re.VERBOSE)
+    alpha_re = re.compile(alpha_re_string)

     amp = "&"

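The flattened patterns are joined with `|` into one capturing group and compiled without `re.VERBOSE`. A toy version of the same construction, using only a few of the sub-patterns above and a made-up input, just to show how the alternation behaves:

import re

toy_words_re_string = "(" + "|".join([
    r"""#+[\w_]+[\w'_-]*[\w_]+""",  # hashtags
    r"""[.](?:\s*[.])+""",          # ellipsis dots
    r"""[\w_]+""",                  # plain words
    r"""\S+""",                     # anything else that is not whitespace
]) + ")"
toy_words_re = re.compile(toy_words_re_string, re.I)

toy_words_re.findall("so bad... #NeverAgain !!")
# -> ['so', 'bad', '...', '#NeverAgain', '!!']
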
@@ -118,7 +102,7 @@ class PottsTokenizer(BaseTokenizer):
         s = s.replace(cls.amp, " and ")
         return s

-    def tokenize_plain(self, text: str) -> t.Iterable[str]:
+    def tokenize_plain(self, text: str) -> str:
         # Fix HTML character entitites
         s = self.__html2string(text)
         # Tokenize

@@ -132,10 +116,10 @@ class PottsTokenizer(BaseTokenizer):


 class PottsTokenizerWithNegation(PottsTokenizer):
-    def tokenize_plain(self, text: str) -> t.Iterable[str]:
-        words = super().tokenize_plain(text)
+    def tokenize_plain(self, text: str) -> str:
+        words = super().tokenize_plain(text).split()
         nltk.sentiment.util.mark_negation(words, shallow=True)
-        return words
+        return " ".join(words)


 __all__ = (