Mirror of https://github.com/Steffo99/unimore-bda-6.git

Improve the tokenizer situation

Commit 3d9eeecb2a by Steffo, 2023-02-10 05:12:07 +01:00 (parent 0ce584e856)
Signed by: steffo (GPG key ID: 2A24051445686895)
6 changed files with 122 additions and 166 deletions

Changed file 1 of 6: the main experiment loop (def main())

@@ -39,19 +39,19 @@ def main():
         slog.debug("Selected sample_func: %s", sample_func.__name__)
         for SentimentAnalyzer in [
-            NLTKSentimentAnalyzer,
             TensorflowCategorySentimentAnalyzer,
+            NLTKSentimentAnalyzer
         ]:
             slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
             slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
             for Tokenizer in [
-                PottsTokenizer,
-                PottsTokenizerWithNegation,
                 PlainTokenizer,
                 LowercaseTokenizer,
                 NLTKWordTokenizer,
+                PottsTokenizer,
+                PottsTokenizerWithNegation,
             ]:
                 slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
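For readers unfamiliar with this loop, the snippet below is an illustrative, self-contained sketch of the same pattern: every combination of sample function, sentiment analyzer, and tokenizer gets its own hierarchically named logger. The stub classes and the sample_reviews_polar name are placeholders, not the project's real implementations.

import logging

logging.basicConfig(level=logging.DEBUG)

# Placeholder stand-ins for the project's analyzers and tokenizers.
class TensorflowCategorySentimentAnalyzer: ...
class NLTKSentimentAnalyzer: ...
class PlainTokenizer: ...
class LowercaseTokenizer: ...

def sample_reviews_polar(): ...

def main():
    for sample_func in [sample_reviews_polar]:
        slog = logging.getLogger(f"{__name__}.{sample_func.__name__}")
        slog.debug("Selected sample_func: %s", sample_func.__name__)
        for SentimentAnalyzer in [TensorflowCategorySentimentAnalyzer, NLTKSentimentAnalyzer]:
            for Tokenizer in [PlainTokenizer, LowercaseTokenizer]:
                # One logger per (sample_func, analyzer, tokenizer) combination,
                # mirroring the hierarchical logger names used above.
                slog = logging.getLogger(
                    f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}"
                )
                slog.debug("Would evaluate %s with %s", SentimentAnalyzer.__name__, Tokenizer.__name__)

main()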

Changed file 2 of 6: the tokenizer base class (BaseTokenizer)

@@ -21,15 +21,29 @@ class BaseTokenizer:
         return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)

     @__not_implemented
-    def tokenize_plain(self, text: str) -> list[str]:
+    def tokenize_plain(self, text: str) -> str:
         """
-        Convert a text string into a list of tokens.
+        Convert a text `str` into another `str` containing a series of whitespace-separated tokens.
         """
         raise NotImplementedError()

+    def tokenize_and_split_plain(self, text: str) -> list[str]:
+        """
+        Run `.tokenize_plain`, then split the result using `str.split`.
+        """
+        return self.tokenize_plain(text).split()
+
     @__not_implemented
     def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
         """
         Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
         """
         raise NotImplementedError()
+
+    def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+        """
+        Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
+        """
+        text = self.tokenize_tensorflow(text)
+        text = tensorflow.expand_dims(text, -1, name="tokens")
+        return text
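As a concrete illustration of the new plain-text contract (the Mini* names below are hypothetical, not classes from this repository): tokenize_plain now returns a single whitespace-joined str, and the base class's tokenize_and_split_plain derives the token list from it.

class MiniBaseTokenizer:
    """Illustrative stand-in for BaseTokenizer's plain-text contract after this commit."""

    def tokenize_plain(self, text: str) -> str:
        # Subclasses return a single string of whitespace-separated tokens.
        raise NotImplementedError()

    def tokenize_and_split_plain(self, text: str) -> list[str]:
        # The base class, not each subclass, turns that string into a token list.
        return self.tokenize_plain(text).split()


class MiniCommaStrippingTokenizer(MiniBaseTokenizer):
    def tokenize_plain(self, text: str) -> str:
        return text.replace(",", " ")


tok = MiniCommaStrippingTokenizer()
print(tok.tokenize_plain("fun,but,forgettable"))            # fun but forgettable
print(tok.tokenize_and_split_plain("fun,but,forgettable"))  # ['fun', 'but', 'forgettable']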

Changed file 3 of 6: LowercaseTokenizer

@@ -8,10 +8,10 @@ class LowercaseTokenizer(BaseTokenizer):
     Tokenizer which converts the words to lowercase before splitting them via spaces.
     """

-    def tokenize_plain(self, text: str) -> list[str]:
-        return text.lower().split()
+    def tokenize_plain(self, text: str) -> str:
+        text = text.lower()
+        return text

     def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
         text = tensorflow.strings.lower(text)
-        text = tensorflow.expand_dims(text, -1, name="tokens")
         return text
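A minimal sketch of the TensorFlow side of this change, assuming tensorflow is installed: the lowercasing stays in LowercaseTokenizer.tokenize_tensorflow, while the expand_dims call now happens once in BaseTokenizer.tokenize_tensorflow_and_expand_dims, so individual tokenizers no longer repeat it.

import tensorflow

text = tensorflow.constant(["This Movie Was GREAT :D"])

# What LowercaseTokenizer.tokenize_tensorflow still does by itself:
lowered = tensorflow.strings.lower(text)

# What BaseTokenizer.tokenize_tensorflow_and_expand_dims adds afterwards:
expanded = tensorflow.expand_dims(lowered, -1, name="tokens")

print(lowered.shape, expanded.shape)  # (1,) (1, 1)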

Changed file 4 of 6: NLTKWordTokenizer

@@ -1,6 +1,5 @@
 import nltk
 import nltk.sentiment.util
-import typing as t

 from .base import BaseTokenizer

@@ -10,10 +9,10 @@ class NLTKWordTokenizer(BaseTokenizer):
     Tokenizer based on `nltk.word_tokenize`.
     """

-    def tokenize_plain(self, text: str) -> t.Iterable[str]:
+    def tokenize_plain(self, text: str) -> str:
         tokens = nltk.word_tokenize(text)
         nltk.sentiment.util.mark_negation(tokens, shallow=True)
-        return tokens
+        return " ".join(tokens)


 __all__ = (
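For context on what this tokenizer now returns, here is a hedged standalone sketch of the same pipeline: nltk.word_tokenize followed by nltk.sentiment.util.mark_negation, with the tokens re-joined into one string as the new contract requires. It assumes nltk is installed and its tokenizer models have been downloaded.

import nltk
import nltk.sentiment.util

# Uncomment on first run to fetch the tokenizer models:
# nltk.download("punkt")

text = "I did not like this movie. It was great though."
tokens = nltk.word_tokenize(text)
# mark_negation appends "_NEG" to tokens following a negation word,
# up to the next punctuation mark.
nltk.sentiment.util.mark_negation(tokens, shallow=True)
print(" ".join(tokens))
# Expected to look roughly like:
# "I did not like_NEG this_NEG movie_NEG . It was great though ."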

Changed file 5 of 6: PlainTokenizer

@@ -8,9 +8,8 @@ class PlainTokenizer(BaseTokenizer):
     Tokenizer which just splits the text into tokens by separating them at whitespaces.
     """

-    def tokenize_plain(self, text: str) -> list[str]:
-        return text.split()
+    def tokenize_plain(self, text: str) -> str:
+        return text

     def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
-        text = tensorflow.expand_dims(text, -1, name="tokens")
         return text

Changed file 6 of 6: PottsTokenizer and PottsTokenizerWithNegation

@@ -1,52 +1,4 @@
-"""
-=========================
-Original module docstring
-=========================
-
-This code implements a basic, Twitter-aware tokenizer.
-
-A tokenizer is a function that splits a string of text into words. In
-Python terms, we map string and unicode objects into lists of unicode
-objects.
-
-There is not a single right way to do tokenizing. The best method
-depends on the application. This tokenizer is designed to be flexible
-and this easy to adapt to new domains and tasks. The basic logic is
-this:
-
-1. The tuple regex_strings defines a list of regular expression
-   strings.
-
-2. The regex_strings strings are put, in order, into a compiled
-   regular expression object called word_re.
-
-3. The tokenization is done by word_re.findall(s), where s is the
-   user-supplied string, inside the tokenize() method of the class
-   Tokenizer.
-
-4. When instantiating Tokenizer objects, there is a single option:
-   preserve_case. By default, it is set to True. If it is set to
-   False, then the tokenizer will downcase everything except for
-   emoticons.
-
-The __main__ method illustrates by tokenizing a few examples.
-
-I've also included a Tokenizer method tokenize_random_tweet(). If the
-twitter library is installed (http://code.google.com/p/python-twitter/)
-and Twitter is cooperating, then it should tokenize a random
-English-language tweet.
-"""
-
-__author__ = "Christopher Potts"
-__copyright__ = "Copyright 2011, Christopher Potts"
-__credits__ = []
-__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/"
-__version__ = "1.0"
-__maintainer__ = "Christopher Potts"
-__email__ = "See the author's website"
-
-######################################################################
-
+import tensorflow
 import re
 import html.entities
 import typing as t
@@ -54,108 +6,98 @@ import nltk.sentiment.util

 from .base import BaseTokenizer

-
-######################################################################
-# The following strings are components in the regular expression
-# that is used for tokenizing. It's important that phone_number
-# appears first in the final regex (since it can contain whitespace).
-# It also could matter that tags comes after emoticons, due to the
-# possibility of having text like
-#
-# <:| and some text >:)
-#
-# Most imporatantly, the final element should always be last, since it
-# does a last ditch whitespace-based tokenization of whatever is left.
-# This particular element is used in a couple ways, so we define it
-# with a name:
-emoticon_string = r"""
-    (?:
-      [<>]?
-      [:;=8]                     # eyes
-      [\-o\*\']?                 # optional nose
-      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
-      |
-      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
-      [\-o\*\']?                 # optional nose
-      [:;=8]                     # eyes
-      [<>]?
-    )"""
-
-# The components of the tokenizer:
-regex_strings = (
-    # Phone numbers:
-    r"""
-    (?:
-      (?:            # (international)
-        \+?[01]
-        [\-\s.]*
-      )?
-      (?:            # (area code)
-        [\(]?
-        \d{3}
-        [\-\s.\)]*
-      )?
-      \d{3}          # exchange
-      [\-\s.]*
-      \d{4}          # base
-    )"""
-    ,
-    # Emoticons:
-    emoticon_string
-    ,
-    # HTML tags:
-    r"""<[^>]+>"""
-    ,
-    # Twitter username:
-    r"""(?:@[\w_]+)"""
-    ,
-    # Twitter hashtags:
-    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
-    ,
-    # Remaining word types:
-    r"""
-    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
-    |
-    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
-    |
-    (?:[\w_]+)                     # Words without apostrophes or dashes.
-    |
-    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
-    |
-    (?:\S)                         # Everything else that isn't whitespace.
-    """
-    )
-
-######################################################################
-# This is the core tokenizing regex:
-word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
-
-# The emoticon string gets its own regex so that we can preserve case for them as needed:
-emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
-
-# These are for regularizing HTML entities to Unicode:
-html_entity_digit_re = re.compile(r"&#\d+;")
-html_entity_alpha_re = re.compile(r"&\w+;")
-amp = "&amp;"
-######################################################################
-

 class PottsTokenizer(BaseTokenizer):
     """
-    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_.
+    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.
+
+    This module is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
     """

-    @staticmethod
-    def __html2string(s: str) -> str:
-        """
-        Internal metod that seeks to replace all the HTML entities in
-        s with their corresponding unicode characters.
-        """
+    # noinspection RegExpRepeatedSpace
+    # language=pythonregexp
+    emoticon_re_string = r"""
+        [<>]?
+        [:;=8]                # eyes
+        [\-o*']?              # optional nose
+        [)\](\[dDpP/:}{@|\\]  # mouth
+        |
+        [)\](\[dDpP/:}{@|\\]  # mouth
+        [\-o*']?              # optional nose
+        [:;=8]                # eyes
+        [<>]?
+    """
+
+    emoticon_re = re.compile(emoticon_re_string, re.VERBOSE | re.I)
+
+    # noinspection RegExpRepeatedSpace,RegExpUnnecessaryNonCapturingGroup
+    # language=pythonregexp
+    words_re_string = (
+        # Emoticons:
+        emoticon_re_string
+        ,
+        # Phone numbers:
+        r"""
+        (?:            # (international)
+            \+?[01]
+            [\-\s.]*
+        )?
+        (?:            # (area code)
+            [(]?
+            \d{3}
+            [\-\s.)]*
+        )?
+        \d{3}          # exchange
+        [\-\s.]*
+        \d{4}          # base
+        """
+        ,
+        # HTML tags:
+        r"""<[^>]+>"""
+        ,
+        # Twitter username:
+        r"""@[\w_]+"""
+        ,
+        # Twitter hashtags:
+        r"""#+[\w_]+[\w'_\-]*[\w_]+"""
+        ,
+        # Words with apostrophes or dashes
+        r"""[a-z][a-z'\-_]+[a-z]"""
+        ,
+        # Numbers, including fractions, decimals
+        r"""[+\-]?\d+[,/.:-]\d+[+\-]?"""
+        ,
+        # Words without apostrophes or dashes
+        r"""[\w_]+"""
+        ,
+        # Ellipsis dots
+        r"""\.(?:\s*\.)+"""
+        ,
+        # Everything else that isn't whitespace
+        r"""(?:\S)"""
+    )
+
+    words_re = re.compile("|".join(words_re_string), re.VERBOSE | re.I)
+
+    # language=pythonregexp
+    digit_re_string = r"&#\d+;"
+    digit_re = re.compile(digit_re_string, re.VERBOSE)
+
+    # language=pythonregexp
+    alpha_re_string = r"&\w+;"
+    alpha_re = re.compile(alpha_re_string, re.VERBOSE)
+
+    amp = "&amp;"
+
+    @classmethod
+    def __html2string(cls, s: str) -> str:
+        """
+        Internal metod that seeks to replace all the HTML entities in s with their corresponding characters.
+        """
         # First the digits:
-        ents = set(html_entity_digit_re.findall(s))
+        ents = set(cls.digit_re.findall(s))
         if len(ents) > 0:
             for ent in ents:
                 entnum = ent[2:-1]
@@ -165,26 +107,28 @@ class PottsTokenizer(BaseTokenizer):
             except (ValueError, KeyError):
                 pass
         # Now the alpha versions:
-        ents = set(html_entity_alpha_re.findall(s))
-        ents = filter((lambda x: x != amp), ents)
+        ents = set(cls.alpha_re.findall(s))
+        ents = filter((lambda x: x != cls.amp), ents)
         for ent in ents:
             entname = ent[1:-1]
             try:
                 s = s.replace(ent, chr(html.entities.name2codepoint[entname]))
             except (ValueError, KeyError):
                 pass
-        s = s.replace(amp, " and ")
+        s = s.replace(cls.amp, " and ")
         return s

     def tokenize_plain(self, text: str) -> t.Iterable[str]:
-        # Fix HTML character entitites:
+        # Fix HTML character entitites
         s = self.__html2string(text)
-        # Tokenize:
-        words = word_re.findall(s)
+        # Tokenize
+        words = self.words_re.findall(s)
         # Possible alter the case, but avoid changing emoticons like :D into :d:
-        words = list(map(lambda x: x if emoticon_re.search(x) else x.lower(), words))
-        # Return the results
-        return words
+        words = list(map(lambda x: x if self.emoticon_re.search(x) else x.lower(), words))
+        # Re-join words
+        result = " ".join(words)
+        # Return the result
+        return result


 class PottsTokenizerWithNegation(PottsTokenizer):
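To make the moved regexes a bit more tangible, here is a small self-contained sketch of the two core steps of this tokenizer (HTML-entity regularisation, then lowercasing everything except emoticons). The patterns below are simplified stand-ins for the class attributes above, not the full originals.

import html.entities
import re

# Simplified stand-ins for the class attributes defined above.
digit_re = re.compile(r"&#\d+;")
alpha_re = re.compile(r"&\w+;")
emoticon_re = re.compile(r"[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@|\\]", re.I)

def html2string(s: str) -> str:
    # Numeric entities ("&#233;") become their characters; named ones ("&quot;") too.
    for ent in set(digit_re.findall(s)):
        try:
            s = s.replace(ent, chr(int(ent[2:-1])))
        except ValueError:
            pass
    for ent in set(alpha_re.findall(s)) - {"&amp;"}:
        try:
            s = s.replace(ent, chr(html.entities.name2codepoint[ent[1:-1]]))
        except KeyError:
            pass
    return s.replace("&amp;", " and ")

text = html2string("Caf&#233; &amp; bar was &quot;nice&quot; :D")
words = text.split()
# Lowercase everything except emoticons, so ":D" does not become ":d".
words = [w if emoticon_re.search(w) else w.lower() for w in words]
print(words)
# ['café', 'and', 'bar', 'was', '"nice"', ':D']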