Mirror of https://github.com/Steffo99/unimore-bda-6.git (synced 2024-11-25 01:04:19 +00:00)

Commit 3d9eeecb2a (parent 0ce584e856): "Improve the tokenizer situation"

6 changed files with 122 additions and 166 deletions
@@ -39,19 +39,19 @@ def main():
     slog.debug("Selected sample_func: %s", sample_func.__name__)
 
     for SentimentAnalyzer in [
+        NLTKSentimentAnalyzer,
         TensorflowCategorySentimentAnalyzer,
-        NLTKSentimentAnalyzer
     ]:
 
         slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
         slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)
 
         for Tokenizer in [
+            PottsTokenizer,
+            PottsTokenizerWithNegation,
             PlainTokenizer,
             LowercaseTokenizer,
             NLTKWordTokenizer,
-            PottsTokenizer,
-            PottsTokenizerWithNegation,
        ]:
 
             slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
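The reordered lists mean the NLTK analyzer and the Potts tokenizers are now tried first. A rough, hypothetical sketch of how the nested loops enumerate the analyzer/tokenizer grid (plain strings stand in for the classes here; the real loop instantiates them and builds a logger per combination):

    import itertools

    # Stand-ins for the classes iterated over in main(); only the names matter for this sketch.
    analyzers = ["NLTKSentimentAnalyzer", "TensorflowCategorySentimentAnalyzer"]
    tokenizers = ["PottsTokenizer", "PottsTokenizerWithNegation", "PlainTokenizer",
                  "LowercaseTokenizer", "NLTKWordTokenizer"]

    # Every analyzer is paired with every tokenizer, in the new order.
    for analyzer, tokenizer in itertools.product(analyzers, tokenizers):
        print(f"{analyzer}.{tokenizer}")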
@@ -21,15 +21,29 @@ class BaseTokenizer:
         return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)
 
     @__not_implemented
-    def tokenize_plain(self, text: str) -> list[str]:
+    def tokenize_plain(self, text: str) -> str:
         """
-        Convert a text string into a list of tokens.
+        Convert a text `str` into another `str` containing a series of whitespace-separated tokens.
         """
         raise NotImplementedError()
 
+    def tokenize_and_split_plain(self, text: str) -> list[str]:
+        """
+        Run `.tokenize_plain`, then split the result using `str.split`.
+        """
+        return self.tokenize_plain(text).split()
+
     @__not_implemented
     def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
         """
         Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
         """
         raise NotImplementedError()
+
+    def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+        """
+        Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
+        """
+        text = self.tokenize_tensorflow(text)
+        text = tensorflow.expand_dims(text, -1, name="tokens")
+        return text
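The practical effect is that every tokenizer now produces a single whitespace-separated string, and callers that still want a token list go through tokenize_and_split_plain. A minimal runnable sketch of the new contract, with a cut-down stand-in for BaseTokenizer and a purely hypothetical CommaTokenizer subclass:

    class BaseTokenizer:
        """
        Cut-down stand-in for the repository's BaseTokenizer, reduced to the
        two plain-text methods touched by this commit.
        """

        def tokenize_plain(self, text: str) -> str:
            # Subclasses return a single string of whitespace-separated tokens.
            raise NotImplementedError()

        def tokenize_and_split_plain(self, text: str) -> list[str]:
            # New helper: tokenize to a whitespace-separated string, then split it.
            return self.tokenize_plain(text).split()


    class CommaTokenizer(BaseTokenizer):
        """
        Hypothetical subclass used only to illustrate the new contract.
        """

        def tokenize_plain(self, text: str) -> str:
            return " ".join(part.strip() for part in text.split(","))


    tok = CommaTokenizer()
    print(tok.tokenize_plain("cheap, cheerful, surprisingly good"))
    # cheap cheerful surprisingly good
    print(tok.tokenize_and_split_plain("cheap, cheerful, surprisingly good"))
    # ['cheap', 'cheerful', 'surprisingly', 'good']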
@@ -8,10 +8,10 @@ class LowercaseTokenizer(BaseTokenizer):
     Tokenizer which converts the words to lowercase before splitting them via spaces.
     """
 
-    def tokenize_plain(self, text: str) -> list[str]:
-        return text.lower().split()
+    def tokenize_plain(self, text: str) -> str:
+        text = text.lower()
+        return text
 
     def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
         text = tensorflow.strings.lower(text)
-        text = tensorflow.expand_dims(text, -1, name="tokens")
         return text
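The tensorflow.expand_dims call is being lifted out of the individual tokenizers and into BaseTokenizer.tokenize_tensorflow_and_expand_dims, so a tokenizer like LowercaseTokenizer only has to transform the string tensor. A small sketch of the two steps, written as free functions rather than the repository's methods:

    import tensorflow


    def tokenize_tensorflow(text: tensorflow.Tensor) -> tensorflow.Tensor:
        # What LowercaseTokenizer keeps doing on its own: lowercase the string tensor.
        return tensorflow.strings.lower(text)


    def tokenize_tensorflow_and_expand_dims(text: tensorflow.Tensor) -> tensorflow.Tensor:
        # What the base class now adds on top: the trailing axis expected downstream.
        text = tokenize_tensorflow(text)
        return tensorflow.expand_dims(text, -1, name="tokens")


    batch = tensorflow.constant(["This Movie Was GREAT", "terrible, honestly"])
    print(tokenize_tensorflow_and_expand_dims(batch).shape)
    # (2, 1): one lowercased string per review, with the extra dimension added once, in the base class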
@@ -1,6 +1,5 @@
 import nltk
 import nltk.sentiment.util
-import typing as t
 
 from .base import BaseTokenizer
 
@@ -10,10 +9,10 @@ class NLTKWordTokenizer(BaseTokenizer):
     Tokenizer based on `nltk.word_tokenize`.
     """
 
-    def tokenize_plain(self, text: str) -> t.Iterable[str]:
+    def tokenize_plain(self, text: str) -> str:
         tokens = nltk.word_tokenize(text)
         nltk.sentiment.util.mark_negation(tokens, shallow=True)
-        return tokens
+        return " ".join(tokens)
 
 
 __all__ = (
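As a rough illustration of what the joined output looks like (the exact tokens depend on the NLTK version and on having its tokenizer data downloaded):

    import nltk
    import nltk.sentiment.util

    nltk.download("punkt", quiet=True)  # tokenizer models for word_tokenize; newer NLTK may also need "punkt_tab"

    tokens = nltk.word_tokenize("I did not like this movie at all.")
    nltk.sentiment.util.mark_negation(tokens, shallow=True)
    print(" ".join(tokens))
    # Indicatively: I did not like_NEG this_NEG movie_NEG at_NEG all_NEG .
    # tokenize_and_split_plain() would then hand these back as a list of tokens.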
@@ -8,9 +8,8 @@ class PlainTokenizer(BaseTokenizer):
     Tokenizer which just splits the text into tokens by separating them at whitespaces.
     """
 
-    def tokenize_plain(self, text: str) -> list[str]:
-        return text.split()
+    def tokenize_plain(self, text: str) -> str:
+        return text
 
     def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
-        text = tensorflow.expand_dims(text, -1, name="tokens")
         return text
@@ -1,52 +1,4 @@
-"""
-=========================
-Original module docstring
-=========================
-
-This code implements a basic, Twitter-aware tokenizer.
-
-A tokenizer is a function that splits a string of text into words. In
-Python terms, we map string and unicode objects into lists of unicode
-objects.
-
-There is not a single right way to do tokenizing. The best method
-depends on the application. This tokenizer is designed to be flexible
-and this easy to adapt to new domains and tasks. The basic logic is
-this:
-
-1. The tuple regex_strings defines a list of regular expression
-strings.
-
-2. The regex_strings strings are put, in order, into a compiled
-regular expression object called word_re.
-
-3. The tokenization is done by word_re.findall(s), where s is the
-user-supplied string, inside the tokenize() method of the class
-Tokenizer.
-
-4. When instantiating Tokenizer objects, there is a single option:
-preserve_case. By default, it is set to True. If it is set to
-False, then the tokenizer will downcase everything except for
-emoticons.
-
-The __main__ method illustrates by tokenizing a few examples.
-
-I've also included a Tokenizer method tokenize_random_tweet(). If the
-twitter library is installed (http://code.google.com/p/python-twitter/)
-and Twitter is cooperating, then it should tokenize a random
-English-language tweet.
-"""
-
-__author__ = "Christopher Potts"
-__copyright__ = "Copyright 2011, Christopher Potts"
-__credits__ = []
-__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/"
-__version__ = "1.0"
-__maintainer__ = "Christopher Potts"
-__email__ = "See the author's website"
-
-######################################################################
-
+import tensorflow
 import re
 import html.entities
 import typing as t
@@ -54,108 +6,98 @@ import nltk.sentiment.util
 
 from .base import BaseTokenizer
 
-######################################################################
-# The following strings are components in the regular expression
-# that is used for tokenizing. It's important that phone_number
-# appears first in the final regex (since it can contain whitespace).
-# It also could matter that tags comes after emoticons, due to the
-# possibility of having text like
-#
-# <:| and some text >:)
-#
-# Most imporatantly, the final element should always be last, since it
-# does a last ditch whitespace-based tokenization of whatever is left.
-
-# This particular element is used in a couple ways, so we define it
-# with a name:
-emoticon_string = r"""
-    (?:
+class PottsTokenizer(BaseTokenizer):
+    """
+    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.
+
+    This module is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
+    """
+
+    # noinspection RegExpRepeatedSpace
+    # language=pythonregexp
+    emoticon_re_string = r"""
     [<>]?
     [:;=8]  # eyes
-    [\-o\*\']?  # optional nose
-    [\)\]\(\[dDpP/\:\}\{@\|\\]  # mouth
+    [\-o*']?  # optional nose
+    [)\](\[dDpP/:}{@|\\]  # mouth
     |
-    [\)\]\(\[dDpP/\:\}\{@\|\\]  # mouth
-    [\-o\*\']?  # optional nose
+    [)\](\[dDpP/:}{@|\\]  # mouth
+    [\-o*']?  # optional nose
     [:;=8]  # eyes
     [<>]?
-    )"""
+    """
 
-# The components of the tokenizer:
-regex_strings = (
+    emoticon_re = re.compile(emoticon_re_string, re.VERBOSE | re.I)
+
+    # noinspection RegExpRepeatedSpace,RegExpUnnecessaryNonCapturingGroup
+    # language=pythonregexp
+    words_re_string = (
+        # Emoticons:
+        emoticon_re_string
+        ,
         # Phone numbers:
         r"""
-        (?:
         (?:  # (international)
             \+?[01]
             [\-\s.]*
         )?
         (?:  # (area code)
-            [\(]?
+            [(]?
             \d{3}
-            [\-\s.\)]*
+            [\-\s.)]*
         )?
         \d{3}  # exchange
         [\-\s.]*
         \d{4}  # base
-        )"""
-        ,
-        # Emoticons:
-        emoticon_string
+        """
         ,
         # HTML tags:
         r"""<[^>]+>"""
         ,
         # Twitter username:
-        r"""(?:@[\w_]+)"""
+        r"""@[\w_]+"""
        ,
         # Twitter hashtags:
-        r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
+        r"""#+[\w_]+[\w'_\-]*[\w_]+"""
         ,
-        # Remaining word types:
-        r"""
-        (?:[a-z][a-z'\-_]+[a-z])  # Words with apostrophes or dashes.
-        |
-        (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
-        |
-        (?:[\w_]+)  # Words without apostrophes or dashes.
-        |
-        (?:\.(?:\s*\.){1,})  # Ellipsis dots.
-        |
-        (?:\S)  # Everything else that isn't whitespace.
-        """
-    )
-
-######################################################################
-# This is the core tokenizing regex:
-
-word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
-
-# The emoticon string gets its own regex so that we can preserve case for them as needed:
-emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
-
-# These are for regularizing HTML entities to Unicode:
-html_entity_digit_re = re.compile(r"&#\d+;")
-html_entity_alpha_re = re.compile(r"&\w+;")
-amp = "&amp;"
-
-
-######################################################################
-
-
-class PottsTokenizer(BaseTokenizer):
-    """
-    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_.
-    """
-
-    @staticmethod
-    def __html2string(s: str) -> str:
-        """
-        Internal metod that seeks to replace all the HTML entities in
-        s with their corresponding unicode characters.
-        """
+        # Words with apostrophes or dashes
+        r"""[a-z][a-z'\-_]+[a-z]"""
+        ,
+        # Numbers, including fractions, decimals
+        r"""[+\-]?\d+[,/.:-]\d+[+\-]?"""
+        ,
+        # Words without apostrophes or dashes
+        r"""[\w_]+"""
+        ,
+        # Ellipsis dots
+        r"""\.(?:\s*\.)+"""
+        ,
+        # Everything else that isn't whitespace
+        r"""(?:\S)"""
+    )
+
+    words_re = re.compile("|".join(words_re_string), re.VERBOSE | re.I)
+
+    # language=pythonregexp
+    digit_re_string = r"&#\d+;"
+
+    digit_re = re.compile(digit_re_string, re.VERBOSE)
+
+    # language=pythonregexp
+    alpha_re_string = r"&\w+;"
+
+    alpha_re = re.compile(alpha_re_string, re.VERBOSE)
+
+    amp = "&amp;"
+
+    @classmethod
+    def __html2string(cls, s: str) -> str:
+        """
+        Internal metod that seeks to replace all the HTML entities in s with their corresponding characters.
+        """
         # First the digits:
-        ents = set(html_entity_digit_re.findall(s))
+        ents = set(cls.digit_re.findall(s))
         if len(ents) > 0:
             for ent in ents:
                 entnum = ent[2:-1]
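Everything that used to live at module level (word_re, emoticon_re, the HTML-entity regexes and amp) is now a class attribute (words_re, emoticon_re, digit_re, alpha_re, amp), and __html2string becomes a classmethod that reads them through cls. A standalone sketch of what that entity-normalisation step does, using the same two regexes; the free function below is hypothetical, not the repository's class:

    import html.entities
    import re

    digit_re = re.compile(r"&#\d+;", re.VERBOSE)   # numeric entities, e.g. &#39;
    alpha_re = re.compile(r"&\w+;", re.VERBOSE)    # named entities, e.g. &lt;
    amp = "&amp;"


    def html2string(s: str) -> str:
        # Numeric entities first: &#NN; becomes chr(NN).
        for ent in set(digit_re.findall(s)):
            try:
                s = s.replace(ent, chr(int(ent[2:-1])))
            except (ValueError, KeyError):
                pass
        # Then named entities, skipping &amp; so it can be turned into " and " at the end.
        for ent in set(alpha_re.findall(s)) - {amp}:
            try:
                s = s.replace(ent, chr(html.entities.name2codepoint[ent[1:-1]]))
            except (ValueError, KeyError):
                pass
        return s.replace(amp, " and ")


    print(html2string("Ben &amp; Jerry&#39;s &lt;3"))
    # "Ben  and  Jerry's <3" (note the doubled spaces around "and")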
@@ -165,26 +107,28 @@ class PottsTokenizer(BaseTokenizer):
             except (ValueError, KeyError):
                 pass
         # Now the alpha versions:
-        ents = set(html_entity_alpha_re.findall(s))
-        ents = filter((lambda x: x != amp), ents)
+        ents = set(cls.alpha_re.findall(s))
+        ents = filter((lambda x: x != cls.amp), ents)
         for ent in ents:
             entname = ent[1:-1]
             try:
                 s = s.replace(ent, chr(html.entities.name2codepoint[entname]))
             except (ValueError, KeyError):
                 pass
-        s = s.replace(amp, " and ")
+        s = s.replace(cls.amp, " and ")
         return s
 
     def tokenize_plain(self, text: str) -> t.Iterable[str]:
-        # Fix HTML character entitites:
+        # Fix HTML character entitites
         s = self.__html2string(text)
-        # Tokenize:
-        words = word_re.findall(s)
+        # Tokenize
+        words = self.words_re.findall(s)
         # Possible alter the case, but avoid changing emoticons like :D into :d:
-        words = list(map(lambda x: x if emoticon_re.search(x) else x.lower(), words))
-        # Return the results
-        return words
+        words = list(map(lambda x: x if self.emoticon_re.search(x) else x.lower(), words))
+        # Re-join words
+        result = " ".join(words)
+        # Return the result
+        return result
 
 
 class PottsTokenizerWithNegation(PottsTokenizer):
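The last visible change is that tokenize_plain now lowercases every token except emoticon matches and re-joins them into one string, in line with the new BaseTokenizer contract. A reduced sketch of just that casing-and-joining step; the token list is hard-coded here instead of coming from words_re.findall, and only the emoticon pattern is copied from the diff:

    import re

    # Emoticon pattern from the diff, enough for this sketch.
    emoticon_re = re.compile(
        r"""
        [<>]?
        [:;=8]                 # eyes
        [\-o*']?               # optional nose
        [)\](\[dDpP/:}{@|\\]   # mouth
        |
        [)\](\[dDpP/:}{@|\\]   # mouth
        [\-o*']?               # optional nose
        [:;=8]                 # eyes
        [<>]?
        """,
        re.VERBOSE | re.I,
    )

    # Tokens as words_re.findall() might produce them (hard-coded for the sketch).
    words = ["GREAT", "movie", ":D", "10/10"]

    # Lowercase everything except emoticons, then re-join, as tokenize_plain now does.
    words = [w if emoticon_re.search(w) else w.lower() for w in words]
    print(" ".join(words))
    # great movie :D 10/10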