From 3d9eeecb2a1138dd02239acd0d32b3c1a5186274 Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi
Date: Fri, 10 Feb 2023 05:12:07 +0100
Subject: [PATCH] Improve the tokenizer situation

---
 unimore_bda_6/__main__.py                     |   6 +-
 unimore_bda_6/tokenizer/base.py               |  18 +-
 unimore_bda_6/tokenizer/lower.py              |   6 +-
 unimore_bda_6/tokenizer/nltk_word_tokenize.py |   5 +-
 unimore_bda_6/tokenizer/plain.py              |   5 +-
 unimore_bda_6/tokenizer/potts.py              | 248 +++++++-----------
 6 files changed, 122 insertions(+), 166 deletions(-)

diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index 587fe84..7d3b7f0 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -39,19 +39,19 @@ def main():
         slog.debug("Selected sample_func: %s", sample_func.__name__)

         for SentimentAnalyzer in [
+            NLTKSentimentAnalyzer,
             TensorflowCategorySentimentAnalyzer,
-            NLTKSentimentAnalyzer
         ]:

             slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
             slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)

             for Tokenizer in [
+                PottsTokenizer,
+                PottsTokenizerWithNegation,
                 PlainTokenizer,
                 LowercaseTokenizer,
                 NLTKWordTokenizer,
-                PottsTokenizer,
-                PottsTokenizerWithNegation,
             ]:

                 slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
diff --git a/unimore_bda_6/tokenizer/base.py b/unimore_bda_6/tokenizer/base.py
index f4b28e8..61349e0 100644
--- a/unimore_bda_6/tokenizer/base.py
+++ b/unimore_bda_6/tokenizer/base.py
@@ -21,15 +21,29 @@ class BaseTokenizer:
         return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)

     @__not_implemented
-    def tokenize_plain(self, text: str) -> list[str]:
+    def tokenize_plain(self, text: str) -> str:
         """
-        Convert a text string into a list of tokens.
+        Convert a text `str` into another `str` containing a series of whitespace-separated tokens.
         """
         raise NotImplementedError()

+    def tokenize_and_split_plain(self, text: str) -> list[str]:
+        """
+        Run `.tokenize_plain`, then split the result using `str.split`.
+        """
+        return self.tokenize_plain(text).split()
+
     @__not_implemented
     def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
         """
         Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
         """
         raise NotImplementedError()
+
+    def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+        """
+        Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
+        """
+        text = self.tokenize_tensorflow(text)
+        text = tensorflow.expand_dims(text, -1, name="tokens")
+        return text
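
The new `BaseTokenizer` interface standardizes on plain strings: `tokenize_plain` now returns a single whitespace-separated string, and callers that need a token list go through the new `tokenize_and_split_plain` helper. A minimal standalone sketch of that contract, for illustration only (the `ExampleLowercaseTokenizer` subclass below is hypothetical and not part of the patch):

    class BaseTokenizer:
        def tokenize_plain(self, text: str) -> str:
            # Subclasses convert text into a whitespace-separated token string.
            raise NotImplementedError()

        def tokenize_and_split_plain(self, text: str) -> list[str]:
            # Shared helper: run tokenize_plain, then split on whitespace.
            return self.tokenize_plain(text).split()

    class ExampleLowercaseTokenizer(BaseTokenizer):
        def tokenize_plain(self, text: str) -> str:
            return text.lower()

    tokenizer = ExampleLowercaseTokenizer()
    print(tokenizer.tokenize_plain("Good Movie!"))            # good movie!
    print(tokenizer.tokenize_and_split_plain("Good Movie!"))  # ['good', 'movie!']
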
""" - def tokenize_plain(self, text: str) -> list[str]: - return text.lower().split() + def tokenize_plain(self, text: str) -> str: + text = text.lower() + return text def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor: text = tensorflow.strings.lower(text) - text = tensorflow.expand_dims(text, -1, name="tokens") return text diff --git a/unimore_bda_6/tokenizer/nltk_word_tokenize.py b/unimore_bda_6/tokenizer/nltk_word_tokenize.py index e56776c..f96c8b0 100644 --- a/unimore_bda_6/tokenizer/nltk_word_tokenize.py +++ b/unimore_bda_6/tokenizer/nltk_word_tokenize.py @@ -1,6 +1,5 @@ import nltk import nltk.sentiment.util -import typing as t from .base import BaseTokenizer @@ -10,10 +9,10 @@ class NLTKWordTokenizer(BaseTokenizer): Tokenizer based on `nltk.word_tokenize`. """ - def tokenize_plain(self, text: str) -> t.Iterable[str]: + def tokenize_plain(self, text: str) -> str: tokens = nltk.word_tokenize(text) nltk.sentiment.util.mark_negation(tokens, shallow=True) - return tokens + return " ".join(tokens) __all__ = ( diff --git a/unimore_bda_6/tokenizer/plain.py b/unimore_bda_6/tokenizer/plain.py index 206131d..b771401 100644 --- a/unimore_bda_6/tokenizer/plain.py +++ b/unimore_bda_6/tokenizer/plain.py @@ -8,9 +8,8 @@ class PlainTokenizer(BaseTokenizer): Tokenizer which just splits the text into tokens by separating them at whitespaces. """ - def tokenize_plain(self, text: str) -> list[str]: - return text.split() + def tokenize_plain(self, text: str) -> str: + return text def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor: - text = tensorflow.expand_dims(text, -1, name="tokens") return text diff --git a/unimore_bda_6/tokenizer/potts.py b/unimore_bda_6/tokenizer/potts.py index bc44fb3..d30c6cb 100644 --- a/unimore_bda_6/tokenizer/potts.py +++ b/unimore_bda_6/tokenizer/potts.py @@ -1,52 +1,4 @@ -""" -========================= -Original module docstring -========================= - -This code implements a basic, Twitter-aware tokenizer. - -A tokenizer is a function that splits a string of text into words. In -Python terms, we map string and unicode objects into lists of unicode -objects. - -There is not a single right way to do tokenizing. The best method -depends on the application. This tokenizer is designed to be flexible -and this easy to adapt to new domains and tasks. The basic logic is -this: - -1. The tuple regex_strings defines a list of regular expression - strings. - -2. The regex_strings strings are put, in order, into a compiled - regular expression object called word_re. - -3. The tokenization is done by word_re.findall(s), where s is the - user-supplied string, inside the tokenize() method of the class - Tokenizer. - -4. When instantiating Tokenizer objects, there is a single option: - preserve_case. By default, it is set to True. If it is set to - False, then the tokenizer will downcase everything except for - emoticons. - -The __main__ method illustrates by tokenizing a few examples. - -I've also included a Tokenizer method tokenize_random_tweet(). If the -twitter library is installed (http://code.google.com/p/python-twitter/) -and Twitter is cooperating, then it should tokenize a random -English-language tweet. 
-""" - -__author__ = "Christopher Potts" -__copyright__ = "Copyright 2011, Christopher Potts" -__credits__ = [] -__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/" -__version__ = "1.0" -__maintainer__ = "Christopher Potts" -__email__ = "See the author's website" - -###################################################################### - +import tensorflow import re import html.entities import typing as t @@ -54,108 +6,98 @@ import nltk.sentiment.util from .base import BaseTokenizer -###################################################################### -# The following strings are components in the regular expression -# that is used for tokenizing. It's important that phone_number -# appears first in the final regex (since it can contain whitespace). -# It also could matter that tags comes after emoticons, due to the -# possibility of having text like -# -# <:| and some text >:) -# -# Most imporatantly, the final element should always be last, since it -# does a last ditch whitespace-based tokenization of whatever is left. - -# This particular element is used in a couple ways, so we define it -# with a name: -emoticon_string = r""" - (?: - [<>]? - [:;=8] # eyes - [\-o\*\']? # optional nose - [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth - | - [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth - [\-o\*\']? # optional nose - [:;=8] # eyes - [<>]? - )""" - -# The components of the tokenizer: -regex_strings = ( - # Phone numbers: - r""" - (?: - (?: # (international) - \+?[01] - [\-\s.]* - )? - (?: # (area code) - [\(]? - \d{3} - [\-\s.\)]* - )? - \d{3} # exchange - [\-\s.]* - \d{4} # base - )""" - , - # Emoticons: - emoticon_string - , - # HTML tags: - r"""<[^>]+>""" - , - # Twitter username: - r"""(?:@[\w_]+)""" - , - # Twitter hashtags: - r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""" - , - # Remaining word types: - r""" - (?:[a-z][a-z'\-_]+[a-z]) # Words with apostrophes or dashes. - | - (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals. - | - (?:[\w_]+) # Words without apostrophes or dashes. - | - (?:\.(?:\s*\.){1,}) # Ellipsis dots. - | - (?:\S) # Everything else that isn't whitespace. - """ -) - -###################################################################### -# This is the core tokenizing regex: - -word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE) - -# The emoticon string gets its own regex so that we can preserve case for them as needed: -emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE) - -# These are for regularizing HTML entities to Unicode: -html_entity_digit_re = re.compile(r"&#\d+;") -html_entity_alpha_re = re.compile(r"&\w+;") -amp = "&" - - -###################################################################### - class PottsTokenizer(BaseTokenizer): """ - Tokenizer based on `Christopher Potts' tokenizer `_. + Tokenizer based on `Christopher Potts' tokenizer `_, released in 2011. + + This module is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ . """ - @staticmethod - def __html2string(s: str) -> str: + # noinspection RegExpRepeatedSpace + # language=pythonregexp + emoticon_re_string = r""" + [<>]? + [:;=8] # eyes + [\-o*']? # optional nose + [)\](\[dDpP/:}{@|\\] # mouth + | + [)\](\[dDpP/:}{@|\\] # mouth + [\-o*']? # optional nose + [:;=8] # eyes + [<>]? 
""" - Internal metod that seeks to replace all the HTML entities in - s with their corresponding unicode characters. + + emoticon_re = re.compile(emoticon_re_string, re.VERBOSE | re.I) + + # noinspection RegExpRepeatedSpace,RegExpUnnecessaryNonCapturingGroup + # language=pythonregexp + words_re_string = ( + # Emoticons: + emoticon_re_string + , + # Phone numbers: + r""" + (?: # (international) + \+?[01] + [\-\s.]* + )? + (?: # (area code) + [(]? + \d{3} + [\-\s.)]* + )? + \d{3} # exchange + [\-\s.]* + \d{4} # base + """ + , + # HTML tags: + r"""<[^>]+>""" + , + # Twitter username: + r"""@[\w_]+""" + , + # Twitter hashtags: + r"""#+[\w_]+[\w'_\-]*[\w_]+""" + , + # Words with apostrophes or dashes + r"""[a-z][a-z'\-_]+[a-z]""" + , + # Numbers, including fractions, decimals + r"""[+\-]?\d+[,/.:-]\d+[+\-]?""" + , + # Words without apostrophes or dashes + r"""[\w_]+""" + , + # Ellipsis dots + r"""\.(?:\s*\.)+""" + , + # Everything else that isn't whitespace + r"""(?:\S)""" + ) + + words_re = re.compile("|".join(words_re_string), re.VERBOSE | re.I) + + # language=pythonregexp + digit_re_string = r"&#\d+;" + + digit_re = re.compile(digit_re_string, re.VERBOSE) + + # language=pythonregexp + alpha_re_string = r"&\w+;" + + alpha_re = re.compile(alpha_re_string, re.VERBOSE) + + amp = "&" + + @classmethod + def __html2string(cls, s: str) -> str: + """ + Internal metod that seeks to replace all the HTML entities in s with their corresponding characters. """ # First the digits: - ents = set(html_entity_digit_re.findall(s)) + ents = set(cls.digit_re.findall(s)) if len(ents) > 0: for ent in ents: entnum = ent[2:-1] @@ -165,26 +107,28 @@ class PottsTokenizer(BaseTokenizer): except (ValueError, KeyError): pass # Now the alpha versions: - ents = set(html_entity_alpha_re.findall(s)) - ents = filter((lambda x: x != amp), ents) + ents = set(cls.alpha_re.findall(s)) + ents = filter((lambda x: x != cls.amp), ents) for ent in ents: entname = ent[1:-1] try: s = s.replace(ent, chr(html.entities.name2codepoint[entname])) except (ValueError, KeyError): pass - s = s.replace(amp, " and ") + s = s.replace(cls.amp, " and ") return s def tokenize_plain(self, text: str) -> t.Iterable[str]: - # Fix HTML character entitites: + # Fix HTML character entitites s = self.__html2string(text) - # Tokenize: - words = word_re.findall(s) + # Tokenize + words = self.words_re.findall(s) # Possible alter the case, but avoid changing emoticons like :D into :d: - words = list(map(lambda x: x if emoticon_re.search(x) else x.lower(), words)) - # Return the results - return words + words = list(map(lambda x: x if self.emoticon_re.search(x) else x.lower(), words)) + # Re-join words + result = " ".join(words) + # Return the result + return result class PottsTokenizerWithNegation(PottsTokenizer):