From 3d9eeecb2a1138dd02239acd0d32b3c1a5186274 Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi
Date: Fri, 10 Feb 2023 05:12:07 +0100
Subject: [PATCH] Improve the tokenizer situation

---
 unimore_bda_6/__main__.py                     |   6 +-
 unimore_bda_6/tokenizer/base.py               |  18 +-
 unimore_bda_6/tokenizer/lower.py              |   6 +-
 unimore_bda_6/tokenizer/nltk_word_tokenize.py |   5 +-
 unimore_bda_6/tokenizer/plain.py              |   5 +-
 unimore_bda_6/tokenizer/potts.py              | 248 +++++++-----------
 6 files changed, 122 insertions(+), 166 deletions(-)

diff --git a/unimore_bda_6/__main__.py b/unimore_bda_6/__main__.py
index 587fe84..7d3b7f0 100644
--- a/unimore_bda_6/__main__.py
+++ b/unimore_bda_6/__main__.py
@@ -39,19 +39,19 @@ def main():
         slog.debug("Selected sample_func: %s", sample_func.__name__)

         for SentimentAnalyzer in [
+            NLTKSentimentAnalyzer,
             TensorflowCategorySentimentAnalyzer,
-            NLTKSentimentAnalyzer
         ]:

             slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}")
             slog.debug("Selected SentimentAnalyzer: %s", SentimentAnalyzer.__name__)

             for Tokenizer in [
+                PottsTokenizer,
+                PottsTokenizerWithNegation,
                 PlainTokenizer,
                 LowercaseTokenizer,
                 NLTKWordTokenizer,
-                PottsTokenizer,
-                PottsTokenizerWithNegation,
             ]:

                 slog = logging.getLogger(f"{__name__}.{sample_func.__name__}.{SentimentAnalyzer.__name__}.{Tokenizer.__name__}")
diff --git a/unimore_bda_6/tokenizer/base.py b/unimore_bda_6/tokenizer/base.py
index f4b28e8..61349e0 100644
--- a/unimore_bda_6/tokenizer/base.py
+++ b/unimore_bda_6/tokenizer/base.py
@@ -21,15 +21,29 @@ class BaseTokenizer:
         return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)

     @__not_implemented
-    def tokenize_plain(self, text: str) -> list[str]:
+    def tokenize_plain(self, text: str) -> str:
         """
-        Convert a text string into a list of tokens.
+        Convert a text `str` into another `str` containing a series of whitespace-separated tokens.
         """
         raise NotImplementedError()

+    def tokenize_and_split_plain(self, text: str) -> list[str]:
+        """
+        Run `.tokenize_plain`, then split the result using `str.split`.
+        """
+        return self.tokenize_plain(text).split()
+
     @__not_implemented
     def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
         """
         Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
         """
         raise NotImplementedError()
+
+    def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
+        """
+        Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
+        """
+        text = self.tokenize_tensorflow(text)
+        text = tensorflow.expand_dims(text, -1, name="tokens")
+        return text
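
The new `BaseTokenizer` interface standardizes on plain strings: `tokenize_plain` now returns a single whitespace-separated string, and callers that need a token list go through the new `tokenize_and_split_plain` helper. A minimal standalone sketch of that contract, for illustration only (the `ExampleLowercaseTokenizer` subclass below is hypothetical and not part of the patch):

    class BaseTokenizer:
        def tokenize_plain(self, text: str) -> str:
            # Subclasses convert text into a whitespace-separated token string.
            raise NotImplementedError()

        def tokenize_and_split_plain(self, text: str) -> list[str]:
            # Shared helper: run tokenize_plain, then split on whitespace.
            return self.tokenize_plain(text).split()

    class ExampleLowercaseTokenizer(BaseTokenizer):
        def tokenize_plain(self, text: str) -> str:
            return text.lower()

    tokenizer = ExampleLowercaseTokenizer()
    print(tokenizer.tokenize_plain("Good Movie!"))            # good movie!
    print(tokenizer.tokenize_and_split_plain("Good Movie!"))  # ['good', 'movie!']
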
""" - def tokenize_plain(self, text: str) -> list[str]: - return text.lower().split() + def tokenize_plain(self, text: str) -> str: + text = text.lower() + return text def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor: text = tensorflow.strings.lower(text) - text = tensorflow.expand_dims(text, -1, name="tokens") return text diff --git a/unimore_bda_6/tokenizer/nltk_word_tokenize.py b/unimore_bda_6/tokenizer/nltk_word_tokenize.py index e56776c..f96c8b0 100644 --- a/unimore_bda_6/tokenizer/nltk_word_tokenize.py +++ b/unimore_bda_6/tokenizer/nltk_word_tokenize.py @@ -1,6 +1,5 @@ import nltk import nltk.sentiment.util -import typing as t from .base import BaseTokenizer @@ -10,10 +9,10 @@ class NLTKWordTokenizer(BaseTokenizer): Tokenizer based on `nltk.word_tokenize`. """ - def tokenize_plain(self, text: str) -> t.Iterable[str]: + def tokenize_plain(self, text: str) -> str: tokens = nltk.word_tokenize(text) nltk.sentiment.util.mark_negation(tokens, shallow=True) - return tokens + return " ".join(tokens) __all__ = ( diff --git a/unimore_bda_6/tokenizer/plain.py b/unimore_bda_6/tokenizer/plain.py index 206131d..b771401 100644 --- a/unimore_bda_6/tokenizer/plain.py +++ b/unimore_bda_6/tokenizer/plain.py @@ -8,9 +8,8 @@ class PlainTokenizer(BaseTokenizer): Tokenizer which just splits the text into tokens by separating them at whitespaces. """ - def tokenize_plain(self, text: str) -> list[str]: - return text.split() + def tokenize_plain(self, text: str) -> str: + return text def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor: - text = tensorflow.expand_dims(text, -1, name="tokens") return text diff --git a/unimore_bda_6/tokenizer/potts.py b/unimore_bda_6/tokenizer/potts.py index bc44fb3..d30c6cb 100644 --- a/unimore_bda_6/tokenizer/potts.py +++ b/unimore_bda_6/tokenizer/potts.py @@ -1,52 +1,4 @@ -""" -========================= -Original module docstring -========================= - -This code implements a basic, Twitter-aware tokenizer. - -A tokenizer is a function that splits a string of text into words. In -Python terms, we map string and unicode objects into lists of unicode -objects. - -There is not a single right way to do tokenizing. The best method -depends on the application. This tokenizer is designed to be flexible -and this easy to adapt to new domains and tasks. The basic logic is -this: - -1. The tuple regex_strings defines a list of regular expression - strings. - -2. The regex_strings strings are put, in order, into a compiled - regular expression object called word_re. - -3. The tokenization is done by word_re.findall(s), where s is the - user-supplied string, inside the tokenize() method of the class - Tokenizer. - -4. When instantiating Tokenizer objects, there is a single option: - preserve_case. By default, it is set to True. If it is set to - False, then the tokenizer will downcase everything except for - emoticons. - -The __main__ method illustrates by tokenizing a few examples. - -I've also included a Tokenizer method tokenize_random_tweet(). If the -twitter library is installed (http://code.google.com/p/python-twitter/) -and Twitter is cooperating, then it should tokenize a random -English-language tweet. 
-""" - -__author__ = "Christopher Potts" -__copyright__ = "Copyright 2011, Christopher Potts" -__credits__ = [] -__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/" -__version__ = "1.0" -__maintainer__ = "Christopher Potts" -__email__ = "See the author's website" - -###################################################################### - +import tensorflow import re import html.entities import typing as t @@ -54,108 +6,98 @@ import nltk.sentiment.util from .base import BaseTokenizer -###################################################################### -# The following strings are components in the regular expression -# that is used for tokenizing. It's important that phone_number -# appears first in the final regex (since it can contain whitespace). -# It also could matter that tags comes after emoticons, due to the -# possibility of having text like -# -# <:| and some text >:) -# -# Most imporatantly, the final element should always be last, since it -# does a last ditch whitespace-based tokenization of whatever is left. - -# This particular element is used in a couple ways, so we define it -# with a name: -emoticon_string = r""" - (?: - [<>]? - [:;=8] # eyes - [\-o\*\']? # optional nose - [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth - | - [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth - [\-o\*\']? # optional nose - [:;=8] # eyes - [<>]? - )""" - -# The components of the tokenizer: -regex_strings = ( - # Phone numbers: - r""" - (?: - (?: # (international) - \+?[01] - [\-\s.]* - )? - (?: # (area code) - [\(]? - \d{3} - [\-\s.\)]* - )? - \d{3} # exchange - [\-\s.]* - \d{4} # base - )""" - , - # Emoticons: - emoticon_string - , - # HTML tags: - r"""<[^>]+>""" - , - # Twitter username: - r"""(?:@[\w_]+)""" - , - # Twitter hashtags: - r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""" - , - # Remaining word types: - r""" - (?:[a-z][a-z'\-_]+[a-z]) # Words with apostrophes or dashes. - | - (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals. - | - (?:[\w_]+) # Words without apostrophes or dashes. - | - (?:\.(?:\s*\.){1,}) # Ellipsis dots. - | - (?:\S) # Everything else that isn't whitespace. - """ -) - -###################################################################### -# This is the core tokenizing regex: - -word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE) - -# The emoticon string gets its own regex so that we can preserve case for them as needed: -emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE) - -# These are for regularizing HTML entities to Unicode: -html_entity_digit_re = re.compile(r"&#\d+;") -html_entity_alpha_re = re.compile(r"&\w+;") -amp = "&" - - -###################################################################### - class PottsTokenizer(BaseTokenizer): """ - Tokenizer based on `Christopher Potts' tokenizer `_. + Tokenizer based on `Christopher Potts' tokenizer `_, released in 2011. + + This module is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ . """ - @staticmethod - def __html2string(s: str) -> str: + # noinspection RegExpRepeatedSpace + # language=pythonregexp + emoticon_re_string = r""" + [<>]? + [:;=8] # eyes + [\-o*']? # optional nose + [)\](\[dDpP/:}{@|\\] # mouth + | + [)\](\[dDpP/:}{@|\\] # mouth + [\-o*']? # optional nose + [:;=8] # eyes + [<>]? 
""" - Internal metod that seeks to replace all the HTML entities in - s with their corresponding unicode characters. + + emoticon_re = re.compile(emoticon_re_string, re.VERBOSE | re.I) + + # noinspection RegExpRepeatedSpace,RegExpUnnecessaryNonCapturingGroup + # language=pythonregexp + words_re_string = ( + # Emoticons: + emoticon_re_string + , + # Phone numbers: + r""" + (?: # (international) + \+?[01] + [\-\s.]* + )? + (?: # (area code) + [(]? + \d{3} + [\-\s.)]* + )? + \d{3} # exchange + [\-\s.]* + \d{4} # base + """ + , + # HTML tags: + r"""<[^>]+>""" + , + # Twitter username: + r"""@[\w_]+""" + , + # Twitter hashtags: + r"""#+[\w_]+[\w'_\-]*[\w_]+""" + , + # Words with apostrophes or dashes + r"""[a-z][a-z'\-_]+[a-z]""" + , + # Numbers, including fractions, decimals + r"""[+\-]?\d+[,/.:-]\d+[+\-]?""" + , + # Words without apostrophes or dashes + r"""[\w_]+""" + , + # Ellipsis dots + r"""\.(?:\s*\.)+""" + , + # Everything else that isn't whitespace + r"""(?:\S)""" + ) + + words_re = re.compile("|".join(words_re_string), re.VERBOSE | re.I) + + # language=pythonregexp + digit_re_string = r"&#\d+;" + + digit_re = re.compile(digit_re_string, re.VERBOSE) + + # language=pythonregexp + alpha_re_string = r"&\w+;" + + alpha_re = re.compile(alpha_re_string, re.VERBOSE) + + amp = "&" + + @classmethod + def __html2string(cls, s: str) -> str: + """ + Internal metod that seeks to replace all the HTML entities in s with their corresponding characters. """ # First the digits: - ents = set(html_entity_digit_re.findall(s)) + ents = set(cls.digit_re.findall(s)) if len(ents) > 0: for ent in ents: entnum = ent[2:-1] @@ -165,26 +107,28 @@ class PottsTokenizer(BaseTokenizer): except (ValueError, KeyError): pass # Now the alpha versions: - ents = set(html_entity_alpha_re.findall(s)) - ents = filter((lambda x: x != amp), ents) + ents = set(cls.alpha_re.findall(s)) + ents = filter((lambda x: x != cls.amp), ents) for ent in ents: entname = ent[1:-1] try: s = s.replace(ent, chr(html.entities.name2codepoint[entname])) except (ValueError, KeyError): pass - s = s.replace(amp, " and ") + s = s.replace(cls.amp, " and ") return s def tokenize_plain(self, text: str) -> t.Iterable[str]: - # Fix HTML character entitites: + # Fix HTML character entitites s = self.__html2string(text) - # Tokenize: - words = word_re.findall(s) + # Tokenize + words = self.words_re.findall(s) # Possible alter the case, but avoid changing emoticons like :D into :d: - words = list(map(lambda x: x if emoticon_re.search(x) else x.lower(), words)) - # Return the results - return words + words = list(map(lambda x: x if self.emoticon_re.search(x) else x.lower(), words)) + # Re-join words + result = " ".join(words) + # Return the result + return result class PottsTokenizerWithNegation(PottsTokenizer):