Include typing module in Potts' tokenizer

2024-11-24 16:54:20 +00:00 · 2023-02-02 04:17:43 +01:00 · 2023-02-02 04:17:43 +01:00 · 569f9e5359
commit 569f9e5359
parent c7345cb3a3
1 changed file with 4 additions and 9 deletions
--- a/unimore_bda_6/vendor/potts.py
+++ b/unimore_bda_6/vendor/potts.py
@ -53,6 +53,7 @@ __email__ = "See the author's website"

 import re
 import html.entities
+import typing as t

 ######################################################################
 # The following strings are components in the regular expression
@ -145,17 +146,11 @@ class Tokenizer:
    def __init__(self, preserve_case=False):
        self.preserve_case = preserve_case

-    def tokenize(self, s):
+    def tokenize(self, s: str) -> t.Iterable[str]:
        """
-        Argument: s -- any string or unicode object
+        Argument: s -- any string object
        Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
        """
-        # Try to ensure unicode:
-        try:
-            s = unicode(s)
-        except UnicodeDecodeError:
-            s = str(s).encode('string_escape')
-            s = unicode(s)
        # Fix HTML character entities:
        s = self.__html2unicode(s)
        # Tokenize: