1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-24 16:54:20 +00:00

Include typing module in Potts' tokenizer

This commit is contained in:
Steffo 2023-02-02 04:17:43 +01:00
parent c7345cb3a3
commit 569f9e5359
Signed by: steffo
GPG key ID: 2A24051445686895

View file

@ -53,6 +53,7 @@ __email__ = "See the author's website"
import re
import html.entities
import typing as t
######################################################################
# The following strings are components in the regular expression
@ -145,17 +146,11 @@ class Tokenizer:
def __init__(self, preserve_case=False):
self.preserve_case = preserve_case
def tokenize(self, s):
def tokenize(self, s: str) -> t.Iterable[str]:
"""
Argument: s -- any string or unicode object
Argument: s -- any string object
Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
"""
# Try to ensure unicode:
try:
s = unicode(s)
except UnicodeDecodeError:
s = str(s).encode('string_escape')
s = unicode(s)
# Fix HTML character entities:
s = self.__html2unicode(s)
# Tokenize: