Mirror of https://github.com/Steffo99/unimore-bda-6.git, synced 2024-11-24 16:54:20 +00:00
Include typing module in Potts' tokenizer
parent c7345cb3a3
commit 569f9e5359
1 changed file with 4 additions and 9 deletions
unimore_bda_6/vendor/potts.py (vendored)
@@ -53,6 +53,7 @@ __email__ = "See the author's website"
 import re
 import html.entities
+import typing as t
 
 ######################################################################
 # The following strings are components in the regular expression
 
@@ -145,17 +146,11 @@ class Tokenizer:
     def __init__(self, preserve_case=False):
         self.preserve_case = preserve_case
 
-    def tokenize(self, s):
+    def tokenize(self, s: str) -> t.Iterable[str]:
         """
-        Argument: s -- any string or unicode object
+        Argument: s -- any string object
         Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False
         """
-        # Try to ensure unicode:
-        try:
-            s = unicode(s)
-        except UnicodeDecodeError:
-            s = str(s).encode('string_escape')
-            s = unicode(s)
         # Fix HTML character entitites:
         s = self.__html2unicode(s)
         # Tokenize:
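For context: the new annotation tells callers that tokenize() takes a str and yields an iterable of string tokens, while the deleted try/except block relied on Python 2's unicode() builtin and the 'string_escape' codec, neither of which exists in Python 3. A minimal usage sketch under those assumptions; the import path mirrors the file path and the sample sentence is illustrative, not taken from the repository:

    import typing as t

    # Hypothetical import path, assuming the vendored file keeps Potts' Tokenizer class:
    from unimore_bda_6.vendor.potts import Tokenizer

    # preserve_case=False lowercases everything except emoticons, as in Potts' original tokenizer
    tokenizer = Tokenizer(preserve_case=False)

    # tokenize() is now annotated as (str) -> t.Iterable[str]
    tokens: t.Iterable[str] = tokenizer.tokenize("I LOVED this movie :-) would watch again!")
    print(list(tokens))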