mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-24 16:54:20 +00:00
Include typing module in Potts' tokenizer
This commit is contained in:
parent
c7345cb3a3
commit
569f9e5359
1 changed file with 4 additions and 9 deletions
13
unimore_bda_6/vendor/potts.py
vendored
13
unimore_bda_6/vendor/potts.py
vendored
|
@ -53,6 +53,7 @@ __email__ = "See the author's website"
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import html.entities
|
import html.entities
|
||||||
|
import typing as t
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
# The following strings are components in the regular expression
|
# The following strings are components in the regular expression
|
||||||
|
@ -145,17 +146,11 @@ class Tokenizer:
|
||||||
def __init__(self, preserve_case=False):
|
def __init__(self, preserve_case=False):
|
||||||
self.preserve_case = preserve_case
|
self.preserve_case = preserve_case
|
||||||
|
|
||||||
def tokenize(self, s):
|
def tokenize(self, s: str) -> t.Iterable[str]:
|
||||||
"""
|
"""
|
||||||
Argument: s -- any string or unicode object
|
Argument: s -- any string object
|
||||||
Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
|
Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
|
||||||
"""
|
"""
|
||||||
# Try to ensure unicode:
|
|
||||||
try:
|
|
||||||
s = unicode(s)
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
s = str(s).encode('string_escape')
|
|
||||||
s = unicode(s)
|
|
||||||
# Fix HTML character entities:
|
# Fix HTML character entities:
|
||||||
s = self.__html2unicode(s)
|
s = self.__html2unicode(s)
|
||||||
# Tokenize:
|
# Tokenize:
|
||||||
|
|
Loading…
Reference in a new issue