mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-25 17:24:20 +00:00
In Potts' tokenizer, use `html.entities` instead of `htmlentitydefs`
This commit is contained in:
parent
a85131cb58
commit
c7345cb3a3
1 changed file with 3 additions and 2 deletions
5
unimore_bda_6/vendor/potts.py
vendored
5
unimore_bda_6/vendor/potts.py
vendored
|
@ -3,6 +3,7 @@ This file is a vendored version of `Christopher Potts' tokenizer <http://sentime
|
||||||
|
|
||||||
It has been altered to be used with Python 3.10, but the code is mostly the same.
|
It has been altered to be used with Python 3.10, but the code is mostly the same.
|
||||||
|
|
||||||
|
=========================
|
||||||
Original module docstring
|
Original module docstring
|
||||||
=========================
|
=========================
|
||||||
|
|
||||||
|
@ -51,7 +52,7 @@ __email__ = "See the author's website"
|
||||||
######################################################################
|
######################################################################
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import htmlentitydefs
|
import html.entities
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
# The following strings are components in the regular expression
|
# The following strings are components in the regular expression
|
||||||
|
@ -204,7 +205,7 @@ class Tokenizer:
|
||||||
for ent in ents:
|
for ent in ents:
|
||||||
entname = ent[1:-1]
|
entname = ent[1:-1]
|
||||||
try:
|
try:
|
||||||
s = s.replace(ent, unichr(htmlentitydefs.name2codepoint[entname]))
|
s = s.replace(ent, unichr(html.entities.name2codepoint[entname]))
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
s = s.replace(amp, " and ")
|
s = s.replace(amp, " and ")
|
||||||
|
|
Loading…
Reference in a new issue