1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-22 07:54:19 +00:00

In Potts' tokenizer, use html.entities instead of htmlentitydefs

This commit is contained in:
Steffo 2023-02-02 04:12:56 +01:00
parent a85131cb58
commit c7345cb3a3
Signed by: steffo
GPG key ID: 2A24051445686895

View file

@ -3,6 +3,7 @@ This file is a vendored version of `Christopher Potts' tokenizer <http://sentime
It has been altered to be used with Python 3.10, but the code is mostly the same. It has been altered to be used with Python 3.10, but the code is mostly the same.
=========================
Original module docstring Original module docstring
========================= =========================
@ -51,7 +52,7 @@ __email__ = "See the author's website"
###################################################################### ######################################################################
import re import re
-import htmlentitydefs
+import html.entities
###################################################################### ######################################################################
# The following strings are components in the regular expression # The following strings are components in the regular expression
@ -204,7 +205,7 @@ class Tokenizer:
for ent in ents: for ent in ents:
entname = ent[1:-1] entname = ent[1:-1]
try: try:
-s = s.replace(ent, unichr(htmlentitydefs.name2codepoint[entname]))
+s = s.replace(ent, unichr(html.entities.name2codepoint[entname]))
except: except:
pass pass
s = s.replace(amp, " and ") s = s.replace(amp, " and ")