In Potts' tokenizer, use html.entities instead of htmlentitydefs

2024-11-21 23:44:19 +00:00 · 2023-02-02 04:12:56 +01:00 · 2023-02-02 04:12:56 +01:00 · c7345cb3a3
commit c7345cb3a3
parent a85131cb58
1 changed file with 3 additions and 2 deletions
--- a/unimore_bda_6/vendor/potts.py
+++ b/unimore_bda_6/vendor/potts.py
@@ -3,6 +3,7 @@ This file is a vendored version of `Christopher Potts' tokenizer <http://sentime

 It has been altered to be used with Python 3.10, but the code is mostly the same.

+=========================
 Original module docstring
 =========================

@@ -51,7 +52,7 @@ __email__ = "See the author's website"
 ######################################################################

 import re
-import htmlentitydefs
+import html.entities

 ######################################################################
 # The following strings are components in the regular expression
@@ -204,7 +205,7 @@ class Tokenizer:
        for ent in ents:
            entname = ent[1:-1]
            try:            
-                s = s.replace(ent, unichr(htmlentitydefs.name2codepoint[entname]))
+                s = s.replace(ent, unichr(html.entities.name2codepoint[entname]))
            except:
                pass                    
            s = s.replace(amp, " and ")