2023-02-10 04:12:07 +00:00
|
|
|
import tensorflow
|
2023-02-02 03:12:25 +00:00
|
|
|
import re
|
2023-02-02 03:12:56 +00:00
|
|
|
import html.entities
|
2023-02-02 03:17:43 +00:00
|
|
|
import typing as t
|
2023-02-03 22:27:44 +00:00
|
|
|
import nltk.sentiment.util
|
|
|
|
|
|
|
|
from .base import BaseTokenizer
|
2023-02-02 03:12:25 +00:00
|
|
|
|
2023-02-02 16:24:11 +00:00
|
|
|
|
2023-02-03 22:27:44 +00:00
|
|
|
class PottsTokenizer(BaseTokenizer):
    """
    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.

    This module is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
    """

    # noinspection RegExpRepeatedSpace
    # language=pythonregexp
    emoticon_re_string = r"""[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@|\\]"""
    # Compiled WITHOUT re.I, so that it can tell ":D" apart from ":d".
    emoticon_re = re.compile(emoticon_re_string)

    # The alternatives are tried in order, so the more specific patterns must
    # come before the generic catch-alls at the end of the list.
    words_re_string = "(" + "|".join([
        # Emoticons:
        emoticon_re_string,
        # Phone numbers:
        # language=pythonregexp
        r"""(?:[+]?[01][\s.-]*)?(?:[(]?\d{3}[\s.)-]*)?\d{3}[\-\s.]*\d{4}""",
        # HTML tags:
        # language=pythonregexp
        r"""<[^>]+>""",
        # Twitter username:
        # language=pythonregexp
        r"""@[\w_]+""",
        # Twitter hashtags:
        # language=pythonregexp
        r"""#+[\w_]+[\w'_-]*[\w_]+""",
        # Words with apostrophes or dashes
        # language=pythonregexp
        r"""[a-z][a-z'_-]+[a-z]""",
        # Numbers, including fractions, decimals
        # language=pythonregexp
        r"""[+-]?\d+(?:[,/.:-]\d+)?""",
        # Words without apostrophes or dashes
        # language=pythonregexp
        r"""[\w_]+""",
        # Ellipsis dots
        # language=pythonregexp
        r"""[.](?:\s*[.])+""",
        # Everything else that isn't whitespace
        # language=pythonregexp
        r"""\S+""",
    ]) + ")"
    words_re = re.compile(words_re_string, re.I)

    # Numeric (decimal) HTML character references, e.g. "&#38;".
    # language=pythonregexp
    digit_re_string = r"&#\d+;"
    digit_re = re.compile(digit_re_string)

    # Named HTML character references, e.g. "&amp;".
    # language=pythonregexp
    alpha_re_string = r"&\w+;"
    alpha_re = re.compile(alpha_re_string)

    # Ampersands still present after entity decoding are spelled out as " and ".
    amp = "&"

    @classmethod
    def __html2string(cls, s: str) -> str:
        """
        Internal method that seeks to replace all the HTML entities in s with their corresponding characters.

        Decimal references (``&#NNN;``) are decoded first, then named ones
        (``&name;``); finally every remaining ``cls.amp`` is replaced by
        ``" and "``. References that cannot be decoded are left untouched.

        :param s: The string to process.
        :return: The string with the recognised entities replaced.
        """
        # First the digits:
        for ent in set(cls.digit_re.findall(s)):
            # Strip the leading "&#" and the trailing ";".
            entnum = ent[2:-1]
            try:
                replacement = chr(int(entnum))
            except (ValueError, OverflowError):
                # chr() raises ValueError for code points outside the Unicode
                # range and OverflowError for integers too large for a C int;
                # in both cases the reference is invalid and is kept verbatim.
                continue
            s = s.replace(ent, replacement)

        # Now the alpha versions:
        named_ents = [ent for ent in set(cls.alpha_re.findall(s)) if ent != cls.amp]
        for ent in named_ents:
            # Strip the leading "&" and the trailing ";".
            entname = ent[1:-1]
            try:
                replacement = chr(html.entities.name2codepoint[entname])
            except (ValueError, KeyError):
                # Unknown entity name: keep the text as it is.
                continue
            s = s.replace(ent, replacement)

        return s.replace(cls.amp, " and ")

    def tokenize_plain(self, text: str) -> str:
        """
        Tokenize the given text and return the tokens joined by single spaces.

        :param text: The text to tokenize.
        :return: The space-separated tokens, lowercased except for those matching an emoticon.
        """
        # Fix HTML character entities
        s = self.__html2string(text)
        # Tokenize
        words = self.words_re.findall(s)
        # Possibly alter the case, but avoid changing emoticons like :D into :d.
        # NOTE(review): search() is used rather than fullmatch(), so any token
        # merely *containing* an emoticon-like substring keeps its case as well.
        words = [
            word if self.emoticon_re.search(word) else word.lower()
            for word in words
        ]
        # Re-join words and return the result
        return " ".join(words)
|
2023-02-03 22:27:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
class PottsTokenizerWithNegation(PottsTokenizer):
    """
    Variant of :class:`PottsTokenizer` that additionally passes the tokens through :func:`nltk.sentiment.util.mark_negation`.
    """

    def tokenize_plain(self, text: str) -> str:
        """
        Tokenize the text like the parent class, then apply negation marking to the tokens.

        :param text: The text to tokenize.
        :return: The space-separated tokens after negation marking.
        """
        # The parent returns a space-joined string; split it back into tokens.
        tokens = super().tokenize_plain(text).split()
        # shallow=True is passed so that the token list is altered in place,
        # which is why the return value of the call is not used.
        nltk.sentiment.util.mark_negation(tokens, shallow=True)
        return " ".join(tokens)
|
2023-02-02 16:24:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Names exported by `from <this module> import *`.
__all__ = (
    "PottsTokenizer",
    "PottsTokenizerWithNegation",
)
|