1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-29 11:14:19 +00:00
bda-6-steffo/unimore_bda_6/tokenizer/potts.py

145 lines
4.2 KiB
Python
Raw Normal View History

2023-02-02 03:12:25 +00:00
import re
import html.entities
import typing as t
2023-02-03 22:27:44 +00:00
import nltk.sentiment.util
from .base import BaseTokenizer
2023-02-02 03:12:25 +00:00
2023-02-02 16:24:11 +00:00
2023-02-03 22:27:44 +00:00
class PottsTokenizer(BaseTokenizer):
    """
    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.

    Tokenization happens in three steps (see `.tokenize`):

    1. HTML character entities are replaced with the characters they stand for;
    2. the text is split using `.words_re`;
    3. every token which does not contain an emoticon is lowercased.

    This class is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
    """

    # Pattern matching a single western-style emoticon, such as ``:)``, ``;-D`` or ``>:(``.
    # noinspection RegExpRepeatedSpace
    # language=pythonregexp
    emoticon_re_string = r"""[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@|\\]"""

    # Compiled WITHOUT re.I, so that it can be used to detect case-sensitive emoticons in already extracted tokens.
    emoticon_re = re.compile(emoticon_re_string)

    # Alternation of all token patterns, wrapped in a single capture group.
    # The order matters: the regex engine picks the first alternative that matches at a given position.
    words_re_string = "(" + "|".join([
        # Emoticons:
        emoticon_re_string
        ,
        # Phone numbers:
        # language=pythonregexp
        r"""(?:[+]?[01][\s.-]*)?(?:[(]?\d{3}[\s.)-]*)?\d{3}[\-\s.]*\d{4}"""
        ,
        # HTML tags:
        # language=pythonregexp
        r"""<[^>]+>"""
        ,
        # Twitter username:
        # language=pythonregexp
        r"""@[\w_]+"""
        ,
        # Twitter hashtags:
        # language=pythonregexp
        r"""#+[\w_]+[\w'_-]*[\w_]+"""
        ,
        # Words with apostrophes or dashes
        # language=pythonregexp
        r"""[a-z][a-z'_-]+[a-z]"""
        ,
        # Numbers, including fractions, decimals
        # language=pythonregexp
        r"""[+-]?\d+(?:[,/.:-]\d+)?"""
        ,
        # Words without apostrophes or dashes
        # language=pythonregexp
        r"""[\w_]+"""
        ,
        # Ellipsis dots
        # language=pythonregexp
        r"""[.](?:\s*[.])+"""
        ,
        # Everything else that isn't whitespace
        # language=pythonregexp
        r"""\S+"""
    ]) + ")"

    # Case-insensitive, so that the ``[a-z]`` classes above match uppercase letters too.
    words_re = re.compile(words_re_string, re.I)

    # Numeric (decimal) HTML entities, such as ``&#38;``.
    # language=pythonregexp
    digit_re_string = r"&#\d+;"

    digit_re = re.compile(digit_re_string)

    # Named HTML entities, such as ``&lt;``.
    # language=pythonregexp
    alpha_re_string = r"&\w+;"

    alpha_re = re.compile(alpha_re_string)

    # The ampersand entity is handled separately: it is replaced by the word " and " instead of the "&" character.
    amp = "&amp;"

    @classmethod
    def html_entities_to_chr(cls, s: str) -> str:
        """
        Internal method that seeks to replace all the HTML entities in s with their corresponding characters.

        Numeric entities are replaced first, then named ones, and finally ``&amp;`` is replaced with ``" and "``.
        Entities that cannot be converted (unknown names, code points that are not valid for `chr`) are left untouched.

        :param s: The string to replace entities in.
        :return: The string with the entities replaced.
        """
        # First the digits:
        for ent in set(cls.digit_re.findall(s)):
            try:
                # Strip the leading "&#" and the trailing ";"
                char = chr(int(ent[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: code point outside the Unicode range.
                # OverflowError: number too large to fit a C int; previously this crashed the tokenizer.
                continue
            s = s.replace(ent, char)

        # Now the alpha versions:
        named_ents = (ent for ent in set(cls.alpha_re.findall(s)) if ent != cls.amp)
        for ent in named_ents:
            # Strip the leading "&" and the trailing ";"
            codepoint = html.entities.name2codepoint.get(ent[1:-1])
            if codepoint is None:
                # Not a known HTML entity name, leave it as it is
                continue
            s = s.replace(ent, chr(codepoint))

        # Finally, the ampersand, which becomes a word instead of a symbol:
        s = s.replace(cls.amp, " and ")
        return s

    @classmethod
    def lower_but_preserve_emoticons(cls, word: str) -> str:
        """
        Internal method which lowercases the word if it does not match `.emoticon_re`.

        The match is performed with `re.Pattern.search`, so a word merely *containing* an emoticon is preserved as well.

        :param word: The token to possibly lowercase.
        :return: The unchanged token if it contains an emoticon, its lowercase version otherwise.
        """
        if cls.emoticon_re.search(word):
            return word
        return word.lower()

    def tokenize(self, text: str) -> t.Iterator[str]:
        """
        Split the given text into tokens.

        :param text: The text to tokenize.
        :return: The tokens, in order of appearance.
            Despite the annotation, a concrete `list` is returned rather than a lazy iterator;
            `.PottsTokenizerWithNegation` hands it to `nltk.sentiment.util.mark_negation` with ``shallow=True``.
        """
        # Fix HTML character entities
        text = self.html_entities_to_chr(text)
        # Tokenize, possibly altering the case, but avoid changing emoticons like :D into :d
        return [self.lower_but_preserve_emoticons(token) for token in self.words_re.findall(text)]
2023-02-03 22:27:44 +00:00
class PottsTokenizerWithNegation(PottsTokenizer):
    """
    Variant of `.PottsTokenizer` which post-processes the tokens with `nltk.sentiment.util.mark_negation`.
    """

    def tokenize(self, text: str) -> t.Iterator[str]:
        """
        Tokenize the text like `.PottsTokenizer` does, then let nltk mark the negated tokens.

        :param text: The text to tokenize.
        :return: The tokens produced by the parent class, after being processed by `nltk.sentiment.util.mark_negation`.
        """
        base_tokens = super().tokenize(text)
        # shallow=True makes nltk work on the given list instead of a deep copy of it,
        # so the returned object is the very same list produced by the parent class.
        return nltk.sentiment.util.mark_negation(base_tokens, shallow=True)
2023-02-02 16:24:11 +00:00
# Names exported by ``from ... import *``.
__all__ = ("PottsTokenizer", "PottsTokenizerWithNegation")