"""
=========================
Original module docstring
=========================

This code implements a basic, Twitter-aware tokenizer.

A tokenizer is a function that splits a string of text into words. In
Python terms, we map string and unicode objects into lists of unicode
objects.

There is not a single right way to do tokenizing. The best method
depends on the application. This tokenizer is designed to be flexible
and thus easy to adapt to new domains and tasks. The basic logic is
this:

1. The tuple regex_strings defines a list of regular expression
   strings.

2. The regex_strings strings are put, in order, into a compiled
   regular expression object called word_re.

3. The tokenization is done by word_re.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   Tokenizer.

4. When instantiating Tokenizer objects, there is a single option:
   preserve_case. By default, it is set to True. If it is set to
   False, then the tokenizer will downcase everything except for
   emoticons.

The __main__ method illustrates by tokenizing a few examples.

I've also included a Tokenizer method tokenize_random_tweet(). If the
twitter library is installed (http://code.google.com/p/python-twitter/)
and Twitter is cooperating, then it should tokenize a random
English-language tweet.
"""

__author__ = "Christopher Potts"
__copyright__ = "Copyright 2011, Christopher Potts"
__credits__ = []
__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/"
__version__ = "1.0"
__maintainer__ = "Christopher Potts"
__email__ = "See the author's website"

######################################################################

import re
import html.entities
import typing as t

import nltk.sentiment.util

from .base import BaseTokenizer

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears first in the final regex (since it can contain whitespace).
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
# <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last-ditch whitespace-based tokenization of whatever is left.

# This particular element is used in a couple of ways, so we define it
# with a name:
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

# The components of the tokenizer:
regex_strings = (
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )"""
    ,
    # Emoticons:
    emoticon_string
    ,
    # HTML tags:
    r"""<[^>]+>"""
    ,
    # Twitter username:
    r"""(?:@[\w_]+)"""
    ,
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """
)

######################################################################
# This is the core tokenizing regex:
word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)

# The emoticon string gets its own regex so that we can preserve case for them as needed:
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)

# These are for regularizing HTML entities to Unicode:
html_entity_digit_re = re.compile(r"&#\d+;")
html_entity_alpha_re = re.compile(r"&\w+;")
amp = "&amp;"

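# As a rough illustration (hand-checked sketch, not an authoritative test),
# word_re.findall() on a sample tweet is expected to split like this:
#
#     word_re.findall("@remy: This is waaayyy too much for you!!!!!!")
#     # ['@remy', ':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you',
#     #  '!', '!', '!', '!', '!', '!']
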
######################################################################


class PottsTokenizer(BaseTokenizer):
    """
    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_.
    """

    @staticmethod
    def __html2string(s: str) -> str:
        """
        Internal method that seeks to replace all the HTML entities in
        s with their corresponding Unicode characters.
        """
        # First the digits:
        ents = set(html_entity_digit_re.findall(s))
        if len(ents) > 0:
            for ent in ents:
                entnum = ent[2:-1]
                try:
                    entnum = int(entnum)
                    s = s.replace(ent, chr(entnum))
                except (ValueError, KeyError):
                    pass
        # Now the alpha versions:
        ents = set(html_entity_alpha_re.findall(s))
        ents = filter(lambda x: x != amp, ents)
        for ent in ents:
            entname = ent[1:-1]
            try:
                s = s.replace(ent, chr(html.entities.name2codepoint[entname]))
            except (ValueError, KeyError):
                pass
        s = s.replace(amp, " and ")
        return s
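
    # Illustrative sketch of the entity normalization above (hand-checked,
    # not authoritative): digit entities become their code points, named
    # entities become Unicode characters, and "&amp;" is read as "and":
    #
    #     __html2string("&pound;100")  # -> "£100"
    #     __html2string("AT&amp;T")    # -> "AT and T"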

    def tokenize(self, text: str) -> t.Iterable[str]:
        # Fix HTML character entities:
        s = self.__html2string(text)
        # Tokenize:
        words = word_re.findall(s)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        words = list(map(lambda x: x if emoticon_re.search(x) else x.lower(), words))
        # Return the results:
        return words
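
    # For example (sketch; output hand-checked against the regexes above):
    #
    #     PottsTokenizer().tokenize("I LUV my iPhone :D")
    #     # -> ['i', 'luv', 'my', 'iphone', ':D']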


class PottsTokenizerWithNegation(PottsTokenizer):
    def tokenize(self, text: str) -> t.Iterable[str]:
        words = super().tokenize(text)
        nltk.sentiment.util.mark_negation(words, shallow=True)
        return words
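
# For example (sketch; the exact `_NEG` marking depends on the NLTK version):
#
#     PottsTokenizerWithNegation().tokenize("I didn't like it. The end.")
#     # -> ['i', "didn't", 'like_NEG', 'it_NEG', '.', 'the', 'end', '.']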


__all__ = (
    "PottsTokenizer",
    "PottsTokenizerWithNegation",
)
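

if __name__ == "__main__":
    # A minimal demo in the spirit of the original module's __main__ section,
    # using sample strings adapted from it. This is a sketch: it assumes
    # BaseTokenizer needs no constructor arguments, and the relative import
    # above means it must be run in package context (e.g. via `python -m`).
    samples = (
        "RT @ #happyfuncoding: this is a typical Twitter tweet :-)",
        "HTML entities &amp; other Web oddities can be an &aacute;cute <em class='grumpy'>pain</em> >:(",
        "It's perhaps noteworthy that phone numbers like +1 (800) 123-4567 are tokenized as single words despite their whitespace.",
    )
    tokenizer = PottsTokenizer()
    for sample in samples:
        print(sample)
        print(tokenizer.tokenize(sample))
        print()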