mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-22 16:04:18 +00:00
227 lines
7.8 KiB
Python
227 lines
7.8 KiB
Python
|
"""
|
||
|
This file is a vendored version of `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, which the project's specifications require to use.
|
||
|
|
||
|
It has been altered to be used with Python 3.10, but the code is mostly the same.
|
||
|
|
||
|
Original module docstring
|
||
|
=========================
|
||
|
|
||
|
This code implements a basic, Twitter-aware tokenizer.
|
||
|
|
||
|
A tokenizer is a function that splits a string of text into words. In
|
||
|
Python terms, we map string and unicode objects into lists of unicode
|
||
|
objects.
|
||
|
|
||
|
There is not a single right way to do tokenizing. The best method
|
||
|
depends on the application. This tokenizer is designed to be flexible
|
||
|
and this easy to adapt to new domains and tasks. The basic logic is
|
||
|
this:
|
||
|
|
||
|
1. The tuple regex_strings defines a list of regular expression
|
||
|
strings.
|
||
|
|
||
|
2. The regex_strings strings are put, in order, into a compiled
|
||
|
regular expression object called word_re.
|
||
|
|
||
|
3. The tokenization is done by word_re.findall(s), where s is the
|
||
|
user-supplied string, inside the tokenize() method of the class
|
||
|
Tokenizer.
|
||
|
|
||
|
4. When instantiating Tokenizer objects, there is a single option:
|
||
|
preserve_case. By default, it is set to True. If it is set to
|
||
|
False, then the tokenizer will downcase everything except for
|
||
|
emoticons.
|
||
|
|
||
|
The __main__ method illustrates by tokenizing a few examples.
|
||
|
|
||
|
I've also included a Tokenizer method tokenize_random_tweet(). If the
|
||
|
twitter library is installed (http://code.google.com/p/python-twitter/)
|
||
|
and Twitter is cooperating, then it should tokenize a random
|
||
|
English-language tweet.
|
||
|
"""
|
||
|
|
||
|
# Vendored-module metadata, kept from the original distribution.
__author__ = "Christopher Potts"
__copyright__ = "Copyright 2011, Christopher Potts"
__credits__ = []
__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/"
__version__ = "1.0"
__maintainer__ = "Christopher Potts"
__email__ = "See the author's website"
|
||
|
|
||
|
######################################################################
|
||
|
|
||
|
import re

# `htmlentitydefs` was renamed to `html.entities` in the Python 3 stdlib
# reorganization; alias it so the rest of the module can keep using the
# original name.  (The file claims to target Python 3.10, where a plain
# `import htmlentitydefs` raises ImportError.)
try:
    import html.entities as htmlentitydefs
except ImportError:  # Python 2 fallback
    import htmlentitydefs
|
||
|
|
||
|
######################################################################
|
||
|
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears first in the final regex (since it can contain whitespace).
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
# <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last ditch whitespace-based tokenization of whatever is left.

# This particular element is used in a couple ways, so we define it
# with a name.  The two alternatives match emoticons written
# left-to-right (eyes first) and right-to-left (mouth first).
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

# The components of the tokenizer:
regex_strings = (
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )""",
    # Emoticons:
    emoticon_string,
    # HTML tags:
    r"""<[^>]+>""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """,
)

######################################################################
# This is the core tokenizing regex.  A single capturing group wraps all
# the alternatives so that findall() yields whole tokens:

word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)

# The emoticon string gets its own regex so that we can preserve case for them as needed:
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)

# These are for regularizing HTML entities to Unicode:
html_entity_digit_re = re.compile(r"&#\d+;")
html_entity_alpha_re = re.compile(r"&\w+;")
# NOTE(review): the scraped mirror renders this as `"&"`, but that is the
# HTML-decoded display of the original source, which reads `"&amp;"`:
# __html2unicode() filters exactly this entity out of the named-entity
# pass and then replaces it with " and ", which only makes sense for the
# "&amp;" entity, never for a bare ampersand.
amp = "&amp;"
|
||
|
|
||
|
######################################################################
|
||
|
|
||
|
class Tokenizer:
    """
    Twitter-aware tokenizer.

    Instances split a string into a list of tokens (phone numbers,
    emoticons, HTML tags, @-usernames, #hashtags, and remaining words)
    using the module-level ``word_re`` regular expression.
    """

    def __init__(self, preserve_case=False):
        # When False, tokenize() downcases every token except emoticons,
        # so that e.g. ":D" is not turned into ":d".
        self.preserve_case = preserve_case

    def tokenize(self, s):
        """
        Argument: s -- any string or bytes object
        Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
        """
        # Try to ensure unicode text.  On Python 3 every str is already
        # unicode; bytes are decoded best-effort, escaping undecodable
        # bytes (the analogue of the old Python 2 'string_escape' fallback).
        if isinstance(s, bytes):
            s = s.decode("utf-8", "backslashreplace")
        else:
            s = str(s)
        # Fix HTML character entities:
        s = self.__html2unicode(s)
        # Tokenize:
        words = word_re.findall(s)
        # Possibly alter the case, but avoid changing emoticons like :D into :d.
        # A list comprehension replaces the old Python 2 map() call, which on
        # Python 3 would return a lazy iterator instead of a list.
        if not self.preserve_case:
            words = [w if emoticon_re.search(w) else w.lower() for w in words]
        return words

    def tokenize_random_tweet(self):
        """
        If the twitter library is installed and a twitter connection
        can be established, then tokenize a random tweet.

        Returns None when the twitter library is not installed.
        """
        try:
            import twitter
        except ImportError:
            # The Python 2 original fell through here and then crashed with
            # a NameError on the unbound `twitter` name; return explicitly.
            print("Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/")
            return None
        api = twitter.Api()
        tweets = api.GetPublicTimeline()
        if tweets:
            for tweet in tweets:
                if tweet.user.lang == 'en':
                    return self.tokenize(tweet.text)
        else:
            raise Exception("Apologies. I couldn't get Twitter to give me a public English-language tweet. Perhaps try again")

    def __html2unicode(self, s):
        """
        Internal method that seeks to replace all the HTML entities in
        s with their corresponding unicode characters.
        """
        # `htmlentitydefs` became `html.entities` in Python 3; import locally
        # with a fallback so this method works regardless of the top-level
        # import style.
        try:
            from html.entities import name2codepoint
        except ImportError:  # Python 2
            from htmlentitydefs import name2codepoint
        # First the numeric entities (e.g. "&#36;" -> "$"):
        for ent in set(html_entity_digit_re.findall(s)):
            entnum = ent[2:-1]  # strip the leading "&#" and trailing ";"
            try:
                s = s.replace(ent, chr(int(entnum)))
            except (ValueError, OverflowError):
                # Malformed or out-of-range numeric entity: leave it as-is.
                pass
        # Now the named entities (e.g. "&aacute;" -> "á"), skipping `amp`,
        # which gets special word-level treatment below:
        for ent in set(html_entity_alpha_re.findall(s)) - {amp}:
            entname = ent[1:-1]  # strip the leading "&" and trailing ";"
            try:
                s = s.replace(ent, chr(name2codepoint[entname]))
            except KeyError:
                # Unknown entity name: leave it as-is.
                pass
        s = s.replace(amp, " and ")
        return s
|
||
|
|
||
|
###############################################################################
|
||
|
|
||
|
if __name__ == '__main__':
    # Demo: tokenize a few representative strings and print the tokens.
    tok = Tokenizer(preserve_case=False)
    # NOTE(review): the scraped mirror shows the second sample already
    # HTML-decoded ("& ... ácute"); the original source carried the raw
    # entities so the entity-normalization step has something to demonstrate.
    samples = (
        u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)",
        u"HTML entities &amp; other Web oddities can be an &aacute;cute <em class='grumpy'>pain</em> >:(",
        u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace."
    )

    for s in samples:
        # Python 3: print is a function, not a statement.
        print("======================================================================")
        print(s)
        tokenized = tok.tokenize(s)
        print("\n".join(tokenized))