Mirror of https://github.com/Steffo99/unimore-bda-6.git (synced 2024-11-25 01:04:19 +00:00)
Vendor Potts' tokenizer
parent b8acf5fc7c
commit a85131cb58
1 changed file with 227 additions and 0 deletions
unimore_bda_6/vendor/potts.py (vendored, new file, +227)
@@ -0,0 +1,227 @@
"""
|
||||
This file is a vendored version of `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, which the project's specifications require to use.
|
||||
|
||||
It has been altered to be used with Python 3.10, but the code is mostly the same.
|
||||
|
||||
Original module docstring
|
||||
=========================
|
||||
|
||||
This code implements a basic, Twitter-aware tokenizer.
|
||||
|
||||
A tokenizer is a function that splits a string of text into words. In
|
||||
Python terms, we map string and unicode objects into lists of unicode
|
||||
objects.
|
||||
|
||||
There is not a single right way to do tokenizing. The best method
|
||||
depends on the application. This tokenizer is designed to be flexible
|
||||
and this easy to adapt to new domains and tasks. The basic logic is
|
||||
this:
|
||||
|
||||
1. The tuple regex_strings defines a list of regular expression
|
||||
strings.
|
||||
|
||||
2. The regex_strings strings are put, in order, into a compiled
|
||||
regular expression object called word_re.
|
||||
|
||||
3. The tokenization is done by word_re.findall(s), where s is the
|
||||
user-supplied string, inside the tokenize() method of the class
|
||||
Tokenizer.
|
||||
|
||||
4. When instantiating Tokenizer objects, there is a single option:
|
||||
preserve_case. By default, it is set to True. If it is set to
|
||||
False, then the tokenizer will downcase everything except for
|
||||
emoticons.
|
||||
|
||||
The __main__ method illustrates by tokenizing a few examples.
|
||||
|
||||
I've also included a Tokenizer method tokenize_random_tweet(). If the
|
||||
twitter library is installed (http://code.google.com/p/python-twitter/)
|
||||
and Twitter is cooperating, then it should tokenize a random
|
||||
English-language tweet.
|
||||
"""

__author__ = "Christopher Potts"
__copyright__ = "Copyright 2011, Christopher Potts"
__credits__ = []
__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/"
__version__ = "1.0"
__maintainer__ = "Christopher Potts"
__email__ = "See the author's website"

######################################################################

import re
from html import entities as htmlentitydefs  # Python 3 home of name2codepoint (replaces the Python 2 htmlentitydefs module)

######################################################################
|
||||
# The following strings are components in the regular expression
|
||||
# that is used for tokenizing. It's important that phone_number
|
||||
# appears first in the final regex (since it can contain whitespace).
|
||||
# It also could matter that tags comes after emoticons, due to the
|
||||
# possibility of having text like
|
||||
#
|
||||
# <:| and some text >:)
|
||||
#
|
||||
# Most imporatantly, the final element should always be last, since it
|
||||
# does a last ditch whitespace-based tokenization of whatever is left.
|
||||
|
||||
# This particular element is used in a couple ways, so we define it
|
||||
# with a name:
|
||||
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

# The components of the tokenizer:
regex_strings = (
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )"""
    ,
    # Emoticons:
    emoticon_string
    ,
    # HTML tags:
    r"""<[^>]+>"""
    ,
    # Twitter username:
    r"""(?:@[\w_]+)"""
    ,
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """
    )
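
# For illustration (added in vendoring; not part of the original file): once
# word_re is compiled below, the phone-number component, which comes first,
# lets a number with internal whitespace survive as a single token:
#
#     >>> word_re.findall("+1 (800) 123-4567")
#     ['+1 (800) 123-4567']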

######################################################################
# This is the core tokenizing regex:

word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)

# The emoticon string gets its own regex so that we can preserve case for them as needed:
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)

# These are for regularizing HTML entities to Unicode:
html_entity_digit_re = re.compile(r"&#\d+;")
html_entity_alpha_re = re.compile(r"&\w+;")
amp = "&amp;"
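
# A quick check of the compiled regex (added in vendoring; not part of the
# original file):
#
#     >>> word_re.findall("RT @user :-)")
#     ['RT', '@user', ':-)']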

######################################################################


class Tokenizer:
    def __init__(self, preserve_case=False):
        self.preserve_case = preserve_case

    def tokenize(self, s):
        """
        Argument: s -- any string object
        Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
        """
        # Ensure we are working with a string (in Python 3 the old
        # unicode() coercion reduces to this):
        s = str(s)
        # Fix HTML character entities:
        s = self.__html2unicode(s)
        # Tokenize:
        words = word_re.findall(s)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = [x if emoticon_re.search(x) else x.lower() for x in words]
        return words
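
    # Example (added in vendoring; not part of the original file): entity
    # fixing happens before tokenization, so:
    #
    #     >>> Tokenizer().tokenize("3 &lt; 4")
    #     ['3', '<', '4']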

    def tokenize_random_tweet(self):
        """
        If the twitter library is installed and a twitter connection
        can be established, then tokenize a random tweet.
        """
        try:
            import twitter
        except ImportError:
            print("Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/")
            return
        from random import shuffle
        api = twitter.Api()
        tweets = api.GetPublicTimeline()
        if tweets:
            for tweet in tweets:
                if tweet.user.lang == 'en':
                    return self.tokenize(tweet.text)
        else:
            raise Exception("Apologies. I couldn't get Twitter to give me a public English-language tweet. Perhaps try again")

    def __html2unicode(self, s):
        """
        Internal method that seeks to replace all the HTML entities in
        s with their corresponding unicode characters.
        """
        # First the digits:
        ents = set(html_entity_digit_re.findall(s))
        if len(ents) > 0:
            for ent in ents:
                entnum = ent[2:-1]
                try:
                    entnum = int(entnum)
                    s = s.replace(ent, chr(entnum))
                except ValueError:
                    pass
        # Now the alpha versions:
        ents = set(html_entity_alpha_re.findall(s))
        ents = filter((lambda x: x != amp), ents)
        for ent in ents:
            entname = ent[1:-1]
            try:
                s = s.replace(ent, chr(htmlentitydefs.name2codepoint[entname]))
            except KeyError:
                pass
        s = s.replace(amp, " and ")
        return s
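
    # Example (added in vendoring; not part of the original file): &amp; is
    # deliberately skipped by the entity loop above and spelled out instead:
    #
    #     >>> Tokenizer().tokenize("you &amp; me")
    #     ['you', 'and', 'me']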


###############################################################################

if __name__ == '__main__':
    tok = Tokenizer(preserve_case=False)
    samples = (
        "RT @ #happyfuncoding: this is a typical Twitter tweet :-)",
        "HTML entities &amp; other Web oddities can be an &aacute;cute <em class='grumpy'>pain</em> >:(",
        "It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace."
    )

    for s in samples:
        print("======================================================================")
        print(s)
        tokenized = tok.tokenize(s)
        print("\n".join(tokenized))
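
    # For reference (added in vendoring; not part of the original file), the
    # first sample should tokenize as:
    #
    #     ['rt', '@', '#happyfuncoding', ':', 'this', 'is', 'a',
    #      'typical', 'twitter', 'tweet', ':-)']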