"""
This file is a vendored version of `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, which the project's specifications require us to use.
It has been altered to work with Python 3.10, but the code is mostly the same.
=========================
Original module docstring
=========================
This code implements a basic, Twitter-aware tokenizer.

A tokenizer is a function that splits a string of text into words. In
Python terms, we map string and unicode objects into lists of unicode
objects.

There is not a single right way to do tokenizing. The best method
depends on the application. This tokenizer is designed to be flexible
and easy to adapt to new domains and tasks. The basic logic is this:

1. The tuple regex_strings defines a list of regular expression
   strings.

2. The regex_strings strings are put, in order, into a compiled
   regular expression object called word_re.

3. The tokenization is done by word_re.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   Tokenizer.

4. When instantiating Tokenizer objects, there is a single option:
   preserve_case. By default, it is set to True. If it is set to
   False, then the tokenizer will downcase everything except for
   emoticons.

The __main__ method illustrates by tokenizing a few examples.

I've also included a Tokenizer method tokenize_random_tweet(). If the
twitter library is installed (http://code.google.com/p/python-twitter/)
and Twitter is cooperating, then it should tokenize a random
English-language tweet.
"""
__author__ = "Christopher Potts"
__copyright__ = "Copyright 2011, Christopher Potts"
__credits__ = []
__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/"
__version__ = "1.0"
__maintainer__ = "Christopher Potts"
__email__ = "See the author's website"
######################################################################
import re
import html.entities
import typing as t
######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears first in the final regex (since it can contain whitespace).
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
# <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last ditch whitespace-based tokenization of whatever is left.
# This particular element is used in a couple ways, so we define it
# with a name:
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""
# The components of the tokenizer:
regex_strings = (
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )"""
    ,
    # Emoticons:
    emoticon_string
    ,
    # HTML tags:
    r"""<[^>]+>"""
    ,
    # Twitter username:
    r"""(?:@[\w_]+)"""
    ,
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """
)
######################################################################
# This is the core tokenizing regex:
word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
# The emoticon string gets its own regex so that we can preserve case for them as needed:
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
# These are for regularizing HTML entities to Unicode:
html_entity_digit_re = re.compile(r"&#\d+;")
html_entity_alpha_re = re.compile(r"&\w+;")
amp = "&amp;"
######################################################################
def tokenizer(text: str) -> t.Iterable[str]:
    """
    Argument: text -- any string object
    Value: an iterable of token strings; emoticons keep their original case,
    every other token is lowercased
    """
    # Fix HTML character entities:
    s = __html2string(text)
    # Tokenize:
    words = word_re.findall(s)
    # Possibly alter the case, but avoid changing emoticons like :D into :d:
    words = map((lambda x: x if emoticon_re.search(x) else x.lower()), words)
    # Return the results:
    return words
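
# Illustrative example of tokenizer() (assumed input/output, not from the
# original module):
#   list(tokenizer("@User I LOVE it :D #yay!"))
#   -> ['@user', 'i', 'love', 'it', ':D', '#yay', '!']
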
def __html2string(html_text: str) -> str:
    """
    Internal method that replaces all the HTML entities in
    html_text with their corresponding unicode characters.
    """
    # First the digits:
    ents = set(html_entity_digit_re.findall(html_text))
    if len(ents) > 0:
        for ent in ents:
            entnum = ent[2:-1]
            try:
                entnum = int(entnum)
                html_text = html_text.replace(ent, chr(entnum))
            except (ValueError, OverflowError):
                pass
    # Now the alpha versions:
    ents = set(html_entity_alpha_re.findall(html_text))
    ents = filter((lambda x: x != amp), ents)
    for ent in ents:
        entname = ent[1:-1]
        try:
            html_text = html_text.replace(ent, chr(html.entities.name2codepoint[entname]))
        except KeyError:
            pass
    html_text = html_text.replace(amp, " and ")
    return html_text
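
# Illustrative behaviour of __html2string (assumed examples): "&#233;" becomes
# "é", named entities such as "&gt;" become their unicode characters (">"),
# and "&amp;" is replaced with " and ".
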
__all__ = (
    "tokenizer",
)
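
# A minimal usage sketch (an assumption of this edit, not part of the original
# vendored module): running this file directly tokenizes a couple of sample
# strings and prints the resulting tokens.
if __name__ == "__main__":
    samples = (
        "RT @Potts: tokenizing isn't SO hard... is it? :-P #nlp",
        "Call 1-800-555-0199 &amp; ask for a demo &gt;:)",
    )
    for sample in samples:
        print(sample)
        print(list(tokenizer(sample)))
        print()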