From a85131cb581f848d6d25d2d036622e2b1f4c25c2 Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi
Date: Thu, 2 Feb 2023 04:12:25 +0100
Subject: [PATCH] Vendor Potts' tokenizer

---
 unimore_bda_6/vendor/potts.py | 226 ++++++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 unimore_bda_6/vendor/potts.py

diff --git a/unimore_bda_6/vendor/potts.py b/unimore_bda_6/vendor/potts.py
new file mode 100644
index 0000000..2178c43
--- /dev/null
+++ b/unimore_bda_6/vendor/potts.py
@@ -0,0 +1,226 @@
+"""
+This file is a vendored version of `Christopher Potts' tokenizer `_, which the project's specifications require us to use.
+
+It has been altered to run under Python 3.10, but the code is otherwise mostly the same.
+
+Original module docstring
+=========================
+
+This code implements a basic, Twitter-aware tokenizer.
+
+A tokenizer is a function that splits a string of text into words. In
+Python terms, we map string and unicode objects into lists of unicode
+objects.
+
+There is not a single right way to do tokenizing. The best method
+depends on the application. This tokenizer is designed to be flexible
+and thus easy to adapt to new domains and tasks. The basic logic is
+this:
+
+1. The tuple regex_strings defines a list of regular expression
+   strings.
+
+2. The regex_strings strings are put, in order, into a compiled
+   regular expression object called word_re.
+
+3. The tokenization is done by word_re.findall(s), where s is the
+   user-supplied string, inside the tokenize() method of the class
+   Tokenizer.
+
+4. When instantiating Tokenizer objects, there is a single option:
+   preserve_case. By default, it is set to False. When it is set to
+   False, the tokenizer will downcase everything except for
+   emoticons.
+
+The __main__ method illustrates by tokenizing a few examples.
+
+I've also included a Tokenizer method tokenize_random_tweet(). If the
+twitter library is installed (http://code.google.com/p/python-twitter/)
+and Twitter is cooperating, then it should tokenize a random
+English-language tweet.
+"""
+
+__author__ = "Christopher Potts"
+__copyright__ = "Copyright 2011, Christopher Potts"
+__credits__ = []
+__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/"
+__version__ = "1.0"
+__maintainer__ = "Christopher Potts"
+__email__ = "See the author's website"
+
+######################################################################
+
+import re
+import html.entities
+
+######################################################################
+# The following strings are components in the regular expression
+# that is used for tokenizing. It's important that phone_number
+# appears first in the final regex (since it can contain whitespace).
+# It also could matter that tags comes after emoticons, due to the
+# possibility of having text like
+#
+#     <:| and some text >:)
+#
+# Most importantly, the final element should always be last, since it
+# does a last ditch whitespace-based tokenization of whatever is left.
+
+# This particular element is used in a couple ways, so we define it
+# with a name:
+emoticon_string = r"""
+    (?:
+      [<>]?
+      [:;=8]                     # eyes
+      [\-o\*\']?                 # optional nose
+      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
+      |
+      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
+      [\-o\*\']?                 # optional nose
+      [:;=8]                     # eyes
+      [<>]?
+    )"""
+
+# The components of the tokenizer:
+regex_strings = (
+    # Phone numbers:
+    r"""
+    (?:
+      (?:            # (international)
+        \+?[01]
+        [\-\s.]*
+      )?
+      (?:            # (area code)
+        [\(]?
+        \d{3}
+        [\-\s.\)]*
+      )?
+      \d{3}          # exchange
+      [\-\s.]*
+      \d{4}          # base
+    )"""
+    ,
+    # Emoticons:
+    emoticon_string
+    ,
+    # HTML tags:
+    r"""<[^>]+>"""
+    ,
+    # Twitter username:
+    r"""(?:@[\w_]+)"""
+    ,
+    # Twitter hashtags:
+    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
+    ,
+    # Remaining word types:
+    r"""
+    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
+    |
+    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
+    |
+    (?:[\w_]+)                     # Words without apostrophes or dashes.
+    |
+    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
+    |
+    (?:\S)                         # Everything else that isn't whitespace.
+    """
+    )
+
+######################################################################
+# This is the core tokenizing regex:
+
+word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
+
+# The emoticon string gets its own regex so that we can preserve case for them as needed:
+emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
+
+# These are for regularizing HTML entities to Unicode:
+html_entity_digit_re = re.compile(r"&#\d+;")
+html_entity_alpha_re = re.compile(r"&\w+;")
+amp = "&amp;"
+
+######################################################################
+
+class Tokenizer:
+    def __init__(self, preserve_case=False):
+        self.preserve_case = preserve_case
+
+    def tokenize(self, s):
+        """
+        Argument: s -- any string or unicode object
+        Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
+        """
+        # In Python 3 all strings are unicode, so we only need to coerce other objects into str:
+        if isinstance(s, bytes):
+            s = s.decode("utf-8", errors="backslashreplace")
+        else:
+            s = str(s)
+        # Fix HTML character entities:
+        s = self.__html2unicode(s)
+        # Tokenize:
+        words = word_re.findall(s)
+        # Possibly alter the case, but avoid changing emoticons like :D into :d:
+        if not self.preserve_case:
+            words = list(map((lambda x: x if emoticon_re.search(x) else x.lower()), words))
+        return words
+
+    def tokenize_random_tweet(self):
+        """
+        If the twitter library is installed and a twitter connection
+        can be established, then tokenize a random tweet.
+        """
+        try:
+            import twitter
+        except ImportError:
+            print("Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/")
+            raise
+        api = twitter.Api()
+        tweets = api.GetPublicTimeline()
+        if tweets:
+            for tweet in tweets:
+                if tweet.user.lang == 'en':
+                    return self.tokenize(tweet.text)
+        else:
+            raise Exception("Apologies. I couldn't get Twitter to give me a public English-language tweet. Perhaps try again")
+
+    def __html2unicode(self, s):
+        """
+        Internal method that seeks to replace all the HTML entities in
+        s with their corresponding unicode characters.
+ """ + # First the digits: + ents = set(html_entity_digit_re.findall(s)) + if len(ents) > 0: + for ent in ents: + entnum = ent[2:-1] + try: + entnum = int(entnum) + s = s.replace(ent, unichr(entnum)) + except: + pass + # Now the alpha versions: + ents = set(html_entity_alpha_re.findall(s)) + ents = filter((lambda x : x != amp), ents) + for ent in ents: + entname = ent[1:-1] + try: + s = s.replace(ent, unichr(htmlentitydefs.name2codepoint[entname])) + except: + pass + s = s.replace(amp, " and ") + return s + +############################################################################### + +if __name__ == '__main__': + tok = Tokenizer(preserve_case=False) + samples = ( + u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)", + u"HTML entities & other Web oddities can be an ácute pain >:(", + u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace." + ) + + for s in samples: + print "======================================================================" + print s + tokenized = tok.tokenize(s) + print "\n".join(tokenized) \ No newline at end of file