# bda-6-steffo/unimore_bda_6/tokenizer/potts.py

"""
=========================
Original module docstring
=========================

This code implements a basic, Twitter-aware tokenizer.

A tokenizer is a function that splits a string of text into words. In
Python terms, we map string and unicode objects into lists of unicode
objects.

There is not a single right way to do tokenizing. The best method
depends on the application.  This tokenizer is designed to be flexible
and thus easy to adapt to new domains and tasks.  The basic logic is
this:

1. The tuple regex_strings defines a list of regular expression
   strings.

2. The regex_strings strings are put, in order, into a compiled
   regular expression object called word_re.

3. The tokenization is done by word_re.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   Tokenizer.

4. When instantiating Tokenizer objects, there is a single option:
   preserve_case.  By default, it is set to True. If it is set to
   False, then the tokenizer will downcase everything except for
   emoticons.

The __main__ method illustrates by tokenizing a few examples.

I've also included a Tokenizer method tokenize_random_tweet(). If the
twitter library is installed (http://code.google.com/p/python-twitter/)
and Twitter is cooperating, then it should tokenize a random
English-language tweet.
"""

# Authorship and licensing metadata for the tokenizer code in this module.
# Note that the license string is non-commercial (CC BY-NC-SA 3.0).
__author__ = "Christopher Potts"
__copyright__ = "Copyright 2011, Christopher Potts"
__credits__ = []
__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/"
__version__ = "1.0"
__maintainer__ = "Christopher Potts"
__email__ = "See the author's website"

######################################################################

import re
import html.entities
import typing as t
import nltk.sentiment.util

from .base import BaseTokenizer

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears first in the final regex (since it can contain whitespace).
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
#     <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last-ditch whitespace-based tokenization of whatever is left.

# Regex fragment that matches an emoticon. It is used in two places (as one
# of the components of regex_strings and, on its own, to build emoticon_re),
# so it is defined with a name.
#
# The fragment is written for re.VERBOSE, the flag both of those patterns are
# compiled with: the whitespace and the "# ..." notes inside the literal are
# ignored by the regex engine.
#
# It has two alternatives:
#   * optional "<" or ">", eyes, optional nose, mouth    e.g.  >:-)
#   * the same pieces in mirrored order                  e.g.  (-:<
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

# The components of the tokenizer, in priority order.
#
# The fragments are joined with "|" into one alternation (word_re), so at any
# position an earlier component is tried before a later one; reordering the
# tuple changes how text is tokenized.
#
# Index 1 (the emoticon fragment) is also compiled on its own as emoticon_re,
# so it has to stay in that position.
#
# All fragments are written for re.VERBOSE | re.I: inner whitespace and
# "# ..." notes are ignored (which is why the literal "#" of the hashtag
# pattern is escaped), and ranges such as [a-z] also match uppercase letters.
regex_strings = (
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )"""
    ,
    # Emoticons:
    emoticon_string
    ,
    # HTML tags (a "<", one or more characters other than ">", then ">"):
    r"""<[^>]+>"""
    ,
    # Twitter username:
    r"""(?:@[\w_]+)"""
    ,
    # Twitter hashtags (one or more "#" followed by at least two word
    # characters; apostrophes and dashes are accepted only in between):
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # Remaining word types (a single fragment with its own inner alternation;
    # its last branch matches any single non-whitespace character):
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """
)

######################################################################
# Core tokenizing pattern: every component of regex_strings, in order, joined
# into one alternation and wrapped in a single capturing group.
word_re = re.compile(
    "(" + "|".join(regex_strings) + ")",
    flags=re.UNICODE | re.IGNORECASE | re.VERBOSE,
)

# The emoticon component compiled on its own, so that emoticons can be
# recognised among the tokens and have their case preserved:
emoticon_re = re.compile(
    regex_strings[1],
    flags=re.UNICODE | re.IGNORECASE | re.VERBOSE,
)

# Patterns locating numeric ("&#NNN;") and named ("&name;") HTML entities,
# plus the escaped ampersand, which the tokenizer handles separately:
html_entity_digit_re = re.compile(r"&#\d+;")
html_entity_alpha_re = re.compile(r"&\w+;")
amp = "&amp;"


######################################################################


class PottsTokenizer(BaseTokenizer):
    """
    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_.

    HTML entities are decoded first, then the text is split with ``word_re``, and finally every token is
    lowercased unless an emoticon is found in it.
    """

    @staticmethod
    def __html2string(s: str) -> str:
        """
        Internal method that replaces the HTML entities in ``s`` with the characters they stand for.

        * Numeric entities (``&#NNN;``) become ``chr(NNN)``.
        * Named entities (``&name;``) are looked up in :data:`html.entities.name2codepoint`.
        * ``&amp;`` is special-cased and becomes ``" and "``.

        Entities that cannot be resolved (unknown name, or a number that :func:`chr` rejects) are left untouched.

        :param s: The text to process.
        :return: The text with the resolvable entities replaced.
        """
        # First the digits:
        for ent in set(html_entity_digit_re.findall(s)):
            try:
                s = s.replace(ent, chr(int(ent[2:-1])))
            except (ValueError, OverflowError):
                # chr() raises ValueError for numbers just past the Unicode range and OverflowError for huge ones;
                # either way the entity does not name a character, so keep it as it is.
                pass
        # Now the alpha versions, except for the escaped ampersand:
        for ent in set(html_entity_alpha_re.findall(s)):
            if ent == amp:
                continue
            try:
                s = s.replace(ent, chr(html.entities.name2codepoint[ent[1:-1]]))
            except (ValueError, KeyError):
                # Not a known entity name: keep it as it is.
                pass
        # The escaped ampersand is replaced unconditionally, even when it is the only entity in the text.
        return s.replace(amp, " and ")

    def tokenize_plain(self, text: str) -> t.Iterable[str]:
        """
        Split ``text`` into tokens.

        :param text: The text to tokenize.
        :return: The list of tokens, lowercased except for those in which ``emoticon_re`` finds a match.
        """
        # Fix HTML character entities:
        s = self.__html2string(text)
        # Tokenize:
        words = word_re.findall(s)
        # Lowercase everything, but avoid changing emoticons like :D into :d
        # (search() is used, so any token *containing* an emoticon keeps its case):
        return [word if emoticon_re.search(word) else word.lower() for word in words]


class PottsTokenizerWithNegation(PottsTokenizer):
    """
    Variant of :class:`PottsTokenizer` that additionally runs :func:`nltk.sentiment.util.mark_negation` on the tokens.
    """

    def tokenize_plain(self, text: str) -> t.Iterable[str]:
        """
        Tokenize ``text`` like the parent class does, then mark the negated tokens.

        :param text: The text to tokenize.
        :return: The token list produced by the parent class, after ``mark_negation`` has processed it.
        """
        tokens = super().tokenize_plain(text)
        # shallow=True: the list is annotated in place, which is why the return value of the call is not used.
        nltk.sentiment.util.mark_negation(tokens, shallow=True)
        return tokens


# Names exported by `from <module> import *`: only the two tokenizer classes.
__all__ = (
    "PottsTokenizer",
    "PottsTokenizerWithNegation",
)
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`"""`
In Potts' tokenizer, use `html.entities` instead of `htmlentitydefs` 2023-02-02 03:12:56 +00:00			`=========================`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`Original module docstring`
			`=========================`

			`This code implements a basic, Twitter-aware tokenizer.`

			`A tokenizer is a function that splits a string of text into words. In`
			`Python terms, we map string and unicode objects into lists of unicode`
			`objects.`

			`There is not a single right way to do tokenizing. The best method`
			`depends on the application. This tokenizer is designed to be flexible`
			`and this easy to adapt to new domains and tasks. The basic logic is`
			`this:`

			`1. The tuple regex_strings defines a list of regular expression`
			`strings.`

			`2. The regex_strings strings are put, in order, into a compiled`
			`regular expression object called word_re.`

			`3. The tokenization is done by word_re.findall(s), where s is the`
			`user-supplied string, inside the tokenize() method of the class`
			`Tokenizer.`

			`4. When instantiating Tokenizer objects, there is a single option:`
			`preserve_case. By default, it is set to True. If it is set to`
			`False, then the tokenizer will downcase everything except for`
			`emoticons.`

			`The __main__ method illustrates by tokenizing a few examples.`

			`I've also included a Tokenizer method tokenize_random_tweet(). If the`
			`twitter library is installed (http://code.google.com/p/python-twitter/)`
			`and Twitter is cooperating, then it should tokenize a random`
			`English-language tweet.`
			`"""`

			`__author__ = "Christopher Potts"`
			`__copyright__ = "Copyright 2011, Christopher Potts"`
			`__credits__ = []`
			`__license__ = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: http://creativecommons.org/licenses/by-nc-sa/3.0/"`
			`__version__ = "1.0"`
			`__maintainer__ = "Christopher Potts"`
			`__email__ = "See the author's website"`

			`######################################################################`

			`import re`
In Potts' tokenizer, use `html.entities` instead of `htmlentitydefs` 2023-02-02 03:12:56 +00:00			`import html.entities`
Include `typing` module in Potts' tokenizer 2023-02-02 03:17:43 +00:00			`import typing as t`
New version working nicely 2023-02-03 22:27:44 +00:00			`import nltk.sentiment.util`

			`from .base import BaseTokenizer`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00
			`######################################################################`
			`# The following strings are components in the regular expression`
			`# that is used for tokenizing. It's important that phone_number`
			`# appears first in the final regex (since it can contain whitespace).`
			`# It also could matter that tags comes after emoticons, due to the`
			`# possibility of having text like`
			`#`
			`# <:\| and some text >:)`
			`#`
			`# Most imporatantly, the final element should always be last, since it`
			`# does a last ditch whitespace-based tokenization of whatever is left.`

			`# This particular element is used in a couple ways, so we define it`
			`# with a name:`
			`emoticon_string = r"""`
			`(?:`
			`[<>]?`
			`[:;=8] # eyes`
			`[\-o\*\']? # optional nose`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`[\)\]\(\[dDpP/\:\}\{@\\|\\] # mouth`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`\|`
			`[\)\]\(\[dDpP/\:\}\{@\\|\\] # mouth`
			`[\-o\*\']? # optional nose`
			`[:;=8] # eyes`
			`[<>]?`
			`)"""`

			`# The components of the tokenizer:`
			`regex_strings = (`
			`# Phone numbers:`
			`r"""`
			`(?:`
			`(?: # (international)`
			`\+?[01]`
			`[\-\s.]*`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`)?`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`(?: # (area code)`
			`[\(]?`
			`\d{3}`
			`[\-\s.\)]*`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`)?`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`\d{3} # exchange`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`[\-\s.]*`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`\d{4} # base`
			`)"""`
			`,`
			`# Emoticons:`
			`emoticon_string`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`,`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`# HTML tags:`
enough 2023-02-08 18:46:05 +00:00			`r"""<[^>]+>"""`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`,`
			`# Twitter username:`
			`r"""(?:@[\w_]+)"""`
			`,`
			`# Twitter hashtags:`
			`r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""`
			`,`
			`# Remaining word types:`
			`r"""`
			`(?:[a-z][a-z'\-_]+[a-z]) # Words with apostrophes or dashes.`
			`\|`
			`(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.`
			`\|`
			`(?:[\w_]+) # Words without apostrophes or dashes.`
			`\|`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`(?:\.(?:\s*\.){1,}) # Ellipsis dots.`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`\|`
			`(?:\S) # Everything else that isn't whitespace.`
			`"""`
enough 2023-02-08 18:46:05 +00:00			`)`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00
			`######################################################################`
			`# This is the core tokenizing regex:`
Refactor things to work better 2023-02-02 16:24:11 +00:00
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`word_re = re.compile(r"""(%s)""" % "\|".join(regex_strings), re.VERBOSE \| re.I \| re.UNICODE)`

			`# The emoticon string gets its own regex so that we can preserve case for them as needed:`
			`emoticon_re = re.compile(regex_strings[1], re.VERBOSE \| re.I \| re.UNICODE)`

			`# These are for regularizing HTML entities to Unicode:`
			`html_entity_digit_re = re.compile(r"&#\d+;")`
			`html_entity_alpha_re = re.compile(r"&\w+;")`
			`amp = "&"`

enough 2023-02-08 18:46:05 +00:00
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`######################################################################`

Refactor things to work better 2023-02-02 16:24:11 +00:00
New version working nicely 2023-02-03 22:27:44 +00:00			`class PottsTokenizer(BaseTokenizer):`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`"""`
New version working nicely 2023-02-03 22:27:44 +00:00			Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_.
Refactor things to work better 2023-02-02 16:24:11 +00:00			`"""`

New version working nicely 2023-02-03 22:27:44 +00:00			`@staticmethod`
			`def __html2string(s: str) -> str:`
			`"""`
			`Internal metod that seeks to replace all the HTML entities in`
			`s with their corresponding unicode characters.`
			`"""`
			`# First the digits:`
			`ents = set(html_entity_digit_re.findall(s))`
			`if len(ents) > 0:`
			`for ent in ents:`
			`entnum = ent[2:-1]`
			`try:`
			`entnum = int(entnum)`
			`s = s.replace(ent, chr(entnum))`
			`except (ValueError, KeyError):`
			`pass`
			`# Now the alpha versions:`
			`ents = set(html_entity_alpha_re.findall(s))`
enough 2023-02-08 18:46:05 +00:00			`ents = filter((lambda x: x != amp), ents)`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`for ent in ents:`
New version working nicely 2023-02-03 22:27:44 +00:00			`entname = ent[1:-1]`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`try:`
New version working nicely 2023-02-03 22:27:44 +00:00			`s = s.replace(ent, chr(html.entities.name2codepoint[entname]))`
			`except (ValueError, KeyError):`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`pass`
New version working nicely 2023-02-03 22:27:44 +00:00			`s = s.replace(amp, " and ")`
			`return s`

enough 2023-02-08 18:46:05 +00:00			`def tokenize_plain(self, text: str) -> t.Iterable[str]:`
New version working nicely 2023-02-03 22:27:44 +00:00			`# Fix HTML character entitites:`
			`s = self.__html2string(text)`
			`# Tokenize:`
			`words = word_re.findall(s)`
			`# Possible alter the case, but avoid changing emoticons like :D into :d:`
			`words = list(map(lambda x: x if emoticon_re.search(x) else x.lower(), words))`
			`# Return the results`
			`return words`


			`class PottsTokenizerWithNegation(PottsTokenizer):`
enough 2023-02-08 18:46:05 +00:00			`def tokenize_plain(self, text: str) -> t.Iterable[str]:`
			`words = super().tokenize_plain(text)`
New version working nicely 2023-02-03 22:27:44 +00:00			`nltk.sentiment.util.mark_negation(words, shallow=True)`
			`return words`
Refactor things to work better 2023-02-02 16:24:11 +00:00

			`__all__ = (`
New version working nicely 2023-02-03 22:27:44 +00:00			`"PottsTokenizer",`
			`"PottsTokenizerWithNegation",`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`)`