bda-6-steffo/unimore_bda_6/tokenizer/potts.py

import re
import html.entities
import typing as t
import nltk.sentiment.util

from .base import BaseTokenizer


class PottsTokenizer(BaseTokenizer):
    """
    Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.

    Tokenization happens in three steps (see `.tokenize`):

    1. HTML character entities are replaced by the characters they stand for;
    2. the text is split with `.words_re`, whose alternatives are tried in the order they are listed;
    3. every token is lowercased, unless it contains an emoticon.

    This class is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .
    """

    # Pattern matching a single emoticon: optional hat, eyes, optional nose, mouth.
    # noinspection RegExpRepeatedSpace
    # language=pythonregexp
    emoticon_re_string = r"""[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@|\\]"""

    # Compiled case-sensitively, so that ":D" and ":d" stay distinct.
    emoticon_re = re.compile(emoticon_re_string)

    # Alternation of all the token patterns, wrapped in a single capturing group.
    # The order matters: earlier alternatives win over later ones.
    words_re_string = "(" + "|".join([
        # Emoticons:
        emoticon_re_string
        ,
        # Phone numbers:
        # language=pythonregexp
        r"""(?:[+]?[01][\s.-]*)?(?:[(]?\d{3}[\s.)-]*)?\d{3}[\-\s.]*\d{4}"""
        ,
        # HTML tags:
        # language=pythonregexp
        r"""<[^>]+>"""
        ,
        # Twitter username:
        # language=pythonregexp
        r"""@[\w_]+"""
        ,
        # Twitter hashtags:
        # language=pythonregexp
        r"""#+[\w_]+[\w'_-]*[\w_]+"""
        ,
        # Words with apostrophes or dashes
        # language=pythonregexp
        r"""[a-z][a-z'_-]+[a-z]"""
        ,
        # Numbers, including fractions, decimals
        # language=pythonregexp
        r"""[+-]?\d+(?:[,/.:-]\d+)?"""
        ,
        # Words without apostrophes or dashes
        # language=pythonregexp
        r"""[\w_]+"""
        ,
        # Ellipsis dots
        # language=pythonregexp
        r"""[.](?:\s*[.])+"""
        ,
        # Everything else that isn't whitespace
        # language=pythonregexp
        r"""\S+"""
    ]) + ")"

    # Compiled case-insensitively; case is normalized later by `.lower_but_preserve_emoticons`.
    words_re = re.compile(words_re_string, re.I)

    # Decimal numeric character references, such as "&#233;".
    # language=pythonregexp
    digit_re_string = r"&#\d+;"

    digit_re = re.compile(digit_re_string)

    # Named character references, such as "&eacute;".
    # language=pythonregexp
    alpha_re_string = r"&\w+;"

    alpha_re = re.compile(alpha_re_string)

    # The ampersand entity is special-cased: it is replaced by the word " and " instead of "&".
    amp = "&amp;"

    @classmethod
    def html_entities_to_chr(cls, s: str) -> str:
        """
        Internal method that seeks to replace all the HTML entities in s with their corresponding characters.

        Decimal numeric entities are handled first, then named ones; entities which cannot be resolved are left untouched.
        Finally, every occurrence of `.amp` is replaced with ``" and "``.

        :param s: The text to process.
        :return: The text with the resolvable entities replaced.
        """
        # First the digits:
        for ent in set(cls.digit_re.findall(s)):
            try:
                # Drop the leading "&#" and the trailing ";" to get the code point.
                char = chr(int(ent[2:-1]))
            except (ValueError, OverflowError):
                # The number is not a valid code point: chr raises ValueError when it is out of range,
                # and OverflowError when it does not even fit in a C integer.
                continue
            s = s.replace(ent, char)
        # Now the alpha versions:
        for ent in set(cls.alpha_re.findall(s)):
            if ent == cls.amp:
                # Handled separately below.
                continue
            try:
                # Drop the leading "&" and the trailing ";" to get the entity name.
                char = chr(html.entities.name2codepoint[ent[1:-1]])
            except (ValueError, KeyError):
                # Unknown entity name: keep the original text.
                continue
            s = s.replace(ent, char)
        # This must happen unconditionally, not only when other named entities are present.
        return s.replace(cls.amp, " and ")

    @classmethod
    def lower_but_preserve_emoticons(cls, word: str) -> str:
        """
        Internal method which lowercases the word if it does not match `.emoticon_re`.

        The pattern is searched anywhere in the word, so a token merely containing an emoticon is returned unchanged as well.

        :param word: The token to process.
        :return: The token itself if it contains an emoticon, its lowercase version otherwise.
        """
        if cls.emoticon_re.search(word):
            return word
        return word.lower()

    def tokenize(self, text: str) -> t.Iterator[str]:
        """
        Split the given text into tokens.

        :param text: The text to tokenize.
        :return: A lazy iterator over the tokens.
        """
        # Fix HTML character entities
        text = self.html_entities_to_chr(text)
        # Tokenize
        tokens = self.words_re.findall(text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d
        return map(self.lower_but_preserve_emoticons, tokens)


class PottsTokenizerWithNegation(PottsTokenizer):
    """
    Version of `.PottsTokenizer` which after tokenizing applies `nltk.sentiment.util.mark_negation`.
    """

    def tokenize(self, text: str) -> t.List[str]:
        """
        Split the given text into tokens, then let nltk mark the negated ones.

        :param text: The text to tokenize.
        :return: The list of tokens, as left by `nltk.sentiment.util.mark_negation`.
        """
        # The base tokenizer returns an iterator, but nltk needs an indexable, mutable sequence
        words = list(super().tokenize(text))
        # With shallow=True the list is modified in place, so the return value is not needed
        nltk.sentiment.util.mark_negation(words, shallow=True)
        # Return the result
        return words


# Names exported by `from <module> import *`.
__all__ = ("PottsTokenizer", "PottsTokenizerWithNegation")
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`import re`
In Potts' tokenizer, use `html.entities` instead of `htmlentitydefs` 2023-02-02 03:12:56 +00:00			`import html.entities`
Include `typing` module in Potts' tokenizer 2023-02-02 03:17:43 +00:00			`import typing as t`
New version working nicely 2023-02-03 22:27:44 +00:00			`import nltk.sentiment.util`

			`from .base import BaseTokenizer`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00
Refactor things to work better 2023-02-02 16:24:11 +00:00
New version working nicely 2023-02-03 22:27:44 +00:00			`class PottsTokenizer(BaseTokenizer):`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`"""`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			Tokenizer based on `Christopher Potts' tokenizer <http://sentiment.christopherpotts.net/tokenizing.html>`_, released in 2011.

CODE IS DONE 2023-02-12 04:11:58 +00:00			`This class is released under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License: https://creativecommons.org/licenses/by-nc-sa/3.0/ .`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`"""`

Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`# noinspection RegExpRepeatedSpace`
			`# language=pythonregexp`
fix and patch things 2023-02-11 03:32:17 +00:00			`emoticon_re_string = r"""[<>]?[:;=8][\-o*']?[)\](\[dDpP/:}{@\|\\]"""`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00
fix and patch things 2023-02-11 03:32:17 +00:00			`emoticon_re = re.compile(emoticon_re_string)`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00
fix and patch things 2023-02-11 03:32:17 +00:00			`words_re_string = "(" + "\|".join([`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`# Emoticons:`
			`emoticon_re_string`
			`,`
			`# Phone numbers:`
fix and patch things 2023-02-11 03:32:17 +00:00			`# language=pythonregexp`
			`r"""(?:[+]?[01][\s.-])?(?:[(]?\d{3}[\s.)-])?\d{3}[\-\s.]*\d{4}"""`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`,`
			`# HTML tags:`
fix and patch things 2023-02-11 03:32:17 +00:00			`# language=pythonregexp`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`r"""<[^>]+>"""`
			`,`
			`# Twitter username:`
fix and patch things 2023-02-11 03:32:17 +00:00			`# language=pythonregexp`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`r"""@[\w_]+"""`
			`,`
			`# Twitter hashtags:`
fix and patch things 2023-02-11 03:32:17 +00:00			`# language=pythonregexp`
			`r"""#+[\w_]+[\w'_-]*[\w_]+"""`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`,`
			`# Words with apostrophes or dashes`
fix and patch things 2023-02-11 03:32:17 +00:00			`# language=pythonregexp`
			`r"""[a-z][a-z'_-]+[a-z]"""`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`,`
			`# Numbers, including fractions, decimals`
fix and patch things 2023-02-11 03:32:17 +00:00			`# language=pythonregexp`
			`r"""[+-]?\d+(?:[,/.:-]\d+)?"""`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`,`
			`# Words without apostrophes or dashes`
fix and patch things 2023-02-11 03:32:17 +00:00			`# language=pythonregexp`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`r"""[\w_]+"""`
			`,`
			`# Ellipsis dots`
fix and patch things 2023-02-11 03:32:17 +00:00			`# language=pythonregexp`
			`r"""[.](?:\s*[.])+"""`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`,`
			`# Everything else that isn't whitespace`
fix and patch things 2023-02-11 03:32:17 +00:00			`# language=pythonregexp`
			`r"""\S+"""`
			`]) + ")"`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00
fix and patch things 2023-02-11 03:32:17 +00:00			`words_re = re.compile(words_re_string, re.I)`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00
			`# language=pythonregexp`
			`digit_re_string = r"&#\d+;"`

fix and patch things 2023-02-11 03:32:17 +00:00			`digit_re = re.compile(digit_re_string)`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00
			`# language=pythonregexp`
			`alpha_re_string = r"&\w+;"`

fix and patch things 2023-02-11 03:32:17 +00:00			`alpha_re = re.compile(alpha_re_string)`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00
			`amp = "&"`

			`@classmethod`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`def html_entities_to_chr(cls, s: str) -> str:`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`"""`
			`Internal metod that seeks to replace all the HTML entities in s with their corresponding characters.`
New version working nicely 2023-02-03 22:27:44 +00:00			`"""`
			`# First the digits:`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`ents = set(cls.digit_re.findall(s))`
New version working nicely 2023-02-03 22:27:44 +00:00			`if len(ents) > 0:`
			`for ent in ents:`
			`entnum = ent[2:-1]`
			`try:`
			`entnum = int(entnum)`
			`s = s.replace(ent, chr(entnum))`
			`except (ValueError, KeyError):`
			`pass`
			`# Now the alpha versions:`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`ents = set(cls.alpha_re.findall(s))`
			`ents = filter((lambda x: x != cls.amp), ents)`
Vendor Potts' tokenizer 2023-02-02 03:12:25 +00:00			`for ent in ents:`
New version working nicely 2023-02-03 22:27:44 +00:00			`entname = ent[1:-1]`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`try:`
New version working nicely 2023-02-03 22:27:44 +00:00			`s = s.replace(ent, chr(html.entities.name2codepoint[entname]))`
			`except (ValueError, KeyError):`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`pass`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`s = s.replace(cls.amp, " and ")`
New version working nicely 2023-02-03 22:27:44 +00:00			`return s`

CODE IS DONE 2023-02-12 04:11:58 +00:00			`@classmethod`
			`def lower_but_preserve_emoticons(cls, word):`
			`"""`
			Internal method which lowercases the word if it does not match `.emoticon_re`.
			`"""`
			`if cls.emoticon_re.search(word):`
			`return word`
			`else:`
			`return word.lower()`

			`def tokenize(self, text: str) -> t.Iterator[str]:`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`# Fix HTML character entitites`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`text = self.html_entities_to_chr(text)`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`# Tokenize`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`tokens = self.words_re.findall(text)`
New version working nicely 2023-02-03 22:27:44 +00:00			`# Possible alter the case, but avoid changing emoticons like :D into :d:`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`tokens = map(self.lower_but_preserve_emoticons, tokens)`
Improve the tokenizer situation 2023-02-10 04:12:07 +00:00			`# Return the result`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`return tokens`
New version working nicely 2023-02-03 22:27:44 +00:00

			`class PottsTokenizerWithNegation(PottsTokenizer):`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`"""`
			Version of `.PottsTokenizer` which after tokenizing applies `nltk.sentiment.util.mark_negation`.
			`"""`

			`def tokenize(self, text: str) -> str:`
			`# Apply the base tokenization`
			`words = super().tokenize(text)`
			`# Convert to a list (sigh) the iterator`
			`words = list(words)`
			`# Use nltk to mark negation`
New version working nicely 2023-02-03 22:27:44 +00:00			`nltk.sentiment.util.mark_negation(words, shallow=True)`
CODE IS DONE 2023-02-12 04:11:58 +00:00			`# Return the result`
			`return words`
Refactor things to work better 2023-02-02 16:24:11 +00:00

			`__all__ = (`
New version working nicely 2023-02-03 22:27:44 +00:00			`"PottsTokenizer",`
			`"PottsTokenizerWithNegation",`
Refactor things to work better 2023-02-02 16:24:11 +00:00			`)`