1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-10-16 14:27:32 +00:00

Rename __html2unicode to __html2string

This commit is contained in:
Steffo 2023-02-02 04:18:08 +01:00
parent 569f9e5359
commit 29c3d05b6c
Signed by: steffo
GPG key ID: 2A24051445686895

View file

@ -152,7 +152,7 @@ class Tokenizer:
Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False
""" """
# Fix HTML character entitites: # Fix HTML character entitites:
s = self.__html2unicode(s) s = self.__html2string(s)
# Tokenize: # Tokenize:
words = word_re.findall(s) words = word_re.findall(s)
# Possible alter the case, but avoid changing emoticons like :D into :d: # Possible alter the case, but avoid changing emoticons like :D into :d:
@ -160,26 +160,7 @@ class Tokenizer:
words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words) words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
return words return words
def tokenize_random_tweet(self): def __html2string(self, s: str) -> str:
"""
If the twitter library is installed and a twitter connection
can be established, then tokenize a random tweet.
"""
try:
import twitter
except ImportError:
print "Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/"
from random import shuffle
api = twitter.Api()
tweets = api.GetPublicTimeline()
if tweets:
for tweet in tweets:
if tweet.user.lang == 'en':
return self.tokenize(tweet.text)
else:
raise Exception("Apologies. I couldn't get Twitter to give me a public English-language tweet. Perhaps try again")
def __html2unicode(self, s):
""" """
Internal metod that seeks to replace all the HTML entities in Internal metod that seeks to replace all the HTML entities in
s with their corresponding unicode characters. s with their corresponding unicode characters.