mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-25 01:04:19 +00:00
Rename __html2unicode
to __html2string
This commit is contained in:
parent
569f9e5359
commit
29c3d05b6c
1 changed files with 2 additions and 21 deletions
23
unimore_bda_6/vendor/potts.py
vendored
23
unimore_bda_6/vendor/potts.py
vendored
|
@ -152,7 +152,7 @@ class Tokenizer:
|
||||||
Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
|
Value: a tokenized list of strings; concatenating this list returns the original string if preserve_case=False
|
||||||
"""
|
"""
|
||||||
# Fix HTML character entities:
|
# Fix HTML character entities:
|
||||||
s = self.__html2unicode(s)
|
s = self.__html2string(s)
|
||||||
# Tokenize:
|
# Tokenize:
|
||||||
words = word_re.findall(s)
|
words = word_re.findall(s)
|
||||||
# Possibly alter the case, but avoid changing emoticons like :D into :d:
|
# Possibly alter the case, but avoid changing emoticons like :D into :d:
|
||||||
|
@ -160,26 +160,7 @@ class Tokenizer:
|
||||||
words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
|
words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
|
||||||
return words
|
return words
|
||||||
|
|
||||||
def tokenize_random_tweet(self):
|
def __html2string(self, s: str) -> str:
|
||||||
"""
|
|
||||||
If the twitter library is installed and a twitter connection
|
|
||||||
can be established, then tokenize a random tweet.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
import twitter
|
|
||||||
except ImportError:
|
|
||||||
print "Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/"
|
|
||||||
from random import shuffle
|
|
||||||
api = twitter.Api()
|
|
||||||
tweets = api.GetPublicTimeline()
|
|
||||||
if tweets:
|
|
||||||
for tweet in tweets:
|
|
||||||
if tweet.user.lang == 'en':
|
|
||||||
return self.tokenize(tweet.text)
|
|
||||||
else:
|
|
||||||
raise Exception("Apologies. I couldn't get Twitter to give me a public English-language tweet. Perhaps try again")
|
|
||||||
|
|
||||||
def __html2unicode(self, s):
|
|
||||||
"""
|
"""
|
||||||
Internal method that seeks to replace all the HTML entities in
|
Internal method that seeks to replace all the HTML entities in
|
||||||
s with their corresponding unicode characters.
|
s with their corresponding unicode characters.
|
||||||
|
|
Loading…
Reference in a new issue