mirror of
https://github.com/Steffo99/unimore-bda-6.git
synced 2024-11-21 23:44:19 +00:00
Do not stringify processed tokens
This commit is contained in:
parent
f0561cf078
commit
58554c84e0
1 changed file with 3 additions and 2 deletions
|
@@ -1,3 +1,4 @@
|
||||||
|
import typing as t
|
||||||
import nltk
|
import nltk
|
||||||
import nltk.sentiment.util
|
import nltk.sentiment.util
|
||||||
|
|
||||||
|
@@ -9,10 +10,10 @@ class NLTKWordTokenizer(BaseTokenizer):
|
||||||
Tokenizer based on `nltk.word_tokenize`.
|
Tokenizer based on `nltk.word_tokenize`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def tokenize(self, text: str) -> str:
|
def tokenize(self, text: str) -> t.Iterator[str]:
|
||||||
tokens = nltk.word_tokenize(text)
|
tokens = nltk.word_tokenize(text)
|
||||||
nltk.sentiment.util.mark_negation(tokens, shallow=True)
|
nltk.sentiment.util.mark_negation(tokens, shallow=True)
|
||||||
return " ".join(tokens)
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
|
|
Loading…
Reference in a new issue