diff --git a/unimore_bda_6/tokenizer/nltk_word_tokenize.py b/unimore_bda_6/tokenizer/nltk_word_tokenize.py index 9c909a0..39ce5f0 100644 --- a/unimore_bda_6/tokenizer/nltk_word_tokenize.py +++ b/unimore_bda_6/tokenizer/nltk_word_tokenize.py @@ -1,3 +1,4 @@ +import typing as t import nltk import nltk.sentiment.util @@ -9,10 +10,10 @@ class NLTKWordTokenizer(BaseTokenizer): Tokenizer based on `nltk.word_tokenize`. """ - def tokenize(self, text: str) -> str: + def tokenize(self, text: str) -> t.List[str]: tokens = nltk.word_tokenize(text) nltk.sentiment.util.mark_negation(tokens, shallow=True) - return " ".join(tokens) + return tokens __all__ = (