From 58554c84e07fb922cb461a4ff7e37e8f070e92ee Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi
Date: Tue, 14 Feb 2023 17:51:24 +0100
Subject: [PATCH] Do not stringify processed tokens

---
 unimore_bda_6/tokenizer/nltk_word_tokenize.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/unimore_bda_6/tokenizer/nltk_word_tokenize.py b/unimore_bda_6/tokenizer/nltk_word_tokenize.py
index 9c909a0..39ce5f0 100644
--- a/unimore_bda_6/tokenizer/nltk_word_tokenize.py
+++ b/unimore_bda_6/tokenizer/nltk_word_tokenize.py
@@ -1,3 +1,4 @@
+import typing as t
 import nltk
 import nltk.sentiment.util
 
@@ -9,10 +10,10 @@ class NLTKWordTokenizer(BaseTokenizer):
     Tokenizer based on `nltk.word_tokenize`.
     """
 
-    def tokenize(self, text: str) -> str:
+    def tokenize(self, text: str) -> t.Iterable[str]:
         tokens = nltk.word_tokenize(text)
         nltk.sentiment.util.mark_negation(tokens, shallow=True)
-        return " ".join(tokens)
+        return tokens
 
 
 __all__ = (