Do not stringify processed tokens

2025-03-29 03:40:35 +00:00 · 2023-02-14 17:51:24 +01:00 · 2023-02-14 17:51:24 +01:00 · 58554c84e0
commit 58554c84e0
parent f0561cf078
1 changed files with 3 additions and 2 deletions
--- a/unimore_bda_6/tokenizer/nltk_word_tokenize.py
+++ b/unimore_bda_6/tokenizer/nltk_word_tokenize.py
@@ -1,3 +1,4 @@
+import typing as t
 import nltk
 import nltk.sentiment.util

@@ -9,10 +10,10 @@ class NLTKWordTokenizer(BaseTokenizer):
    Tokenizer based on `nltk.word_tokenize`.
    """

-    def tokenize(self, text: str) -> str:
+    def tokenize(self, text: str) -> t.Iterator[str]:
        tokens = nltk.word_tokenize(text)
        nltk.sentiment.util.mark_negation(tokens, shallow=True)
-        return " ".join(tokens)
+        return tokens


 __all__ = (