1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-21 15:34:18 +00:00

Do not stringify processed tokens

This commit is contained in:
Steffo 2023-02-14 17:51:24 +01:00
parent f0561cf078
commit 58554c84e0
Signed by: steffo
GPG key ID: 2A24051445686895

View file

@@ -1,3 +1,4 @@
+import typing as t
 import nltk
 import nltk.sentiment.util
@@ -9,10 +10,10 @@ class NLTKWordTokenizer(BaseTokenizer):
     Tokenizer based on `nltk.word_tokenize`.
     """
-    def tokenize(self, text: str) -> str:
+    def tokenize(self, text: str) -> t.Iterator[str]:
         tokens = nltk.word_tokenize(text)
         nltk.sentiment.util.mark_negation(tokens, shallow=True)
-        return " ".join(tokens)
+        return tokens
__all__ = (