1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-21 23:44:19 +00:00

Do not stringify processed tokens

This commit is contained in:
Steffo 2023-02-14 17:51:24 +01:00
parent f0561cf078
commit 58554c84e0
Signed by: steffo
GPG key ID: 2A24051445686895

View file

@@ -1,3 +1,4 @@
import typing as t
import nltk import nltk
import nltk.sentiment.util import nltk.sentiment.util
@@ -9,10 +10,10 @@ class NLTKWordTokenizer(BaseTokenizer):
Tokenizer based on `nltk.word_tokenize`. Tokenizer based on `nltk.word_tokenize`.
""" """
def tokenize(self, text: str) -> str: def tokenize(self, text: str) -> t.Iterator[str]:
tokens = nltk.word_tokenize(text) tokens = nltk.word_tokenize(text)
nltk.sentiment.util.mark_negation(tokens, shallow=True) nltk.sentiment.util.mark_negation(tokens, shallow=True)
return " ".join(tokens) return tokens
__all__ = ( __all__ = (