Mirror of https://github.com/Steffo99/unimore-bda-6.git
bda-6-steffo/unimore_bda_6/tokenizer/plain.py

import typing as t

from .base import BaseTokenizer


class PlainTokenizer(BaseTokenizer):
    """
    Tokenizer which simply splits the text into tokens at whitespace, using `str.split`.
    """

    def tokenize(self, text: str) -> t.Iterator[str]:
        # str.split() with no arguments splits on any run of whitespace;
        # iter() makes the result match the declared t.Iterator[str] return type.
        tokens = text.split()
        return iter(tokens)
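
# A minimal usage sketch, not part of the original file: it assumes BaseTokenizer
# imposes no extra requirements on construction, so PlainTokenizer() can be
# instantiated directly and fed a plain string.
if __name__ == "__main__":
    tokenizer = PlainTokenizer()
    print(list(tokenizer.tokenize("This film was surprisingly good!")))
    # Expected output: ['This', 'film', 'was', 'surprisingly', 'good!']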