1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-24 08:44:19 +00:00
bda-6-steffo/unimore_bda_6/tokenizer/lower.py

15 lines
338 B
Python
Raw Permalink Normal View History

2023-02-12 04:11:58 +00:00
import typing as t
2023-02-04 00:36:42 +00:00
from .base import BaseTokenizer
class LowercaseTokenizer(BaseTokenizer):
    """
    Tokenizer that lowercases the whole input text and then breaks it into tokens with `str.split`.
    """

    def tokenize(self, text: str) -> t.Iterator[str]:
        """
        Lowercase `text` and split the result on whitespace.

        :param text: The string to tokenize.
        :return: The lowercased tokens, in the order they appear in `text`.
        """
        # Called without arguments, `str.split` separates on runs of whitespace
        # and never yields empty strings.
        # NOTE(review): the object returned here is the list built by `str.split`,
        # even though the annotation advertises `t.Iterator[str]`.
        return text.lower().split()