1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-25 01:04:19 +00:00
bda-6-steffo/unimore_bda_6/tokenizer/lower.py
2023-02-12 05:11:58 +01:00

14 lines
338 B
Python

import typing as t
from .base import BaseTokenizer
class LowercaseTokenizer(BaseTokenizer):
    """
    Tokenizer that lowercases the whole input text and then breaks it into tokens with `str.split`.

    Because `str.split` is called without arguments, tokens are delimited by runs of whitespace
    and no empty tokens are produced.
    """

    def tokenize(self, text: str) -> t.Iterator[str]:
        """
        Lowercase `text` and split it on whitespace.

        :param text: The string to tokenize.
        :return: The lowercased, whitespace-separated tokens, in order of appearance.
        """
        # NOTE(review): `str.split` produces a list rather than a one-shot iterator as the
        # annotation suggests — confirm which of the two BaseTokenizer's contract expects.
        return text.lower().split()