mirror of https://github.com/Steffo99/unimore-bda-6.git
bda-6-steffo/unimore_bda_6/tokenizer/hugging.py

import abc
import tokenizers
import typing as t

from .base import BaseTokenizer


class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
    """
    Abstract base class for tokenizers built on a HuggingFace `tokenizers.Tokenizer`.
    """

    def __init__(self):
        super().__init__()
        self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()

    @abc.abstractmethod
    def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
        raise NotImplementedError()

    def tokenize(self, text: str) -> t.Iterator[str]:
        # `Encoding.tokens` holds the token strings produced by the wrapped HuggingFace tokenizer.
        return self.hug.encode(text).tokens


class HuggingBertTokenizer(HuggingTokenizer):
    """
    Tokenizer based on the `bert-base-cased <https://huggingface.co/bert-base-cased>`_ tokenizer.
    """

    def _build_hugging_tokenizer(self):
        return tokenizers.Tokenizer.from_pretrained("bert-base-cased")


__all__ = (
    "HuggingBertTokenizer",
)
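

# Hypothetical usage sketch, not part of the original module: it shows how the
# HuggingBertTokenizer above might be exercised. It assumes that the `tokenizers`
# package is installed, that the `bert-base-cased` files can be fetched from the
# HuggingFace Hub on first use, and that `BaseTokenizer` needs no constructor
# arguments (as the no-argument `super().__init__()` call above suggests).
# Run it as a module, e.g. `python -m unimore_bda_6.tokenizer.hugging`, so the
# relative import resolves.
if __name__ == "__main__":
    tokenizer = HuggingBertTokenizer()
    # `tokenize` returns the WordPiece tokens of the input string, typically
    # including the special tokens configured in the pretrained tokenizer
    # (e.g. [CLS] and [SEP] for BERT).
    for token in tokenizer.tokenize("This product exceeded my expectations!"):
        print(token)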