2023-02-11 03:32:17 +00:00
|
|
|
import abc
|
|
|
|
import tokenizers
|
2023-02-12 04:11:58 +00:00
|
|
|
import typing as t
|
2023-02-11 03:32:17 +00:00
|
|
|
|
|
|
|
from .base import BaseTokenizer
|
|
|
|
|
|
|
|
|
|
|
|
class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
    """
    Abstract tokenizer to implement any tokenizer based on HuggingFace `tokenizers.Tokenizer`.

    Subclasses must implement :meth:`_build_hugging_tokenizer` to provide the
    underlying HuggingFace tokenizer instance, which is built once at
    construction time and stored on :attr:`hug`.
    """

    def __init__(self):
        super().__init__()
        # Build the wrapped HuggingFace tokenizer exactly once per instance.
        self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()

    @abc.abstractmethod
    def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
        """
        Build and return the underlying HuggingFace `tokenizers.Tokenizer`.

        :return: The tokenizer instance to wrap.
        :raises NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError()

    def tokenize(self, text: str) -> t.Iterator[str]:
        """
        Tokenize the given text with the wrapped HuggingFace tokenizer.

        :param text: The text to tokenize.
        :return: An iterator over the string tokens of the text.
        """
        # `Encoding.tokens` is a list, not an iterator; wrap it in `iter()`
        # so the return value actually satisfies the declared
        # `t.Iterator[str]` contract.
        return iter(self.hug.encode(text).tokens)
|
2023-02-11 03:32:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
class HuggingBertTokenizer(HuggingTokenizer):
    """
    Tokenizer based on the `bert-base-cased <https://huggingface.co/bert-base-cased>`_ tokenizer.
    """

    def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
        """
        Load the pretrained ``bert-base-cased`` HuggingFace tokenizer.

        :return: The pretrained `tokenizers.Tokenizer` instance to wrap.
        """
        # NOTE(review): `from_pretrained` presumably fetches the tokenizer
        # files from the HuggingFace Hub on first use (then caches) — this
        # method may therefore require network access; confirm in deployment.
        return tokenizers.Tokenizer.from_pretrained("bert-base-cased")
|
|
|
|
|
|
|
|
|
|
|
|
# Public wildcard-import surface of this module: only the concrete tokenizer
# is exported; the abstract HuggingTokenizer base is not listed here.
__all__ = (
    "HuggingBertTokenizer",
)
|