1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-24 16:54:20 +00:00
bda-6-steffo/unimore_bda_6/tokenizer/hugging.py
2023-02-15 18:57:30 +01:00

36 lines
923 B
Python

import abc
import tokenizers
import typing as t
from .base import BaseTokenizer
class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
    """
    Abstract base for tokenizers backed by a HuggingFace `tokenizers.Tokenizer`.

    Subclasses provide the concrete tokenizer by implementing
    `_build_hugging_tokenizer`; it is invoked once, during construction, and
    its result is stored in the `hug` attribute.
    """

    def __init__(self):
        super().__init__()
        # Built through the subclass hook and reused by every `tokenize` call.
        self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()

    @abc.abstractmethod
    def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
        """
        Create the `tokenizers.Tokenizer` that this instance delegates to.

        :raises NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()

    def tokenize(self, text: str) -> t.Iterator[str]:
        """
        Encode `text` with the wrapped tokenizer and return the resulting tokens.

        :param text: the string to tokenize.
        :returns: the `tokens` attribute of the encoding produced by `hug.encode`.
        """
        encoding = self.hug.encode(text)
        # NOTE(review): the annotation says Iterator[str], but `Encoding.tokens`
        # is presumably a list of strings -- confirm callers only iterate over it.
        return encoding.tokens
class HuggingBertTokenizer(HuggingTokenizer):
    """
    Tokenizer that delegates to the pretrained
    `bert-base-cased <https://huggingface.co/bert-base-cased>`_ tokenizer.
    """

    def _build_hugging_tokenizer(self):
        """
        Load the pretrained tokenizer identified by ``"bert-base-cased"``.

        :returns: whatever `tokenizers.Tokenizer.from_pretrained` yields for that identifier.
        """
        pretrained_identifier = "bert-base-cased"
        return tokenizers.Tokenizer.from_pretrained(pretrained_identifier)
# Public API of this module: only the concrete BERT tokenizer is listed.
__all__ = ("HuggingBertTokenizer",)