1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-22 16:04:18 +00:00
bda-6-steffo/unimore_bda_6/tokenizer/hugging.py

26 lines
657 B
Python

import abc
import tokenizers
from .base import BaseTokenizer
class HuggingTokenizer(BaseTokenizer, metaclass=abc.ABCMeta):
    """
    Base class for tokenizers that delegate to a `tokenizers.Tokenizer`.

    The wrapped tokenizer is obtained from `_build_hugging_tokenizer` during
    construction and stored on the instance as `hug`. The default
    `_build_hugging_tokenizer` raises `NotImplementedError`, so subclasses
    are expected to override it.
    """

    def __init__(self):
        super().__init__()
        # Built once per instance, after the base class initializer has run.
        self.hug: tokenizers.Tokenizer = self._build_hugging_tokenizer()

    def _build_hugging_tokenizer(self) -> tokenizers.Tokenizer:
        """
        Return the `tokenizers.Tokenizer` this instance should wrap.

        :raises NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()

    def tokenize_plain(self, text: str) -> str:
        """
        Encode `text` with the wrapped tokenizer and return the resulting
        tokens joined by single spaces.
        """
        encoding = self.hug.encode(text)
        return " ".join(encoding.tokens)
class HuggingBertTokenizer(HuggingTokenizer):
    """
    Tokenizer wrapping the pretrained tokenizer identified by
    ``"bert-base-cased"``.
    """

    def _build_hugging_tokenizer(self):
        """
        Return the tokenizer obtained from `tokenizers.Tokenizer.from_pretrained`
        for the ``"bert-base-cased"`` identifier.
        """
        pretrained = tokenizers.Tokenizer.from_pretrained("bert-base-cased")
        return pretrained
# Names exported by `from <module> import *`: only the concrete BERT tokenizer.
__all__ = ("HuggingBertTokenizer",)