1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-22 16:04:18 +00:00
bda-6-steffo/unimore_bda_6/tokenizer/base.py

36 lines
987 B
Python
Raw Normal View History

2023-02-04 00:36:42 +00:00
import tensorflow
2023-02-03 22:27:44 +00:00
2023-02-04 00:36:42 +00:00
class BaseTokenizer:
    """
    The base for all tokenizers in this project.

    The two ``tokenize_*`` methods defined here are placeholders tagged with a
    ``__notimplemented__`` attribute; the ``supports_*`` methods report whether
    the method found on the instance still carries that tag.
    """

    def __repr__(self) -> str:
        """
        Return the qualified name of the instance's class followed by ``()``.
        """
        return f"{self.__class__.__qualname__}()"

    # NOTE: deliberately a plain function while the class body executes.
    # A `staticmethod` object is only callable since Python 3.10, so using one
    # as a decorator inside the class body breaks on earlier interpreters.
    # It is wrapped in `staticmethod` at the end of the class body instead.
    def __not_implemented(f):
        """
        Tag ``f`` with ``__notimplemented__ = True`` and return it unchanged.
        """
        f.__notimplemented__ = True
        return f

    def supports_plain(self) -> bool:
        """
        Return `True` unless `tokenize_plain` carries the ``__notimplemented__`` tag.
        """
        return not getattr(self.tokenize_plain, "__notimplemented__", False)

    def supports_tensorflow(self) -> bool:
        """
        Return `True` unless `tokenize_tensorflow` carries the ``__notimplemented__`` tag.
        """
        return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)

    @__not_implemented
    def tokenize_plain(self, text: str) -> list[str]:
        """
        Convert a text string into a list of tokens.

        :raises NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()

    @__not_implemented
    def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
        """
        Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.

        :raises NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()

    # Now that the decorator has been applied, expose it as a static method so
    # the resulting class attribute has the same kind as before.
    __not_implemented = staticmethod(__not_implemented)