2023-02-04 00:36:42 +00:00
|
|
|
import tensorflow
|
2023-02-03 22:27:44 +00:00
|
|
|
|
|
|
|
|
2023-02-04 00:36:42 +00:00
|
|
|
class BaseTokenizer:
    """
    The base for all tokenizers in this project.

    Subclasses override `.tokenize_plain` and/or `.tokenize_tensorflow`.
    A method that is left un-overridden keeps the `__notimplemented__` marker set by the
    private `__not_implemented` decorator, which is what `.supports_plain` and
    `.supports_tensorflow` look for.
    """

    def __repr__(self):
        """
        Return the qualified class name followed by `()`.
        """
        return f"{self.__class__.__qualname__}()"

    # NOTE: deliberately NOT decorated with `@staticmethod` at this point.
    # The helper is called as a decorator while the class body is still executing, and
    # `staticmethod` objects only became directly callable in Python 3.10; on older versions
    # `@__not_implemented` would raise `TypeError` during class creation.
    # It is re-wrapped as a `staticmethod` at the end of the class body.
    def __not_implemented(f):
        """
        Mark `f` as a placeholder by setting `f.__notimplemented__ = True`, then return `f` unchanged.
        """
        f.__notimplemented__ = True
        return f

    def supports_plain(self) -> bool:
        """
        Return `True` if `.tokenize_plain` does not carry the `__notimplemented__` marker,
        i.e. it has been overridden by a subclass.
        """
        return not getattr(self.tokenize_plain, "__notimplemented__", False)

    def supports_tensorflow(self) -> bool:
        """
        Return `True` if `.tokenize_tensorflow` does not carry the `__notimplemented__` marker,
        i.e. it has been overridden by a subclass.
        """
        return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)

    @__not_implemented
    def tokenize_plain(self, text: str) -> str:
        """
        Convert a text `str` into another `str` containing a series of whitespace-separated tokens.

        The base implementation always raises `NotImplementedError`.
        """
        raise NotImplementedError()

    def tokenize_and_split_plain(self, text: str) -> list[str]:
        """
        Run `.tokenize_plain`, then split the result using `str.split`.

        Raises `NotImplementedError` if `.tokenize_plain` has not been overridden.
        """
        return self.tokenize_plain(text).split()

    @__not_implemented
    def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
        """
        Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.

        The base implementation always raises `NotImplementedError`.
        """
        raise NotImplementedError()

    def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
        """
        Run `.tokenize_tensorflow`, then append a new trailing axis to the result with
        `tensorflow.expand_dims(..., -1)`; the resulting op is named `"tokens"`.

        The original author notes that the extra dimension is required to get `tensorflow` to work
        properly; the exact reason is not documented here.

        Raises `NotImplementedError` if `.tokenize_tensorflow` has not been overridden.
        """
        text = self.tokenize_tensorflow(text)
        text = tensorflow.expand_dims(text, -1, name="tokens")
        return text

    # Now that every in-class use of the decorator has run, expose the helper as a static method
    # so that attribute access on the class or on instances never binds `self` to `f`.
    __not_implemented = staticmethod(__not_implemented)
|