1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-22 16:04:18 +00:00
bda-6-steffo/unimore_bda_6/tokenizer/base.py

51 lines
1.6 KiB
Python

import tensorflow
class BaseTokenizer:
    """
    The base for all tokenizers in this project.

    Both `.tokenize_plain` and `.tokenize_tensorflow` are defined here as placeholders that raise
    `NotImplementedError` and carry a `__notimplemented__ = True` marker attribute.
    `.supports_plain` and `.supports_tensorflow` report whether that marker is still present on the instance's
    method, so an override defined without the marker counts as supported.
    """

    def __repr__(self):
        return f"{self.__class__.__qualname__}()"

    # Deliberately a plain function while the class body is executing: `staticmethod` objects only became
    # directly callable in Python 3.10, so using one as a decorator in here raises `TypeError` on older
    # interpreters. It is wrapped in `staticmethod` on the last line of the class body.
    def __not_implemented(f):
        """
        Mark the function `f` as a placeholder by setting its `__notimplemented__` attribute to `True`, then return it.
        """
        f.__notimplemented__ = True
        return f

    def supports_plain(self) -> bool:
        """
        Return `True` unless `.tokenize_plain` still carries the `__notimplemented__` marker.
        """
        return not getattr(self.tokenize_plain, "__notimplemented__", False)

    def supports_tensorflow(self) -> bool:
        """
        Return `True` unless `.tokenize_tensorflow` still carries the `__notimplemented__` marker.
        """
        return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)

    @__not_implemented
    def tokenize_plain(self, text: str) -> str:
        """
        Convert a text `str` into another `str` containing a series of whitespace-separated tokens.

        :raises NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

    def tokenize_and_split_plain(self, text: str) -> list[str]:
        """
        Run `.tokenize_plain`, then split the result using `str.split`.

        :return: The `list` of tokens; since `str.split` is called without arguments, any run of whitespace acts as a
                 single separator and no empty tokens are produced.
        """
        return self.tokenize_plain(text).split()

    @__not_implemented
    def tokenize_tensorflow(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
        """
        Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.

        :raises NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

    def tokenize_tensorflow_and_expand_dims(self, text: tensorflow.Tensor) -> tensorflow.Tensor:
        """
        Run `.tokenize_tensorflow`, then append a trailing dimension to the result with `tensorflow.expand_dims`
        (axis `-1`, operation name `"tokens"`).

        The original author notes that the extra dimension is required to get `tensorflow` to work properly, but the
        reason is not documented.
        """
        tokens = self.tokenize_tensorflow(text)
        return tensorflow.expand_dims(tokens, -1, name="tokens")

    # Decorating is done: rebind the helper as a `staticmethod`, so the finished class exposes it as before.
    __not_implemented = staticmethod(__not_implemented)