bda-6-steffo/unimore_bda_6/tokenizer/base.py

import tensorflow


class BaseTokenizer:
    """
    The base for all tokenizers in this project.

    Subclasses override `.tokenize_plain` and/or `.tokenize_tensorflow`. The versions defined here only raise
    `NotImplementedError` and carry a `__notimplemented__` marker, which `.supports_plain` and `.supports_tensorflow`
    look for to report which of the two flavours a tokenizer provides.
    """

    def __repr__(self):
        return f"{self.__class__.__qualname__}()"

    def __not_implemented(f):
        """
        Decorator flagging `f` as a placeholder by setting `f.__notimplemented__ = True`; returns `f` itself.
        """
        f.__notimplemented__ = True
        return f

    def supports_plain(self) -> bool:
        """
        Return `True` if `.tokenize_plain` does not carry the `__notimplemented__` marker, `False` otherwise.
        """
        return not getattr(self.tokenize_plain, "__notimplemented__", False)

    def supports_tensorflow(self) -> bool:
        """
        Return `True` if `.tokenize_tensorflow` does not carry the `__notimplemented__` marker, `False` otherwise.
        """
        return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)

    @__not_implemented
    def tokenize_plain(self, text: str) -> str:
        """
        Convert a text `str` into another `str` containing a series of whitespace-separated tokens.

        :raises NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

    def tokenize_and_split_plain(self, text: str) -> list[str]:
        """
        Run `.tokenize_plain`, then split the result using `str.split`.

        `str.split` is called without arguments, so any run of whitespace separates two tokens.
        """
        return self.tokenize_plain(text).split()

    @__not_implemented
    def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
        """
        Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.

        :raises NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

    def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":
        """
        Run `.tokenize_tensorflow`, then append a new last axis to the result with `tensorflow.expand_dims(..., -1)`.

        The operation is named `"tokens"`. The original author noted that the reason for the extra dimension is
        unknown, but that it is required to get `tensorflow` to work properly.
        """
        text = self.tokenize_tensorflow(text)
        text = tensorflow.expand_dims(text, -1, name="tokens")
        return text

    # Wrapped only now, after its last use as a decorator above: `staticmethod` objects are not callable before
    # Python 3.10, so decorating with an already-wrapped helper raises `TypeError` while the class body executes
    # on older interpreters. The resulting class attribute is the same `staticmethod` as before.
    __not_implemented = staticmethod(__not_implemented)
stop here for now 2023-02-04 01:36:42 +01:00			`import tensorflow`
New version working nicely 2023-02-03 23:27:44 +01:00

stop here for now 2023-02-04 01:36:42 +01:00			`class BaseTokenizer:`
New version working nicely 2023-02-03 23:27:44 +01:00			`"""`
			`The base for all tokenizers in this project.`
			`"""`

			`def __repr__(self):`
			`return f"{self.__class__.__qualname__}()"`

stop here for now 2023-02-04 01:36:42 +01:00			`@staticmethod`
			`def __not_implemented(f):`
			`f.__notimplemented__ = True`
			`return f`

enough 2023-02-08 19:46:05 +01:00			`def supports_plain(self) -> bool:`
			`return not getattr(self.tokenize_plain, "__notimplemented__", False)`
stop here for now 2023-02-04 01:36:42 +01:00
enough 2023-02-08 19:46:05 +01:00			`def supports_tensorflow(self) -> bool:`
			`return not getattr(self.tokenize_tensorflow, "__notimplemented__", False)`
stop here for now 2023-02-04 01:36:42 +01:00
			`@__not_implemented`
Improve the tokenizer situation 2023-02-10 05:12:07 +01:00			`def tokenize_plain(self, text: str) -> str:`
New version working nicely 2023-02-03 23:27:44 +01:00			`"""`
Improve the tokenizer situation 2023-02-10 05:12:07 +01:00			Convert a text `str` into another `str` containing a series of whitespace-separated tokens.
New version working nicely 2023-02-03 23:27:44 +01:00			`"""`
			`raise NotImplementedError()`
stop here for now 2023-02-04 01:36:42 +01:00
Improve the tokenizer situation 2023-02-10 05:12:07 +01:00			`def tokenize_and_split_plain(self, text: str) -> list[str]:`
			`"""`
			Run `.tokenize_plain`, then split the result using `str.split`.
			`"""`
			`return self.tokenize_plain(text).split()`

stop here for now 2023-02-04 01:36:42 +01:00			`@__not_implemented`
enough 2023-02-08 19:46:05 +01:00			`def tokenize_tensorflow(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":`
stop here for now 2023-02-04 01:36:42 +01:00			`"""`
			Convert a `tensorflow.Tensor` string into another `tensorflow.Tensor` space-separated string.
			`"""`
			`raise NotImplementedError()`
Improve the tokenizer situation 2023-02-10 05:12:07 +01:00
			`def tokenize_tensorflow_and_expand_dims(self, text: "tensorflow.Tensor") -> "tensorflow.Tensor":`
			`"""`
			Run `.tokenize_tensorflow`, then add a dimension to the tensor for reasons unknown to me, but required to get `tensorflow` to work properly.
			`"""`
			`text = self.tokenize_tensorflow(text)`
			`text = tensorflow.expand_dims(text, -1, name="tokens")`
			`return text`