import typing as t
import abc

from ..database.datatypes import TextReview, TokenizedReview


class BaseTokenizer(metaclass=abc.ABCMeta):
    """
    The base for all tokenizers in this project.

    Subclasses must implement `tokenize`; `tokenize_review` is provided on
    top of it and does not need to be overridden.
    """

    def __repr__(self) -> str:
        # Show the concrete subclass name so tokenizers are distinguishable
        # in logs and debugging output.
        return f"<{self.__class__.__qualname__}>"

    @abc.abstractmethod
    def tokenize(self, text: str) -> t.Iterator[str]:
        """
        Split a text `str` into an iterator of token `str`s.

        :param text: The raw text to tokenize.
        :return: An iterator yielding one `str` per token.
        """
        raise NotImplementedError()

    def tokenize_review(self, review: TextReview) -> TokenizedReview:
        """
        Apply `.tokenize` to the text of a `TextReview`, converting it into a
        `TokenizedReview`.

        :param review: The review whose `.text` should be tokenized.
        :return: A `TokenizedReview` carrying the same rating and the
            produced tokens.
        """
        tokens = self.tokenize(review.text)
        # The rating is carried over unchanged; only the text is transformed.
        return TokenizedReview(rating=review.rating, tokens=tokens)