1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-6.git synced 2024-11-27 02:04:20 +00:00
bda-6-steffo/unimore_bda_6/tokenizer/base.py

27 lines
818 B
Python
Raw Normal View History

2023-02-12 04:11:58 +00:00
import typing as t
import abc
from ..database.datatypes import TextReview, TokenizedReview
2023-02-03 22:27:44 +00:00
2023-02-12 04:11:58 +00:00
class BaseTokenizer(metaclass=abc.ABCMeta):
    """
    The base for all tokenizers in this project.

    Concrete tokenizers must override `.tokenize`; `.tokenize_review` is implemented here on top of it.
    """

    def __repr__(self) -> str:
        """
        Return a compact representation in the form `<QualifiedClassName>`.
        """
        return f"<{self.__class__.__qualname__}>"

    @abc.abstractmethod
    def tokenize(self, text: str) -> t.Iterator[str]:
        """
        Convert a text `str` into an iterator of `str` tokens.

        :param text: The text to split into tokens.
        :return: An iterator over the tokens extracted from the text.
        :raises NotImplementedError: When the base implementation itself is invoked (for example through an explicit call on the base class).
        """
        raise NotImplementedError()

    def tokenize_review(self, review: TextReview) -> TokenizedReview:
        """
        Apply `.tokenize` to the text of a `TextReview`, converting it into a `TokenizedReview`.

        :param review: The review whose `.text` should be tokenized.
        :return: A `TokenizedReview` carrying the same `.rating` as the input and the tokens produced by `.tokenize`.
        """
        # The result of `.tokenize` is forwarded as-is, without being materialized into a list here.
        # NOTE(review): if a subclass returns a one-shot iterator, the tokens can presumably be consumed only once - confirm against `TokenizedReview`.
        tokens = self.tokenize(review.text)
        return TokenizedReview(rating=review.rating, tokens=tokens)