Skip to content

Label2id tokenizer

Label2IDTokenizer

Bases: TokenizerSpec

Initializes simple Char Tokenizer.

Intended to be used for extracting class labels for classification models, such as a secondary structure prediction model, where each class is encoded with a single character (e.g. "C", "H", "E").

Examples:

>>> tokenizer = Label2IDTokenizer()
>>> seqs = ['CHE', 'CCC', 'EHH']
>>> tokenizer = tokenizer.build_vocab(seqs)
Source code in bionemo/llm/data/label2id_tokenizer.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
class Label2IDTokenizer(TokenizerSpec):
    """Character-level tokenizer that maps single-character labels to integer IDs.

    Intended to be used for extracting class labels
    for classification models such as a secondary
    structure prediction model, where each class is
    encoded with a character (ex. "C", "H", "E").

    The vocabulary starts out empty and is filled by `build_vocab`, which
    assigns IDs in order of first appearance.

    Examples:
            >>> tokenizer = Label2IDTokenizer()
            >>> seqs = ['CHE', 'CCC', 'EHH']
            >>> tokenizer = tokenizer.build_vocab(seqs)

    """

    def __init__(self) -> None:
        """Create a tokenizer with an empty vocabulary."""
        super().__init__()
        # Forward mapping: token (single character) -> integer ID.
        self.vocab: Dict[str, int] = {}
        # Reverse mapping: integer ID -> token. Kept in sync by `build_vocab`.
        self.decode_vocab: Dict[int, str] = {}

    @property
    def vocab_size(self) -> int:
        """Return the size of the vocab being used."""
        return len(self.vocab)

    def text_to_tokens(self, text: str) -> List[str]:
        """Split text into a list of its individual characters.

        Args:
            text: String to tokenize
        Returns:
            One token per character of `text`
        """
        return list(text)

    def tokens_to_text(self, tokens: List[str]) -> str:
        """Concatenate tokens back into a single string.

        Args:
            tokens: Containing tokens
        Returns:
            The tokens joined with no separator
        """
        return "".join(tokens)

    def tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Convert tokens to indexes/ids.

        Args:
            tokens: Containing tokens
        Returns:
            Containing IDs for each token
        Raises:
            ValueError: If a token is not present in the vocabulary
        """
        ids = []
        for token in tokens:
            id_ = self.vocab.get(token)
            if id_ is None:
                raise ValueError(f"Do not recognize token: {token}")
            ids.append(id_)
        return ids

    def ids_to_tokens(self, ids: List[int]) -> List[str]:
        """Convert IDs to tokens.

        Args:
            ids: Containing IDs for each token
        Returns:
            Containing tokens
        Raises:
            ValueError: If an ID is not present in the decode vocabulary
        """
        tokens = []
        for id_ in ids:
            token = self.decode_vocab.get(id_)
            if token is None:
                raise ValueError(f"Do not recognize ID: {id_}")
            tokens.append(token)
        return tokens

    def text_to_ids(self, text: str) -> List[int]:
        """Converts text to ids.

        Args:
            text (str): String containing text to convert
        Returns:
            (List[int]): IDs corresponding to the tokenization
            of the text
        Raises:
            ValueError: If a character of `text` is not in the vocabulary
        """
        tokens = self.text_to_tokens(text)
        return self.tokens_to_ids(tokens)

    def ids_to_text(self, ids: List[int]) -> str:
        """Converts ids back to text.

        Args:
            ids: Containing IDs for each token
        Returns:
            The decoded tokens joined into one string
        Raises:
            ValueError: If an ID is not present in the decode vocabulary
        """
        tokens = self.ids_to_tokens(ids)
        return self.tokens_to_text(tokens)

    def build_vocab(self, strings: Union[str, Iterable[str]]) -> "Label2IDTokenizer":
        """Builds the vocabulary of the tokenizer from strings.

        Characters already in the vocabulary keep their IDs; each new
        character receives the next unused ID (the current vocabulary size).

        Args:
            strings: (Union[str, Iterable[str]]): Strings to
                build the vocabulary with. If a string is supplied,
                then the vocabulary is built from the single string.
                Otherwise, the vocabulary is progressively built
                from all the strings in `strings`.
        Returns:
            This tokenizer, so that calls can be chained
        """
        # A bare string is treated as one sequence, not as an iterable of
        # one-character sequences.
        if isinstance(strings, str):
            strings = [strings]

        for string in strings:
            for token in string:
                if token not in self.vocab:
                    self.vocab[token] = len(self.vocab)
                    self.decode_vocab[self.vocab[token]] = token

        return self

vocab_size property

Return the size of the vocab being used.

build_vocab(strings)

Builds the vocabulary of the tokenizer from strings.

Args: strings (Union[str, Iterable[str]]): Strings to build the vocabulary with. If a single string is supplied, the vocabulary is built from that string alone. Otherwise, the vocabulary is progressively built from all the strings in `strings`.

Source code in bionemo/llm/data/label2id_tokenizer.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def build_vocab(self, strings: Union[str, Iterable[str]]) -> "Label2IDTokenizer":
    """Extend the tokenizer vocabulary with the characters found in `strings`.

    Args:
        strings: (Union[str, Iterable[str]]): Either one string or an
            iterable of strings. A single string is handled as one
            sequence; otherwise every string in `strings` is scanned
            in turn and the vocabulary grows progressively.
    Returns:
        The tokenizer itself.
    """
    # Wrap a lone string so it is scanned as one sequence.
    sequences = [strings] if isinstance(strings, str) else strings

    for sequence in sequences:
        for char in sequence:
            if char in self.vocab:
                continue
            # The next free ID is the current size of the vocabulary.
            next_id = len(self.vocab)
            self.vocab[char] = next_id
            self.decode_vocab[next_id] = char

    return self

ids_to_tokens(ids)

Convert Ids to tokens.

Parameters:

Name Type Description Default
ids List[int]

Containing IDs for each token

required

Returns: Containing tokens

Source code in bionemo/llm/data/label2id_tokenizer.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def ids_to_tokens(self, ids: List[int]) -> List[str]:
    """Translate a list of IDs back into their tokens.

    Args:
        ids: Containing IDs for each token
    Returns:
        Containing the token for each ID, in the same order
    Raises:
        ValueError: If an ID has no entry in `self.decode_vocab`
    """
    lookup = self.decode_vocab.get
    decoded = []
    for index in ids:
        found = lookup(index)
        if found is not None:
            decoded.append(found)
            continue
        raise ValueError(f"Do not recognize ID: {index}")
    return decoded

text_to_ids(text)

Converts text to ids.

Parameters:

Name Type Description Default
text str

String containing text to convert

required

Returns: (List[int]): IDs corresponding to the tokenization of the text

Source code in bionemo/llm/data/label2id_tokenizer.py
89
90
91
92
93
94
95
96
97
98
99
def text_to_ids(self, text: str) -> List[int]:
    """Tokenize `text` and map the resulting tokens to IDs.

    Args:
        text (str): String containing text to convert
    Returns:
        (List[int]): The IDs produced by passing the output of
        `text_to_tokens` to `tokens_to_ids`
    """
    return self.tokens_to_ids(self.text_to_tokens(text))

tokens_to_ids(tokens)

Convert tokens to indexes/ids.

Parameters:

Name Type Description Default
tokens List[str]

Containing tokens

required

Returns: Containing IDs for each token

Source code in bionemo/llm/data/label2id_tokenizer.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def tokens_to_ids(self, tokens: List[str]) -> List[int]:
    """Map each token to its ID in the vocabulary.

    Args:
        tokens: Containing tokens
    Returns:
        Containing the ID for each token, in the same order
    Raises:
        ValueError: If a token has no entry in `self.vocab`
    """
    encoded = []
    for tok in tokens:
        index = self.vocab.get(tok)
        if index is None:
            raise ValueError(f"Do not recognize token: {tok}")
        encoded.append(index)
    return encoded