
Label2id tokenizer

Label2IDTokenizer

Bases: TokenizerSpec

A simple character-level tokenizer.

Intended for extracting class labels for classification models, such as a secondary-structure prediction model, where each class is encoded with a single character (e.g., "C", "H", "E").

Examples:

>>> tokenizer = Label2IDTokenizer()
>>> seqs = ['CHE', 'CCC', 'EHH']
>>> tokenizer = tokenizer.build_vocab(seqs)
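
A slightly fuller round-trip sketch (the import path is assumed to mirror the source path shown below):

```python
from bionemo.llm.data.label2id_tokenizer import Label2IDTokenizer  # assumed module path

# Build the vocabulary from secondary-structure label strings.
# IDs are assigned in first-seen order: 'C' -> 0, 'H' -> 1, 'E' -> 2.
tokenizer = Label2IDTokenizer().build_vocab(["CHE", "CCC", "EHH"])

ids = tokenizer.text_to_ids("HECH")  # [1, 2, 0, 1]
text = tokenizer.ids_to_text(ids)    # 'HECH'
assert text == "HECH"
```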
Source code in bionemo/llm/data/label2id_tokenizer.py
from typing import Dict, Iterable, List, Union

# TokenizerSpec is assumed to come from NeMo's tokenizer interfaces; the
# module-level imports are not shown in this excerpt.
from nemo.collections.common.tokenizers import TokenizerSpec


class Label2IDTokenizer(TokenizerSpec):
    """A simple character-level tokenizer.

    Intended for extracting class labels for classification models,
    such as a secondary-structure prediction model, where each class
    is encoded with a single character (e.g., "C", "H", "E").

    Examples:
            >>> tokenizer = Label2IDTokenizer()
            >>> seqs = ['CHE', 'CCC', 'EHH']
            >>> tokenizer = tokenizer.build_vocab(seqs)

    """

    def __init__(self) -> None:  # noqa: D107
        super().__init__()
        self.vocab: Dict[str, int] = {}
        # Inverse mapping (ID -> token); kept in sync with `vocab` by build_vocab.
        self.decode_vocab: Dict[int, str] = {id_: token for token, id_ in self.vocab.items()}

    @property
    def vocab_size(self) -> int:
        """Return the size of the vocab being used."""
        return len(self.vocab)

    def text_to_tokens(self, text: str) -> List[str]:  # noqa: D102
        return list(text)

    def tokens_to_text(self, tokens: List[str]) -> str:  # noqa: D102
        return "".join(tokens)

    def tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Convert tokens to indexes/ids.

        Args:
            tokens: Tokens to convert.

        Returns:
            The ID for each token.
        """
        ids = []
        for token in tokens:
            id_ = self.vocab.get(token)
            if id_ is None:
                raise ValueError(f"Do not recognize token: {token}")
            ids.append(id_)
        return ids

    def ids_to_tokens(self, ids: List[int]) -> List[str]:
        """Convert Ids to tokens.

        Args:
            ids: Containg ids for each token
        Returns:
            Containing tokens
        """
        tokens = []
        for id_ in ids:
            token = self.decode_vocab.get(id_)
            if token is None:
                raise ValueError(f"Do not recognize ID: {id_}")
            tokens.append(token)
        return tokens

    def text_to_ids(self, text: str) -> List[int]:
        """Converts text to ids.

        Args:
            text (str): String containing text to convert
        Returns:
            (List[int]): Id's corresponding to the tokenization
            of the text
        """
        tokens = self.text_to_tokens(text)
        return self.tokens_to_ids(tokens)

    def ids_to_text(self, ids: List[int]) -> str:  # noqa: D102
        tokens = self.ids_to_tokens(ids)
        return self.tokens_to_text(tokens)

    def build_vocab(self, strings: Union[str, Iterable[str]]) -> "Label2IDTokenizer":
        """Builds the vocabulary of the tokenizer from strings
        Args:
            strings: (Union[str, Iterable[str]]): Strings to
                build the vocabulary with. If a string is supplied,
                then the vocabulary is built from the single string.
                Otherwise, the vocabulary is progressively built
                from all the strings in `strings`.
        """  # noqa: D205
        if isinstance(strings, str):
            strings = [strings]

        for string in strings:
            for token in string:
                if token not in self.vocab:
                    self.vocab[token] = len(self.vocab)
                    self.decode_vocab[self.vocab[token]] = token

        return self

vocab_size: int property

Return the size of the vocab being used.

build_vocab(strings)

Builds the vocabulary of the tokenizer from strings.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| strings | Union[str, Iterable[str]] | Strings to build the vocabulary with. If a string is supplied, the vocabulary is built from that single string; otherwise it is built progressively from all the strings in `strings`. | required |

Returns: The tokenizer itself, to allow chained calls.

Source code in bionemo/llm/data/label2id_tokenizer.py
def build_vocab(self, strings: Union[str, Iterable[str]]) -> "Label2IDTokenizer":
    """Builds the vocabulary of the tokenizer from strings
    Args:
        strings: (Union[str, Iterable[str]]): Strings to
            build the vocabulary with. If a string is supplied,
            then the vocabulary is built from the single string.
            Otherwise, the vocabulary is progressively built
            from all the strings in `strings`.
    """  # noqa: D205
    if isinstance(strings, str):
        strings = [strings]

    for string in strings:
        for token in string:
            if token not in self.vocab:
                self.vocab[token] = len(self.vocab)
                self.decode_vocab[self.vocab[token]] = token

    return self
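
A minimal usage sketch: IDs are assigned in first-seen order, and a later call extends the existing vocabulary rather than replacing it.

```python
from bionemo.llm.data.label2id_tokenizer import Label2IDTokenizer  # assumed module path

tokenizer = Label2IDTokenizer()

# A single string is treated as one sequence of character labels.
tokenizer.build_vocab("CHE")
print(tokenizer.vocab)  # {'C': 0, 'H': 1, 'E': 2}

# Only previously unseen characters get new IDs ('X' -> 3 here).
tokenizer.build_vocab(["EEE", "HCX"])
print(tokenizer.vocab)  # {'C': 0, 'H': 1, 'E': 2, 'X': 3}
```

Because the method returns the tokenizer itself, construction and vocabulary building can be chained, as in the class-level example.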

ids_to_tokens(ids)

Convert IDs to tokens.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| ids | List[int] | IDs to convert, one per token | required |

Returns: The token for each ID.

Source code in bionemo/llm/data/label2id_tokenizer.py
def ids_to_tokens(self, ids: List[int]) -> List[str]:
    """Convert Ids to tokens.

    Args:
        ids: Containg ids for each token
    Returns:
        Containing tokens
    """
    tokens = []
    for id_ in ids:
        token = self.decode_vocab.get(id_)
        if token is None:
            raise ValueError(f"Do not recognize ID: {id_}")
        tokens.append(token)
    return tokens
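
For example, with a vocabulary built from "CHE" (so 'C' -> 0, 'H' -> 1, 'E' -> 2), unknown IDs raise a ValueError rather than being skipped:

```python
from bionemo.llm.data.label2id_tokenizer import Label2IDTokenizer  # assumed module path

tokenizer = Label2IDTokenizer().build_vocab("CHE")

print(tokenizer.ids_to_tokens([0, 1, 2]))  # ['C', 'H', 'E']

try:
    tokenizer.ids_to_tokens([7])  # 7 is not in the vocabulary
except ValueError as err:
    print(err)  # Do not recognize ID: 7
```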

text_to_ids(text)

Converts text to IDs.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| text | str | Text to convert | required |

Returns: (List[int]): IDs corresponding to the tokenization of the text.

Source code in bionemo/llm/data/label2id_tokenizer.py
def text_to_ids(self, text: str) -> List[int]:
    """Converts text to ids.

    Args:
        text (str): String containing text to convert
    Returns:
        (List[int]): Id's corresponding to the tokenization
        of the text
    """
    tokens = self.text_to_tokens(text)
    return self.tokens_to_ids(tokens)
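
This is simply `text_to_tokens` followed by `tokens_to_ids`, so every character in the text must already be in the vocabulary:

```python
from bionemo.llm.data.label2id_tokenizer import Label2IDTokenizer  # assumed module path

tokenizer = Label2IDTokenizer().build_vocab("CHE")

print(tokenizer.text_to_ids("HECH"))  # [1, 2, 0, 1]
```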

tokens_to_ids(tokens)

Convert tokens to indexes/ids.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| tokens | List[str] | Tokens to convert | required |

Returns: The ID for each token.

Source code in bionemo/llm/data/label2id_tokenizer.py
def tokens_to_ids(self, tokens: List[str]) -> List[int]:
    """Convert tokens to indexes/ids.

    Args:
        tokens: Containing tokens
    Returns:
        Containing ID's for each token
    """
    ids = []
    for token in tokens:
        id_ = self.vocab.get(token)
        if id_ is None:
            raise ValueError(f"Do not recognize token: {token}")
        else:
            ids.append(id_)
    return ids
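
A short sketch; tokens that were never added to the vocabulary raise a ValueError:

```python
from bionemo.llm.data.label2id_tokenizer import Label2IDTokenizer  # assumed module path

tokenizer = Label2IDTokenizer().build_vocab("CHE")

print(tokenizer.tokens_to_ids(["C", "C", "E"]))  # [0, 0, 2]

try:
    tokenizer.tokens_to_ids(["Z"])  # 'Z' is not in the vocabulary
except ValueError as err:
    print(err)  # Do not recognize token: Z
```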