Gene tokenizer

GeneTokenizer

Bases: Label2IDTokenizer, IOMixin

A tokenizer that maps gene tokens to integer IDs and maintains a mapping between gene symbols and Ensembl IDs.

Source code in bionemo/geneformer/tokenizer/gene_tokenizer.py
class GeneTokenizer(Label2IDTokenizer, io.IOMixin):
    """Initializes the GeneTokenizer object."""

    cls_token: str = "[CLS]"
    mask_token: str = "[MASK]"
    pad_token: str = "[PAD]"
    sep_token: str = "[SEP]"
    ukw_token: str = "[UKW]"
    special_tokens: Tuple[str, str, str, str, str] = (cls_token, mask_token, pad_token, sep_token, ukw_token)

    def __init__(self, vocab: Dict[str, int], gene_to_ens: Dict[str, str]):  # noqa: D107
        # Sets up vocab/decode_vocab dictionaries; the parent class is stateful.
        super().__init__()
        assert set(self.special_tokens).issubset(
            set(vocab.keys())
        ), f"Vocab must contain all of {self.special_tokens}, missing {set(self.special_tokens) - set(vocab.keys())}"
        self.gene_to_ens = deepcopy(gene_to_ens)
        self.ens_to_gene = {v: k for k, v in self.gene_to_ens.items()}
        self.vocab = deepcopy(vocab)
        self.decode_vocab = {v: k for k, v in self.vocab.items()}

    @classmethod
    def from_medians_and_genes_dicts(cls, median_dict: Dict[str, float], gene_to_ens: Dict[str, str]) -> T:
        """Creates a tokenizer from a median dictionary."""
        tokens = list(cls.special_tokens) + list(median_dict.keys())
        vocab = cls._build_vocab(tokens)
        return cls(vocab, gene_to_ens)

    @staticmethod
    def _build_vocab(strings: Union[List[str], str]) -> Dict[str, int]:
        """We override the parent because complete strings are tokens. Otherwise, has the same behavior."""
        vocab: Dict[str, int] = {}
        if isinstance(strings, str):
            strings = [strings]

        for token in strings:
            if token not in vocab:
                vocab[token] = len(vocab)
        return vocab

    def token_to_id(self, token: str) -> int:
        """Converts a token to its corresponding ID.

        Args:
            token: The token to be converted.

        Returns:
            The ID corresponding to the token.
        """
        return self.vocab.get(token)

    @property
    def pad_id(self) -> int:  # noqa: D102
        return self.token_to_id(self.pad_token)

    @property
    def mask_token_id(self) -> int:  # noqa: D102
        return self.token_to_id(self.mask_token)

    @property
    def all_special_ids(self) -> list[int]:  # noqa: D102
        return [self.token_to_id(tok) for tok in self.special_tokens]

    @property
    def class_id(self) -> int:  # noqa: D102
        return self.token_to_id(self.cls_token)

    def tokens_to_ids(self, tokens: List[str]) -> List[int]:  # noqa: D102
        return super().tokens_to_ids(tokens)

    def save_vocab(self, vocab_file: str) -> None:
        """Saves the vocabulary as a newline delimieted vocabulary file, each line represents an int -> token mapping. line number is assumed to be the integer."""
        vocab_dir = os.path.dirname(vocab_file)
        if not os.path.exists(vocab_dir):
            os.makedirs(vocab_dir, exist_ok=True)  # ensure the dir exists but be ok with race conditions.

        to_serialize = {}
        to_serialize["vocab"] = self.vocab
        to_serialize["gene_to_ens"] = self.gene_to_ens

        with open(vocab_file, "w") as f:
            json.dump(to_serialize, f)

    @classmethod
    def from_vocab_file(cls, vocab_file: str) -> T:
        """Loads a tokenizer from a vocab file; wraps the constructor for the case where we are working from a filename instead of a dictionary."""
        if not os.path.exists(vocab_file):
            raise FileNotFoundError(f"Vocab file {vocab_file} not found, run preprocessing to create it.")

        with open(vocab_file) as f:
            to_deserialize = json.load(f)
            vocab = to_deserialize["vocab"]
            gene_to_ens = to_deserialize["gene_to_ens"]

        return cls(vocab, gene_to_ens)

    def gene_tok_to_ens(self, gene: str) -> str:
        """Converts a gene token to its corresponding Ensembl ID.

        Args:
            gene (str): The gene token to be converted.

        Returns:
            str: The Ensembl ID corresponding to the gene token.
        """
        return self.gene_to_ens[gene]

    def ens_tok_to_gene(self, ens: str) -> str:
        """Converts an Ensembl token to a gene name.

        Args:
            ens (str): The Ensembl token to be converted.

        Returns:
            str: The corresponding gene name.
        """
        return self.ens_to_gene[ens]

    def genes_to_enss(self, genes: List[str]) -> List[str]:
        """Converts a list of gene names to Ensembl IDs.

        Args:
            genes (List[str]): A list of gene names.

        Returns:
            List[str]: A list of corresponding Ensembl IDs.

        Raises:
            ValueError: If a gene name is not found in the gene_to_ens dictionary.
        """
        ens_ids = []
        for gene in genes:
            if gene in self.gene_to_ens:
                ens_ids.append(self.gene_to_ens[gene])
            else:
                raise ValueError(f"{gene} not found")
        return ens_ids

    def enss_to_genes(self, ensemble_ids: List[str]) -> List[str]:
        """Converts a list of ensemble IDs to gene names.

        Args:
            ensemble_ids (List[str]): A list of ensemble IDs.

        Returns:
            List[str]: A list of gene names corresponding to the ensemble IDs.

        Raises:
            ValueError: If an ensemble ID is not found in the mapping.
        """
        genes = []
        for ens_id in ensemble_ids:
            if ens_id in self.ens_to_gene:
                genes.append(self.ens_to_gene[ens_id])
            else:
                raise ValueError(f"{ens_id} not found")
        return genes
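
A minimal usage sketch follows; the gene symbols, Ensembl IDs, and median values are illustrative assumptions, not examples from the library itself, and the import path is inferred from the source location shown above.

from bionemo.geneformer.tokenizer.gene_tokenizer import GeneTokenizer

gene_to_ens = {"TP53": "ENSG00000141510", "BRCA1": "ENSG00000012048"}
median_dict = {"TP53": 2.0, "BRCA1": 1.5}  # hypothetical median expression values

tokenizer = GeneTokenizer.from_medians_and_genes_dicts(median_dict, gene_to_ens)

# Special tokens occupy the first vocab slots in declaration order:
# [CLS]=0, [MASK]=1, [PAD]=2, [SEP]=3, [UKW]=4, then the gene tokens.
assert tokenizer.pad_id == 2
assert tokenizer.token_to_id("TP53") == 5

# Round-trip between gene symbols and Ensembl IDs.
assert tokenizer.gene_tok_to_ens("TP53") == "ENSG00000141510"
assert tokenizer.ens_tok_to_gene("ENSG00000012048") == "BRCA1"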

ens_tok_to_gene(ens)

Converts an Ensembl token to a gene name.

Parameters:

    ens (str, required): The Ensembl token to be converted.

Returns:

    str: The corresponding gene name.

Source code in bionemo/geneformer/tokenizer/gene_tokenizer.py
def ens_tok_to_gene(self, ens: str) -> str:
    """Converts an Ensembl token to a gene name.

    Args:
        ens (str): The Ensembl token to be converted.

    Returns:
        str: The corresponding gene name.
    """
    return self.ens_to_gene[ens]
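
A short sketch of the reverse lookup (gene symbol and Ensembl ID are illustrative; an ID missing from ens_to_gene raises KeyError, since the dict is indexed directly):

from bionemo.geneformer.tokenizer.gene_tokenizer import GeneTokenizer

tokenizer = GeneTokenizer.from_medians_and_genes_dicts({"TP53": 2.0}, {"TP53": "ENSG00000141510"})
assert tokenizer.ens_tok_to_gene("ENSG00000141510") == "TP53"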

enss_to_genes(ensemble_ids)

Converts a list of Ensembl IDs to gene names.

Parameters:

    ensemble_ids (List[str], required): A list of Ensembl IDs.

Returns:

    List[str]: A list of gene names corresponding to the Ensembl IDs.

Raises:

    ValueError: If an Ensembl ID is not found in the mapping.

Source code in bionemo/geneformer/tokenizer/gene_tokenizer.py
def enss_to_genes(self, ensemble_ids: List[str]) -> List[str]:
    """Converts a list of ensemble IDs to gene names.

    Args:
        ensemble_ids (List[str]): A list of ensemble IDs.

    Returns:
        List[str]: A list of gene names corresponding to the ensemble IDs.

    Raises:
        ValueError: If an ensemble ID is not found in the mapping.
    """
    genes = []
    for ens_id in ensemble_ids:
        if ens_id in self.ens_to_gene:
            genes.append(self.ens_to_gene[ens_id])
        else:
            raise ValueError(f"{ens_id} not found")
    return genes
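
A sketch of the batch conversion, including the error path for an unmapped ID (the IDs are illustrative):

from bionemo.geneformer.tokenizer.gene_tokenizer import GeneTokenizer

tokenizer = GeneTokenizer.from_medians_and_genes_dicts({"TP53": 2.0}, {"TP53": "ENSG00000141510"})
assert tokenizer.enss_to_genes(["ENSG00000141510"]) == ["TP53"]

try:
    tokenizer.enss_to_genes(["ENSG00000000000"])  # not in the mapping
except ValueError as err:
    print(err)  # "ENSG00000000000 not found"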

from_medians_and_genes_dicts(median_dict, gene_to_ens) classmethod

Creates a tokenizer from a median dictionary and a gene-to-Ensembl mapping.

Source code in bionemo/geneformer/tokenizer/gene_tokenizer.py
@classmethod
def from_medians_and_genes_dicts(cls, median_dict: Dict[str, float], gene_to_ens: Dict[str, str]) -> T:
    """Creates a tokenizer from a median dictionary."""
    tokens = list(cls.special_tokens) + list(median_dict.keys())
    vocab = cls._build_vocab(tokens)
    return cls(vocab, gene_to_ens)
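
A sketch of the resulting vocab ordering (median values and gene names are illustrative; only the keys of median_dict matter for the vocab):

from bionemo.geneformer.tokenizer.gene_tokenizer import GeneTokenizer

tokenizer = GeneTokenizer.from_medians_and_genes_dicts(
    median_dict={"TP53": 2.0, "BRCA1": 1.5},
    gene_to_ens={"TP53": "ENSG00000141510", "BRCA1": "ENSG00000012048"},
)
# The five special tokens take IDs 0-4; gene tokens follow in insertion order.
assert tokenizer.token_to_id("TP53") == 5
assert tokenizer.token_to_id("BRCA1") == 6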

from_vocab_file(vocab_file) classmethod

Loads a tokenizer from a saved vocab file; wraps the constructor for the case where we are working from a filename instead of a dictionary.

Source code in bionemo/geneformer/tokenizer/gene_tokenizer.py
@classmethod
def from_vocab_file(cls, vocab_file: str) -> T:
    """Loads a tokenizer from a vocab file; wraps the constructor for the case where we are working from a filename instead of a dictionary."""
    if not os.path.exists(vocab_file):
        raise FileNotFoundError(f"Vocab file {vocab_file} not found, run preprocessing to create it.")

    with open(vocab_file) as f:
        to_deserialize = json.load(f)
        vocab = to_deserialize["vocab"]
        gene_to_ens = to_deserialize["gene_to_ens"]

    return cls(vocab, gene_to_ens)
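
A save/load round trip under an assumed temporary path:

import os
import tempfile
from bionemo.geneformer.tokenizer.gene_tokenizer import GeneTokenizer

tokenizer = GeneTokenizer.from_medians_and_genes_dicts({"TP53": 2.0}, {"TP53": "ENSG00000141510"})
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "gene_vocab.json")
    tokenizer.save_vocab(path)
    reloaded = GeneTokenizer.from_vocab_file(path)
    assert reloaded.vocab == tokenizer.vocab
    assert reloaded.gene_to_ens == tokenizer.gene_to_ens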

gene_tok_to_ens(gene)

Converts a gene token to its corresponding Ensembl ID.

Parameters:

    gene (str, required): The gene token to be converted.

Returns:

    str: The Ensembl ID corresponding to the gene token.

Source code in bionemo/geneformer/tokenizer/gene_tokenizer.py
def gene_tok_to_ens(self, gene: str) -> str:
    """Converts a gene token to its corresponding Ensembl ID.

    Args:
        gene (str): The gene token to be converted.

    Returns:
        str: The Ensembl ID corresponding to the gene token.
    """
    return self.gene_to_ens[gene]
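
A sketch of the forward lookup (symbols are illustrative; an unmapped gene raises KeyError because the dict is indexed directly):

from bionemo.geneformer.tokenizer.gene_tokenizer import GeneTokenizer

tokenizer = GeneTokenizer.from_medians_and_genes_dicts({"TP53": 2.0}, {"TP53": "ENSG00000141510"})
assert tokenizer.gene_tok_to_ens("TP53") == "ENSG00000141510"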

genes_to_enss(genes)

Converts a list of gene names to Ensembl IDs.

Parameters:

    genes (List[str], required): A list of gene names.

Returns:

    List[str]: A list of corresponding Ensembl IDs.

Raises:

    ValueError: If a gene name is not found in the gene_to_ens dictionary.

Source code in bionemo/geneformer/tokenizer/gene_tokenizer.py
def genes_to_enss(self, genes: List[str]) -> List[str]:
    """Converts a list of gene names to Ensembl IDs.

    Args:
        genes (List[str]): A list of gene names.

    Returns:
        List[str]: A list of corresponding Ensembl IDs.

    Raises:
        ValueError: If a gene name is not found in the gene_to_ens dictionary.
    """
    ens_ids = []
    for gene in genes:
        if gene in self.gene_to_ens:
            ens_ids.append(self.gene_to_ens[gene])
        else:
            raise ValueError(f"{gene} not found")
    return ens_ids
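
A sketch of batch conversion with the ValueError path (gene names are illustrative):

from bionemo.geneformer.tokenizer.gene_tokenizer import GeneTokenizer

tokenizer = GeneTokenizer.from_medians_and_genes_dicts(
    {"TP53": 2.0, "BRCA1": 1.5},
    {"TP53": "ENSG00000141510", "BRCA1": "ENSG00000012048"},
)
assert tokenizer.genes_to_enss(["TP53", "BRCA1"]) == ["ENSG00000141510", "ENSG00000012048"]

try:
    tokenizer.genes_to_enss(["NOT_A_GENE"])
except ValueError as err:
    print(err)  # "NOT_A_GENE not found"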

save_vocab(vocab_file)

Saves the vocabulary and the gene-to-Ensembl mapping as a single JSON file.

Source code in bionemo/geneformer/tokenizer/gene_tokenizer.py
def save_vocab(self, vocab_file: str) -> None:
    """Saves the vocabulary as a newline delimieted vocabulary file, each line represents an int -> token mapping. line number is assumed to be the integer."""
    vocab_dir = os.path.dirname(vocab_file)
    if not os.path.exists(vocab_dir):
        os.makedirs(vocab_dir, exist_ok=True)  # ensure the dir exists but be ok with race conditions.

    to_serialize = {}
    to_serialize["vocab"] = self.vocab
    to_serialize["gene_to_ens"] = self.gene_to_ens

    with open(vocab_file, "w") as f:
        json.dump(to_serialize, f)
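
A sketch showing what the saved file contains (the path is an assumption):

import json
import os
import tempfile
from bionemo.geneformer.tokenizer.gene_tokenizer import GeneTokenizer

tokenizer = GeneTokenizer.from_medians_and_genes_dicts({"TP53": 2.0}, {"TP53": "ENSG00000141510"})
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "gene_vocab.json")
    tokenizer.save_vocab(path)
    with open(path) as f:
        payload = json.load(f)
    assert set(payload) == {"vocab", "gene_to_ens"}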

token_to_id(token)

Converts a token to its corresponding ID.

Parameters:

    token (str, required): The token to be converted.

Returns:

    int: The ID corresponding to the token.

Source code in bionemo/geneformer/tokenizer/gene_tokenizer.py
def token_to_id(self, token: str) -> int:
    """Converts a token to its corresponding ID.

    Args:
        token: The token to be converted.

    Returns:
        The ID corresponding to the token.
    """
    return self.vocab.get(token)
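
Because the lookup uses dict.get, an unknown token yields None rather than raising (the tokens below are illustrative):

from bionemo.geneformer.tokenizer.gene_tokenizer import GeneTokenizer

tokenizer = GeneTokenizer.from_medians_and_genes_dicts({"TP53": 2.0}, {"TP53": "ENSG00000141510"})
assert tokenizer.token_to_id("[CLS]") == 0
assert tokenizer.token_to_id("UNKNOWN_GENE") is None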