Skip to content

Convert

HFESM2Exporter

Bases: ModelConnector[BionemoLightningModule, EsmForMaskedLM]

Exporter Connector for converting NeMo ESM-2 Model to HF.

Source code in bionemo/esm2/model/convert.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
@io.model_exporter(BionemoLightningModule, "hf")
class HFESM2Exporter(io.ModelConnector[BionemoLightningModule, EsmForMaskedLM]):
    """Exporter Connector for converting NeMo ESM-2 Model to HF.

    Registered under the "hf" target via ``io.model_exporter``, so
    ``io.export_ckpt(..., "hf", ...)`` dispatches to this connector.
    """

    def init(self, dtype: torch.dtype = torch.bfloat16) -> EsmForMaskedLM:
        """Initialize the target model.

        Weights are deliberately left uninitialized (``no_init_weights``)
        because they are fully overwritten in ``convert_state``.
        """
        with no_init_weights():
            return EsmForMaskedLM._from_config(self.config, torch_dtype=dtype)

    def apply(self, output_path: Path) -> Path:
        """Applies the transformation.

        Loads the NeMo checkpoint, converts its state dict to the HF layout,
        and writes an HF checkpoint (model + tokenizer) to ``output_path``.
        """
        # Run on CPU unless a distributed (GPU) environment is already initialized.
        cpu = not torch.distributed.is_initialized()
        trainer = Trainer(
            devices=1,
            accelerator="cpu" if cpu else "gpu",
            strategy=MegatronStrategy(
                ddp="pytorch", setup_optimizers=False, ckpt_load_strictness=StrictHandling.LOG_UNEXPECTED
            ),
        )
        # NOTE(review): the connector itself is passed as the checkpoint path —
        # it appears to be path-like (cf. ``Path(str(self))`` in ``config``).
        source, _ = self.nemo_load(self, trainer=trainer, cpu=cpu)

        # Export in bf16 when the source model was trained in bf16, else fp32.
        dtype = torch.bfloat16 if source.config.bf16 else torch.float32

        # Not sure why we need to do this, for some reason lm_head stays as fp32
        source.module.lm_head.to(dtype)

        target = self.init(dtype)
        target = self.convert_state(source, target)

        # Move to CPU before serialization so the saved checkpoint is device-agnostic.
        target = target.cpu()
        target.save_pretrained(output_path)
        self.tokenizer.save_pretrained(output_path)

        return output_path

    @property
    def tokenizer(self):
        """Retrieve Tokenizer from HF."""
        return get_tokenizer()

    def convert_state(self, nemo_module, target):
        """Convert NeMo state dict to HF style.

        Left side: NeMo/Megatron parameter names; right side: HF
        ``EsmForMaskedLM`` names. ``*`` is a per-layer wildcard expanded by
        ``io.apply_transforms``. QKV, embedding, and output-bias tensors need
        reshaping/padding and are handled by the transform functions instead.
        """
        mapping = {
            "encoder.final_layernorm.weight": "esm.encoder.emb_layer_norm_after.weight",
            "encoder.final_layernorm.bias": "esm.encoder.emb_layer_norm_after.bias",
            "encoder.layers.*.self_attention.linear_proj.weight": "esm.encoder.layer.*.attention.output.dense.weight",
            "encoder.layers.*.self_attention.linear_proj.bias": "esm.encoder.layer.*.attention.output.dense.bias",
            "encoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "esm.encoder.layer.*.attention.LayerNorm.weight",
            "encoder.layers.*.self_attention.linear_qkv.layer_norm_bias": "esm.encoder.layer.*.attention.LayerNorm.bias",
            "encoder.layers.*.mlp.linear_fc1.weight": "esm.encoder.layer.*.intermediate.dense.weight",
            "encoder.layers.*.mlp.linear_fc1.bias": "esm.encoder.layer.*.intermediate.dense.bias",
            "encoder.layers.*.mlp.linear_fc2.weight": "esm.encoder.layer.*.output.dense.weight",
            "encoder.layers.*.mlp.linear_fc2.bias": "esm.encoder.layer.*.output.dense.bias",
            "encoder.layers.*.mlp.linear_fc1.layer_norm_weight": "esm.encoder.layer.*.LayerNorm.weight",
            "encoder.layers.*.mlp.linear_fc1.layer_norm_bias": "esm.encoder.layer.*.LayerNorm.bias",
            "lm_head.dense.weight": "lm_head.dense.weight",
            "lm_head.dense.bias": "lm_head.dense.bias",
            "lm_head.layer_norm.weight": "lm_head.layer_norm.weight",
            "lm_head.layer_norm.bias": "lm_head.layer_norm.bias",
        }

        return io.apply_transforms(
            nemo_module,
            target,
            mapping=mapping,
            transforms=[_export_qkv_weight, _export_qkv_bias, _export_embedding, _export_bias],
        )

    @property
    def config(self) -> HFEsmConfig:
        """Generate HF Config based on NeMo config."""
        # Load only the model config subtree of the NeMo checkpoint context.
        source: ESM2Config = io.load_context(Path(str(self)), subpath="model.config")

        return HFEsmConfig(
            attention_probs_dropout_prob=float(source.attention_dropout),
            emb_layer_norm_before=False,
            hidden_act="gelu",
            hidden_dropout_prob=float(source.hidden_dropout),
            hidden_size=int(source.hidden_size),
            initializer_range=float(source.init_method_std),
            intermediate_size=int(source.ffn_hidden_size),
            is_folding_model=False,
            layer_norm_eps=float(source.layernorm_epsilon),
            # NOTE(review): hard-coded special-token ids — presumably match the
            # ESM-2 tokenizer returned by get_tokenizer(); confirm if it changes.
            mask_token_id=32,
            max_position_embeddings=int(source.seq_length),
            model_type="esm",
            num_attention_heads=int(source.num_attention_heads),
            num_hidden_layers=int(source.num_layers),
            pad_token_id=1,
            position_embedding_type="rotary",
            token_dropout=True,
            torch_dtype=torch.bfloat16,
            use_cache=True,
            vocab_size=self.tokenizer.vocab_size,
        )

config property

Generate HF Config based on NeMo config.

tokenizer property

Retrieve Tokenizer from HF.

apply(output_path)

Applies the transformation.

Source code in bionemo/esm2/model/convert.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def apply(self, output_path: Path) -> Path:
    """Applies the transformation.

    Loads the NeMo checkpoint, converts it to HF layout, and saves the HF
    model and tokenizer under ``output_path``.
    """
    # No distributed backend running -> do everything on CPU.
    on_cpu = not torch.distributed.is_initialized()
    trainer = Trainer(
        devices=1,
        accelerator="gpu" if not on_cpu else "cpu",
        strategy=MegatronStrategy(
            ddp="pytorch", setup_optimizers=False, ckpt_load_strictness=StrictHandling.LOG_UNEXPECTED
        ),
    )
    source, _ = self.nemo_load(self, trainer=trainer, cpu=on_cpu)

    # Export dtype follows the source model's training precision.
    export_dtype = torch.bfloat16 if source.config.bf16 else torch.float32

    # The lm_head can come back as fp32 regardless; force it to the export dtype.
    source.module.lm_head.to(export_dtype)

    hf_model = self.convert_state(source, self.init(export_dtype))

    # Serialize from CPU so the checkpoint is device-agnostic.
    hf_model = hf_model.cpu()
    hf_model.save_pretrained(output_path)
    self.tokenizer.save_pretrained(output_path)

    return output_path

convert_state(nemo_module, target)

Convert NeMo state dict to HF style.

Source code in bionemo/esm2/model/convert.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def convert_state(self, nemo_module, target):
    """Convert NeMo state dict to HF style.

    Builds a NeMo-name -> HF-name mapping (``*`` is a per-layer wildcard
    expanded by ``io.apply_transforms``) and applies the dedicated transforms
    for tensors that need reshaping or padding (QKV, embeddings, biases).
    """
    # Final layer norm lives outside the per-layer stack.
    mapping = {
        "encoder.final_layernorm.weight": "esm.encoder.emb_layer_norm_after.weight",
        "encoder.final_layernorm.bias": "esm.encoder.emb_layer_norm_after.bias",
    }

    # Per-transformer-layer parameters: (NeMo suffix, HF suffix).
    layer_suffixes = [
        ("self_attention.linear_proj.weight", "attention.output.dense.weight"),
        ("self_attention.linear_proj.bias", "attention.output.dense.bias"),
        ("self_attention.linear_qkv.layer_norm_weight", "attention.LayerNorm.weight"),
        ("self_attention.linear_qkv.layer_norm_bias", "attention.LayerNorm.bias"),
        ("mlp.linear_fc1.weight", "intermediate.dense.weight"),
        ("mlp.linear_fc1.bias", "intermediate.dense.bias"),
        ("mlp.linear_fc2.weight", "output.dense.weight"),
        ("mlp.linear_fc2.bias", "output.dense.bias"),
        ("mlp.linear_fc1.layer_norm_weight", "LayerNorm.weight"),
        ("mlp.linear_fc1.layer_norm_bias", "LayerNorm.bias"),
    ]
    for nemo_suffix, hf_suffix in layer_suffixes:
        mapping[f"encoder.layers.*.{nemo_suffix}"] = f"esm.encoder.layer.*.{hf_suffix}"

    # LM-head parameters keep identical names on both sides.
    for shared in ("dense.weight", "dense.bias", "layer_norm.weight", "layer_norm.bias"):
        mapping[f"lm_head.{shared}"] = f"lm_head.{shared}"

    return io.apply_transforms(
        nemo_module,
        target,
        mapping=mapping,
        transforms=[_export_qkv_weight, _export_qkv_bias, _export_embedding, _export_bias],
    )

init(dtype=torch.bfloat16)

Initialize the target model.

Source code in bionemo/esm2/model/convert.py
120
121
122
123
def init(self, dtype: torch.dtype = torch.bfloat16) -> EsmForMaskedLM:
    """Initialize the target model.

    Skips random weight initialization — every tensor is overwritten during
    conversion anyway.
    """
    with no_init_weights():
        model = EsmForMaskedLM._from_config(self.config, torch_dtype=dtype)
    return model

HFESM2Importer

Bases: ModelConnector[AutoModelForMaskedLM, BionemoLightningModule]

Converts a Hugging Face ESM-2 model to a NeMo ESM-2 model.

Source code in bionemo/esm2/model/convert.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
@io.model_importer(BionemoLightningModule, "hf")
class HFESM2Importer(io.ModelConnector[AutoModelForMaskedLM, BionemoLightningModule]):
    """Converts a Hugging Face ESM-2 model to a NeMo ESM-2 model.

    Registered under the "hf" tag via ``io.model_importer``, so
    ``io.import_ckpt(module, "hf://...")`` dispatches to this connector.
    """

    def init(self) -> BionemoLightningModule:
        """Initialize the converted model."""
        return biobert_lightning_module(self.config, tokenizer=self.tokenizer)

    def apply(self, output_path: Path) -> Path:
        """Applies the transformation.

        Largely inspired by
        https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/features/hf-integration.html
        """
        # NOTE(review): ``self`` doubles as the HF model tag/path
        # (cf. ``HFAutoConfig.from_pretrained(str(self))`` in ``config``).
        source = AutoModelForMaskedLM.from_pretrained(str(self), trust_remote_code=True, torch_dtype="auto")
        target = self.init()
        trainer = self.nemo_setup(target)
        self.convert_state(source, target)
        self.nemo_save(output_path, trainer)

        print(f"Converted ESM-2 model to Nemo, model saved to {output_path}")

        # Release trainer/model resources before returning.
        teardown(trainer, target)
        del trainer, target

        return output_path

    def convert_state(self, source, target):
        """Converting HF state dict to NeMo state dict.

        Left side: HF ``EsmForMaskedLM`` names; right side: NeMo/Megatron
        names. ``*`` is a per-layer wildcard expanded by
        ``io.apply_transforms``; QKV and embedding tensors are handled by the
        transform functions instead of a direct mapping.
        """
        mapping = {
            # "esm.encoder.layer.0.attention.self.rotary_embeddings.inv_freq": "rotary_pos_emb.inv_freq",
            "esm.encoder.layer.*.attention.output.dense.weight": "encoder.layers.*.self_attention.linear_proj.weight",
            "esm.encoder.layer.*.attention.output.dense.bias": "encoder.layers.*.self_attention.linear_proj.bias",
            "esm.encoder.layer.*.attention.LayerNorm.weight": "encoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
            "esm.encoder.layer.*.attention.LayerNorm.bias": "encoder.layers.*.self_attention.linear_qkv.layer_norm_bias",
            "esm.encoder.layer.*.intermediate.dense.weight": "encoder.layers.*.mlp.linear_fc1.weight",
            "esm.encoder.layer.*.intermediate.dense.bias": "encoder.layers.*.mlp.linear_fc1.bias",
            "esm.encoder.layer.*.output.dense.weight": "encoder.layers.*.mlp.linear_fc2.weight",
            "esm.encoder.layer.*.output.dense.bias": "encoder.layers.*.mlp.linear_fc2.bias",
            "esm.encoder.layer.*.LayerNorm.weight": "encoder.layers.*.mlp.linear_fc1.layer_norm_weight",
            "esm.encoder.layer.*.LayerNorm.bias": "encoder.layers.*.mlp.linear_fc1.layer_norm_bias",
            "esm.encoder.emb_layer_norm_after.weight": "encoder.final_layernorm.weight",
            "esm.encoder.emb_layer_norm_after.bias": "encoder.final_layernorm.bias",
            "lm_head.dense.weight": "lm_head.dense.weight",
            "lm_head.dense.bias": "lm_head.dense.bias",
            "lm_head.layer_norm.weight": "lm_head.layer_norm.weight",
            "lm_head.layer_norm.bias": "lm_head.layer_norm.bias",
        }

        return io.apply_transforms(
            source,
            target,
            mapping=mapping,
            transforms=[_pad_embeddings, _pad_bias, _import_qkv_weight, _import_qkv_bias],
        )

    @property
    def tokenizer(self) -> BioNeMoESMTokenizer:
        """We just have the one tokenizer for ESM-2."""
        return get_tokenizer()

    @property
    def config(self) -> ESM2Config:
        """Returns the transformed ESM-2 config given the model tag."""
        source = HFAutoConfig.from_pretrained(str(self), trust_remote_code=True)
        output = ESM2Config(
            num_layers=source.num_hidden_layers,
            hidden_size=source.hidden_size,
            ffn_hidden_size=source.intermediate_size,
            position_embedding_type="rope",
            num_attention_heads=source.num_attention_heads,
            seq_length=source.max_position_embeddings,
            # Mirror the HF checkpoint's dtype in the NeMo config flags.
            fp16=(dtype_from_hf(source) == torch.float16),
            bf16=(dtype_from_hf(source) == torch.bfloat16),
            params_dtype=dtype_from_hf(source),
        )

        return output

config property

Returns the transformed ESM-2 config given the model tag.

tokenizer property

We just have the one tokenizer for ESM-2.

apply(output_path)

Applies the transformation.

Largely inspired by https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/features/hf-integration.html

Source code in bionemo/esm2/model/convert.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def apply(self, output_path: Path) -> Path:
    """Applies the transformation.

    Largely inspired by
    https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/features/hf-integration.html
    """
    # The connector doubles as the HF model tag/path.
    hf_model = AutoModelForMaskedLM.from_pretrained(str(self), trust_remote_code=True, torch_dtype="auto")
    nemo_model = self.init()
    trainer = self.nemo_setup(nemo_model)

    # Copy the HF weights into the NeMo module, then write the checkpoint.
    self.convert_state(hf_model, nemo_model)
    self.nemo_save(output_path, trainer)

    print(f"Converted ESM-2 model to Nemo, model saved to {output_path}")

    # Release trainer/model resources before returning.
    teardown(trainer, nemo_model)
    del trainer, nemo_model

    return output_path

convert_state(source, target)

Converting HF state dict to NeMo state dict.

Source code in bionemo/esm2/model/convert.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def convert_state(self, source, target):
    """Converting HF state dict to NeMo state dict.

    Builds an HF-name -> NeMo-name mapping (``*`` is a per-layer wildcard
    expanded by ``io.apply_transforms``) and applies the transforms that pad
    embeddings/biases and fuse the separate Q/K/V projections.
    """
    # Rotary inv_freq buffers are intentionally not mapped (see the original
    # commented-out entry: esm...rotary_embeddings.inv_freq -> rotary_pos_emb.inv_freq).
    # Per-transformer-layer parameters: (HF suffix, NeMo suffix).
    layer_suffixes = [
        ("attention.output.dense.weight", "self_attention.linear_proj.weight"),
        ("attention.output.dense.bias", "self_attention.linear_proj.bias"),
        ("attention.LayerNorm.weight", "self_attention.linear_qkv.layer_norm_weight"),
        ("attention.LayerNorm.bias", "self_attention.linear_qkv.layer_norm_bias"),
        ("intermediate.dense.weight", "mlp.linear_fc1.weight"),
        ("intermediate.dense.bias", "mlp.linear_fc1.bias"),
        ("output.dense.weight", "mlp.linear_fc2.weight"),
        ("output.dense.bias", "mlp.linear_fc2.bias"),
        ("LayerNorm.weight", "mlp.linear_fc1.layer_norm_weight"),
        ("LayerNorm.bias", "mlp.linear_fc1.layer_norm_bias"),
    ]
    mapping = {
        f"esm.encoder.layer.*.{hf_suffix}": f"encoder.layers.*.{nemo_suffix}"
        for hf_suffix, nemo_suffix in layer_suffixes
    }

    # Final layer norm lives outside the per-layer stack.
    mapping["esm.encoder.emb_layer_norm_after.weight"] = "encoder.final_layernorm.weight"
    mapping["esm.encoder.emb_layer_norm_after.bias"] = "encoder.final_layernorm.bias"

    # LM-head parameters keep identical names on both sides.
    for shared in ("dense.weight", "dense.bias", "layer_norm.weight", "layer_norm.bias"):
        mapping[f"lm_head.{shared}"] = f"lm_head.{shared}"

    return io.apply_transforms(
        source,
        target,
        mapping=mapping,
        transforms=[_pad_embeddings, _pad_bias, _import_qkv_weight, _import_qkv_bias],
    )

init()

Initialize the converted model.

Source code in bionemo/esm2/model/convert.py
40
41
42
def init(self) -> BionemoLightningModule:
    """Initialize the converted model."""
    # Build the NeMo lightning module from the translated HF config and
    # the (single, fixed) ESM-2 tokenizer.
    module = biobert_lightning_module(self.config, tokenizer=self.tokenizer)
    return module

convert_hf_to_nemo(hf_tag_or_path, output_path, overwrite=True)

Convert a HuggingFace ESM-2 checkpoint to a NeMo ESM-2 checkpoint.

Parameters:

Name Type Description Default
hf_tag_or_path str

Tag or path to the HuggingFace checkpoint.

required
output_path str

Path to the output NeMo checkpoint.

required
overwrite bool

Whether to overwrite the output path if it already exists.

True
Source code in bionemo/esm2/model/convert.py
369
370
371
372
373
374
375
376
377
378
379
@app.command()
def convert_hf_to_nemo(hf_tag_or_path: str, output_path: str, overwrite: bool = True):
    """Convert a HuggingFace ESM-2 checkpoint to a NeMo ESM-2 checkpoint.

    Args:
        hf_tag_or_path: Tag or path to the HuggingFace checkpoint.
        output_path: Path to the output NeMo checkpoint.
        overwrite: Whether to overwrite the output path if it already exists.
    """
    # The "hf://" scheme routes import_ckpt to the registered HF importer.
    lightning_module = biobert_lightning_module(config=ESM2Config(), post_process=True)
    io.import_ckpt(lightning_module, f"hf://{hf_tag_or_path}", Path(output_path), overwrite=overwrite)

convert_nemo_to_hf(nemo_path, output_path, overwrite=True)

Convert a NeMo ESM-2 checkpoint to a HuggingFace checkpoint.

Parameters:

Name Type Description Default
nemo_path str

Path to the NeMo checkpoint.

required
output_path str

Path to the output HuggingFace checkpoint.

required
overwrite bool

Whether to overwrite the output path if it already exists.

True
Source code in bionemo/esm2/model/convert.py
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
@app.command()
def convert_nemo_to_hf(nemo_path: str, output_path: str, overwrite: bool = True):
    """Convert a NeMo ESM-2 checkpoint to a HuggingFace checkpoint.

    Args:
        nemo_path: Path to the NeMo checkpoint.
        output_path: Path to the output HuggingFace checkpoint.
        overwrite: Whether to overwrite the output path if it already exists.
    """
    source_ckpt = Path(nemo_path)
    destination = Path(output_path)
    # The load_connector hook forces the "hf" exporter registered on the module.
    io.export_ckpt(
        source_ckpt,
        "hf",
        destination,
        overwrite=overwrite,
        load_connector=lambda path, ext: BionemoLightningModule.exporter(ext, path),
    )