
Finetune regressor

ESM2FineTuneSeqConfig dataclass

Bases: ESM2GenericConfig[ESM2FineTuneSeqModel, RegressorLossReduction], IOMixinWithGettersSetters

ESM2FineTuneSeqConfig is a dataclass used to configure the ESM2 model for sequence-level regression fine-tuning.

Timers from ModelParallelConfig are required for megatron forward compatibility.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
@dataclass
class ESM2FineTuneSeqConfig(
    ESM2GenericConfig[ESM2FineTuneSeqModel, RegressorLossReduction], iom.IOMixinWithGettersSetters
):
    """ExampleConfig is a dataclass that is used to configure the model.

    Timers from ModelParallelConfig are required for megatron forward compatibility.
    """

    model_cls: Type[ESM2FineTuneSeqModel] = ESM2FineTuneSeqModel
    # typical case is fine-tune the base biobert that doesn't have this head. If you are instead loading a checkpoint
    # that has this new head and want to keep using these weights, please drop this next line or set to []
    initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=lambda: ["regression_head"])

    encoder_frozen: bool = True  # freeze encoder parameters
    ft_dropout: float = 0.25  # MLP layer dropout

    def get_loss_reduction_class(self) -> Type[RegressorLossReduction]:
        """Returns RegressorLossReduction class."""
        return RegressorLossReduction

get_loss_reduction_class()

Returns RegressorLossReduction class.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def get_loss_reduction_class(self) -> Type[RegressorLossReduction]:
    """Returns RegressorLossReduction class."""
    return RegressorLossReduction
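
A minimal usage sketch of this config is shown below. The fields set here (`encoder_frozen`, `ft_dropout`) are documented above; `initial_ckpt_path` is an assumed field inherited from `ESM2GenericConfig` for pointing at a pretrained ESM2 checkpoint, and the sketch assumes the remaining inherited fields keep their defaults. Verify the exact field names against your installed BioNeMo version.

```python
# Hedged sketch: configure ESM2 for sequence-level regression fine-tuning.
# `initial_ckpt_path` is assumed to be inherited from ESM2GenericConfig.
config = ESM2FineTuneSeqConfig(
    initial_ckpt_path="/path/to/pretrained/esm2/checkpoint",  # assumed field name
    encoder_frozen=True,   # keep ESM2 encoder weights fixed; train only the head
    ft_dropout=0.25,       # dropout inside the MegatronMLPHead MLP
)

# The loss reduction class used during training:
loss_cls = config.get_loss_reduction_class()  # -> RegressorLossReduction
```

Because `initial_ckpt_skip_keys_with_these_prefixes` defaults to `["regression_head"]`, the regression head is initialized fresh rather than loaded from the base checkpoint; set it to `[]` if your checkpoint already contains a trained head.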

ESM2FineTuneSeqModel

Bases: ESM2Model

ESM2 model that is suitable for fine-tuning on downstream tasks.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
class ESM2FineTuneSeqModel(ESM2Model):
    """ESM2 model that is suitable for fine-tuning on downstream tasks."""

    def __init__(self, config, *args, post_process: bool = True, include_embeddings: bool = False, **kwargs):
        """Constructs an instance of the ESM2 model suitable for fine-tuning."""
        super().__init__(config, *args, post_process=post_process, include_embeddings=True, **kwargs)

        # freeze encoder parameters
        if config.encoder_frozen:
            for _, param in self.named_parameters():
                param.requires_grad = False

        self.include_embeddings_finetuning = (
            include_embeddings  # this include_embeddings is for the final output of fine-tuning
        )
        # If post_process is True that means that we are at the last megatron parallelism stage and we can
        #   apply the head.
        if post_process:
            # if we are doing post process (eg pipeline last stage) then we need to add the output layers
            self.regression_head = MegatronMLPHead(config)

    def forward(self, *args, **kwargs) -> BioBertOutput | Tensor:
        """Inference."""
        output = super().forward(*args, **kwargs)
        # Stop early if we are not in post_process mode (for example if we are in the middle of model parallelism)
        if not self.post_process:
            return output  # we are not at the last pipeline stage so just return what the parent has
        # Double check that the output from the parent has everything we need to do prediction in this head.
        if not isinstance(output, dict) or "embeddings" not in output:
            raise ValueError(
                f"Expected to find 'embeddings' in the output, and output to be dictionary-like, found {output},\n"
                "Make sure include_embeddings=True in the call to super().__init__"
            )
        # Get the embeddings from the parent output, and pull out the [CLS] token for this task
        embeddings: Tensor = output["embeddings"]
        # Predict our 1d regression target
        regression_output = self.regression_head(embeddings)
        if not self.include_embeddings_finetuning:
            del output["embeddings"]
        output["regression_output"] = regression_output
        return output

__init__(config, *args, post_process=True, include_embeddings=False, **kwargs)

Constructs an instance of the ESM2 model suitable for fine-tuning.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def __init__(self, config, *args, post_process: bool = True, include_embeddings: bool = False, **kwargs):
    """Constructs an instance of the ESM2 model suitable for fine-tuning."""
    super().__init__(config, *args, post_process=post_process, include_embeddings=True, **kwargs)

    # freeze encoder parameters
    if config.encoder_frozen:
        for _, param in self.named_parameters():
            param.requires_grad = False

    self.include_embeddings_finetuning = (
        include_embeddings  # this include_embeddings is for the final output of fine-tuning
    )
    # If post_process is True that means that we are at the last megatron parallelism stage and we can
    #   apply the head.
    if post_process:
        # if we are doing post process (eg pipeline last stage) then we need to add the output layers
        self.regression_head = MegatronMLPHead(config)

forward(*args, **kwargs)

Inference.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def forward(self, *args, **kwargs) -> BioBertOutput | Tensor:
    """Inference."""
    output = super().forward(*args, **kwargs)
    # Stop early if we are not in post_process mode (for example if we are in the middle of model parallelism)
    if not self.post_process:
        return output  # we are not at the last pipeline stage so just return what the parent has
    # Double check that the output from the parent has everything we need to do prediction in this head.
    if not isinstance(output, dict) or "embeddings" not in output:
        raise ValueError(
            f"Expected to find 'embeddings' in the output, and output to be dictionary-like, found {output},\n"
            "Make sure include_embeddings=True in the call to super().__init__"
        )
    # Get the embeddings from the parent output, and pull out the [CLS] token for this task
    embeddings: Tensor = output["embeddings"]
    # Predict our 1d regression target
    regression_output = self.regression_head(embeddings)
    if not self.include_embeddings_finetuning:
        del output["embeddings"]
    output["regression_output"] = regression_output
    return output
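
The sketch below shows what a caller sees from `forward` at the last pipeline stage. The output keys come from the code above; the input keyword names (`input_ids`, `attention_mask`) follow the usual BERT-style convention and are an assumption about the parent model's signature.

```python
# Hedged sketch: consuming ESM2FineTuneSeqModel.forward output at the last
# pipeline stage. Input keyword names are assumed; output keys match the code above.
output = model(input_ids=batch["text"], attention_mask=batch["attention_mask"])

predictions = output["regression_output"]   # [b, 1] regression predictions
if "embeddings" in output:                  # present only if include_embeddings=True
    embeddings = output["embeddings"]       # sequence-level embeddings from ESM2
```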

InMemorySingleValueDataset

Bases: Dataset

An in-memory dataset that tokenizes strings into BertSample instances.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
class InMemorySingleValueDataset(Dataset):
    """An in-memory dataset that tokenizes strings into BertSample instances."""

    def __init__(
        self,
        data: Sequence[Tuple[str, float]],
        tokenizer: tokenizer.BioNeMoESMTokenizer = tokenizer.get_tokenizer(),
        seed: int = np.random.SeedSequence().entropy,  # type: ignore
    ):
        """Initializes a dataset for single-value regression fine-tuning.

        This is an in-memory dataset that does not apply masking to the sequence.

        Args:
            data (Sequence[Tuple[str, float]]): A sequence of tuples containing the sequence and target data.
            tokenizer (tokenizer.BioNeMoESMTokenizer, optional): The tokenizer to use. Defaults to tokenizer.get_tokenizer().
            seed: Random seed for reproducibility. This seed is mixed with the index of the sample to retrieve to ensure
                that __getitem__ is deterministic, but can be random across different runs. If None, a random seed is
                generated.
        """
        self.data = data
        self.seed = seed
        self._len = len(self.data)
        self.tokenizer = tokenizer

    def __len__(self) -> int:
        """The size of the dataset."""
        return self._len

    def __getitem__(self, index: int) -> BertSample:
        """Obtains the BertSample at the given index."""
        sequence, target = self.data[index]
        tokenized_sequence = self._tokenize(sequence)
        # Mark the real sequence tokens: loss_mask is True for regular tokens and False for special tokens (e.g. BOS/EOS/PAD)
        loss_mask = ~torch.isin(tokenized_sequence, Tensor(self.tokenizer.all_special_ids))

        return {
            "text": tokenized_sequence,
            "types": torch.zeros_like(tokenized_sequence, dtype=torch.int64),
            "attention_mask": torch.ones_like(tokenized_sequence, dtype=torch.int64),
            "labels": torch.tensor([target], dtype=torch.float),
            "loss_mask": loss_mask,
            "is_random": torch.zeros_like(tokenized_sequence, dtype=torch.int64),
        }

    def _tokenize(self, sequence: str) -> Tensor:
        """Tokenize a protein sequence.

        Args:
            sequence: The protein sequence.

        Returns:
            The tokenized sequence.
        """
        tensor = self.tokenizer.encode(sequence, add_special_tokens=True, return_tensors="pt")
        return tensor.flatten()  # type: ignore

__getitem__(index)

Obtains the BertSample at the given index.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def __getitem__(self, index: int) -> BertSample:
    """Obtains the BertSample at the given index."""
    sequence, target = self.data[index]
    tokenized_sequence = self._tokenize(sequence)
    # Mark the real sequence tokens: loss_mask is True for regular tokens and False for special tokens (e.g. BOS/EOS/PAD)
    loss_mask = ~torch.isin(tokenized_sequence, Tensor(self.tokenizer.all_special_ids))

    return {
        "text": tokenized_sequence,
        "types": torch.zeros_like(tokenized_sequence, dtype=torch.int64),
        "attention_mask": torch.ones_like(tokenized_sequence, dtype=torch.int64),
        "labels": torch.tensor([target], dtype=torch.float),
        "loss_mask": loss_mask,
        "is_random": torch.zeros_like(tokenized_sequence, dtype=torch.int64),
    }

__init__(data, tokenizer=tokenizer.get_tokenizer(), seed=np.random.SeedSequence().entropy)

Initializes a dataset for single-value regression fine-tuning.

This is an in-memory dataset that does not apply masking to the sequence.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `Sequence[Tuple[str, float]]` | A sequence of tuples containing the sequence and target data. | required |
| `tokenizer` | `tokenizer.BioNeMoESMTokenizer` | The tokenizer to use. | `tokenizer.get_tokenizer()` |
| `seed` | `int` | Random seed for reproducibility. This seed is mixed with the index of the sample to retrieve to ensure that `__getitem__` is deterministic, but can be random across different runs. If None, a random seed is generated. | `np.random.SeedSequence().entropy` |
Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def __init__(
    self,
    data: Sequence[Tuple[str, float]],
    tokenizer: tokenizer.BioNeMoESMTokenizer = tokenizer.get_tokenizer(),
    seed: int = np.random.SeedSequence().entropy,  # type: ignore
):
    """Initializes a dataset for single-value regression fine-tuning.

    This is an in-memory dataset that does not apply masking to the sequence.

    Args:
        data (Sequence[Tuple[str, float]]): A sequence of tuples containing the sequence and target data.
        tokenizer (tokenizer.BioNeMoESMTokenizer, optional): The tokenizer to use. Defaults to tokenizer.get_tokenizer().
        seed: Random seed for reproducibility. This seed is mixed with the index of the sample to retrieve to ensure
            that __getitem__ is deterministic, but can be random across different runs. If None, a random seed is
            generated.
    """
    self.data = data
    self.seed = seed
    self._len = len(self.data)
    self.tokenizer = tokenizer

__len__()

The size of the dataset.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def __len__(self) -> int:
    """The size of the dataset."""
    return self._len
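
The sketch below builds a small dataset from (protein sequence, target value) pairs using the documented constructor. It assumes the default tokenizer can be loaded in your environment; the sequences and target values are made up for illustration.

```python
# Hedged sketch: building an InMemorySingleValueDataset from (sequence, target) pairs.
data = [
    ("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", 0.52),
    ("MSILVTRPSPAGEELVSRLRTLGQVAWHFPLIE", -1.30),
]
dataset = InMemorySingleValueDataset(data)  # uses tokenizer.get_tokenizer() by default

sample = dataset[0]
print(sample["text"].shape)   # tokenized sequence, including special tokens
print(sample["labels"])       # tensor([0.5200]) -- the single regression target
```

Each item is a `BertSample`-style dict whose `loss_mask` excludes special tokens and whose `labels` tensor holds the single float target.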

MegatronMLPHead

Bases: MegatronModule

An MLP class for sequence-level regression.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
class MegatronMLPHead(MegatronModule):
    """An MLP class for sequence-level regression."""

    def __init__(self, config: TransformerConfig):
        """Constructor."""
        super().__init__(config)

        layer_sizes = [config.hidden_size, 256, 1]
        self.linear_layers = torch.nn.ModuleList(
            [torch.nn.Linear(i, o) for i, o in zip(layer_sizes[:-1], layer_sizes[1:])]  # noqa: RUF007
        )
        self.act = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(p=config.ft_dropout)

    def forward(self, hidden_states: Tensor) -> List[Tensor]:
        """Inference."""
        # [b, s, h]
        for layer in self.linear_layers[:-1]:
            hidden_states = self.dropout(self.act(layer(hidden_states)))

        output = self.linear_layers[-1](hidden_states)
        return output

__init__(config)

Constructor.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def __init__(self, config: TransformerConfig):
    """Constructor."""
    super().__init__(config)

    layer_sizes = [config.hidden_size, 256, 1]
    self.linear_layers = torch.nn.ModuleList(
        [torch.nn.Linear(i, o) for i, o in zip(layer_sizes[:-1], layer_sizes[1:])]  # noqa: RUF007
    )
    self.act = torch.nn.ReLU()
    self.dropout = torch.nn.Dropout(p=config.ft_dropout)

forward(hidden_states)

Inference.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def forward(self, hidden_states: Tensor) -> List[Tensor]:
    """Inference."""
    # [b, s, h]
    for layer in self.linear_layers[:-1]:
        hidden_states = self.dropout(self.act(layer(hidden_states)))

    output = self.linear_layers[-1](hidden_states)
    return output
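
For clarity, the head above is equivalent to the plain-PyTorch sketch below: `hidden_size -> 256 -> 1`, with ReLU and dropout applied between (but not after) the linear layers. The `hidden_size` of 1280 is an assumption (the ESM2-650M width); the real class additionally inherits the `MegatronModule` plumbing.

```python
import torch

# Plain-PyTorch sketch of MegatronMLPHead (assumed hidden_size=1280).
class MLPHeadSketch(torch.nn.Module):
    def __init__(self, hidden_size: int = 1280, dropout: float = 0.25):
        super().__init__()
        self.hidden = torch.nn.Linear(hidden_size, 256)
        self.out = torch.nn.Linear(256, 1)
        self.act = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(self.act(self.hidden(hidden_states)))
        return self.out(hidden_states)  # [b, 1] regression output

head = MLPHeadSketch()
print(head(torch.randn(4, 1280)).shape)  # torch.Size([4, 1])
```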

RegressorLossReduction

Bases: BERTMLMLossWithReduction

A class for calculating the MSE loss of regression output.

This class is used for calculating the loss and for logging the reduced loss across micro-batches.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
class RegressorLossReduction(BERTMLMLossWithReduction):
    """A class for calculating the MSE loss of regression output.

    This class is used for calculating the loss and for logging the reduced loss across micro-batches.
    """

    def forward(
        self, batch: Dict[str, Tensor], forward_out: Dict[str, Tensor]
    ) -> Tuple[Tensor, PerTokenLossDict | SameSizeLossDict]:
        """Calculates the loss within a micro-batch. A micro-batch is a batch of data on a single GPU.

        Args:
            batch: A batch of data that gets passed to the original forward inside LitAutoEncoder.
            forward_out: the output of the forward method inside classification head.

        Returns:
            A tuple containing [<loss_tensor>, ReductionT] where the loss tensor will be used for
                backpropagation and the ReductionT will be passed to the reduce method
                (which currently only works for logging.).
        """
        regression_output = forward_out["regression_output"]
        targets = batch["labels"].to(dtype=regression_output.dtype)  # [b, 1]

        cp_size = parallel_state.get_context_parallel_world_size()
        if cp_size == 1:
            loss = torch.nn.functional.mse_loss(regression_output, targets)
        else:  # TODO: support CP with masked_token_loss_context_parallel
            raise NotImplementedError("Context Parallel support is not implemented for this loss")

        return loss, {"avg": loss}

    def reduce(self, losses_reduced_per_micro_batch: Sequence[SameSizeLossDict]) -> Tensor:
        """Works across micro-batches. (data on single gpu).

        Note: This currently only works for logging and this loss will not be used for backpropagation.

        Args:
            losses_reduced_per_micro_batch: a list of the outputs of forward

        Returns:
            A tensor that is the mean of the losses. (used for logging).
        """
        losses = torch.stack([loss["avg"] for loss in losses_reduced_per_micro_batch])
        return losses.mean()

forward(batch, forward_out)

Calculates the loss within a micro-batch. A micro-batch is a batch of data on a single GPU.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `batch` | `Dict[str, Tensor]` | A batch of data that gets passed to the original forward inside LitAutoEncoder. | required |
| `forward_out` | `Dict[str, Tensor]` | The output of the forward method inside the classification head. | required |

Returns:

| Type | Description |
| --- | --- |
| `Tuple[Tensor, PerTokenLossDict \| SameSizeLossDict]` | A tuple containing `[<loss_tensor>, ReductionT]` where the loss tensor will be used for backpropagation and the ReductionT will be passed to the reduce method (which currently only works for logging). |

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def forward(
    self, batch: Dict[str, Tensor], forward_out: Dict[str, Tensor]
) -> Tuple[Tensor, PerTokenLossDict | SameSizeLossDict]:
    """Calculates the loss within a micro-batch. A micro-batch is a batch of data on a single GPU.

    Args:
        batch: A batch of data that gets passed to the original forward inside LitAutoEncoder.
        forward_out: the output of the forward method inside classification head.

    Returns:
        A tuple containing [<loss_tensor>, ReductionT] where the loss tensor will be used for
            backpropagation and the ReductionT will be passed to the reduce method
            (which currently only works for logging.).
    """
    regression_output = forward_out["regression_output"]
    targets = batch["labels"].to(dtype=regression_output.dtype)  # [b, 1]

    cp_size = parallel_state.get_context_parallel_world_size()
    if cp_size == 1:
        loss = torch.nn.functional.mse_loss(regression_output, targets)
    else:  # TODO: support CP with masked_token_loss_context_parallel
        raise NotImplementedError("Context Parallel support is not implemented for this loss")

    return loss, {"avg": loss}

reduce(losses_reduced_per_micro_batch)

Works across micro-batches. (data on single gpu).

Note: This currently only works for logging and this loss will not be used for backpropagation.

Parameters:

Name Type Description Default
losses_reduced_per_micro_batch Sequence[SameSizeLossDict]

a list of the outputs of forward

required

Returns:

Type Description
Tensor

A tensor that is the mean of the losses. (used for logging).

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def reduce(self, losses_reduced_per_micro_batch: Sequence[SameSizeLossDict]) -> Tensor:
    """Works across micro-batches. (data on single gpu).

    Note: This currently only works for logging and this loss will not be used for backpropagation.

    Args:
        losses_reduced_per_micro_batch: a list of the outputs of forward

    Returns:
        A tensor that is the mean of the losses. (used for logging).
    """
    losses = torch.stack([loss["avg"] for loss in losses_reduced_per_micro_batch])
    return losses.mean()
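
The arithmetic of `forward` and `reduce` is ordinary MSE followed by a mean across micro-batches. The self-contained sketch below reproduces it with dummy tensors (all values are illustrative and the context-parallel branch is ignored).

```python
import torch

# Per-micro-batch loss, as in RegressorLossReduction.forward (no context parallelism):
regression_output = torch.tensor([[0.4], [1.2]])   # model predictions, [b, 1]
targets = torch.tensor([[0.5], [1.0]])             # labels from the batch, [b, 1]
loss = torch.nn.functional.mse_loss(regression_output, targets)
per_micro_batch = [{"avg": loss}, {"avg": torch.tensor(0.02)}]  # pretend second micro-batch

# Reduction across micro-batches, as in RegressorLossReduction.reduce (logging only):
logged = torch.stack([d["avg"] for d in per_micro_batch]).mean()
print(logged)
```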