Generate random microbatches for testing.
Note that this follows the convention that `token_logits` is laid out as (sequence, batch), while all other fields are (batch, sequence).
Source code in bionemo/testing/lightning.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def get_random_microbatch(
    microbatch_size: int,
    max_sequence_length: int,
    vocab_size: int,
    seed: int,
    device: "torch.device | str | None" = None,
) -> Dict[str, Dict[str, torch.Tensor]]:
    """Generate a random microbatch for testing.

    Note that this follows the convention that token_logits are (s, b), while
    other fields are (b, s).

    Args:
        microbatch_size: Number of sequences in the batch (b).
        max_sequence_length: Length of each sequence (s).
        vocab_size: Vocabulary size (v); labels are drawn from [0, vocab_size).
        seed: Seed for the RNG so the generated batch is reproducible.
        device: Device on which to allocate the tensors. Defaults to the
            current CUDA device, preserving the original behavior; pass
            "cpu" to run without a GPU.

    Returns:
        A dict with two keys:
            "batch": {"labels": [b, s] long, "loss_mask": [b, s] long}
            "forward_out": {"token_logits": [s, b, v] float}
    """
    if device is None:
        device = torch.cuda.current_device()
    generator = torch.Generator(device=device).manual_seed(seed)
    labels = torch.randint(
        low=0,
        high=vocab_size,
        size=(microbatch_size, max_sequence_length),
        generator=generator,
        device=device,
    )  # [b s]
    # NOTE: low=1, high=2 means every entry is 1 — the mask is all-ones, i.e.
    # no position is masked out. Kept as-is to preserve existing behavior.
    loss_mask = torch.randint(
        low=1,
        high=1 + 1,
        size=(microbatch_size, max_sequence_length),
        dtype=torch.long,
        device=device,
        generator=generator,
    )  # [b s]
    token_logits = torch.rand(
        max_sequence_length, microbatch_size, vocab_size, device=device, generator=generator
    )  # [s b v]
    # Propagate masking to labels; a no-op while the mask is all ones (see NOTE above).
    labels[loss_mask == 0] = -100
    microbatch_output = {
        "batch": {"labels": labels, "loss_mask": loss_mask},
        "forward_out": {"token_logits": token_logits},
    }
    return microbatch_output
|