Train

NsysConfig

Bases: BaseModel

Configuration for nsys profiling.

Source code in bionemo/llm/train.py
class NsysConfig(BaseModel):
    """Configuration for nsys profiling."""

    start_step: int = 0
    end_step: Optional[int] = None
    ranks: list[int] = field(default_factory=lambda: [0])
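
For reference, a minimal sketch of building a profiling window with this model (the step values are illustrative; leaving end_step as None lets setup_trainer below fill it in from training_config.max_steps):

from bionemo.llm.train import NsysConfig

# Profile steps 100-120 on rank 0 only; these values are illustrative.
nsys_config = NsysConfig(start_step=100, end_step=120, ranks=[0])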

nemo_logger_factory(experiment_config, wandb_config)

Creates and returns a NeMoLogger instance configured based on the provided experiment and wandb configurations.

Parameters:

experiment_config (ExperimentConfig): Configuration object containing experiment settings such as result directory, experiment name, checkpoint settings, and logger preferences. Required.

wandb_config (Optional[WandbConfig]): Optional configuration object for Weights and Biases logging. Required.

Returns:

nl.NeMoLogger: An instance of NeMoLogger configured with the specified settings.

Source code in bionemo/llm/train.py
def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger:
    """Creates and returns a NeMoLogger instance configured based on the provided experiment and wandb configurations.

    Args:
        experiment_config (ExperimentConfig): Configuration object containing experiment settings such as
            result directory, experiment name, checkpoint settings, and logger preferences.
        wandb_config (Optional[WandbConfig]): Optional configuration object for Weights and Biases logging.

    Returns:
        nl.NeMoLogger: An instance of NeMoLogger configured with the specified settings.
    """
    checkpoint_callback = nl_callbacks.ModelCheckpoint(
        save_last=experiment_config.save_last_checkpoint,
        monitor=experiment_config.metric_to_monitor_for_checkpoints,
        save_top_k=experiment_config.save_top_k,
        every_n_train_steps=experiment_config.save_every_n_steps,
        always_save_context=True,
        filename="{epoch}-{val_loss:.2f}-{step}-{consumed_samples}",  # Including step and consumed_samples in the checkpoint filename prevents duplicate filenames and bugs related to this.
    )

    nemo_logger = setup_nemo_lightning_logger(
        root_dir=experiment_config.result_dir,
        name=experiment_config.experiment_name,
        initialize_tensorboard_logger=experiment_config.create_tensorboard_logger,
        wandb_config=wandb_config,
        ckpt_callback=checkpoint_callback,
    )
    return nemo_logger
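
A usage sketch follows. The import path for ExperimentConfig and its constructor fields are assumptions inferred from the attributes referenced above; only nemo_logger_factory itself is taken from this module.

from bionemo.llm.train import nemo_logger_factory
# Assumed import path for ExperimentConfig; adjust to your BioNeMo version.
from bionemo.llm.run.config_models import ExperimentConfig

experiment_config = ExperimentConfig(
    save_every_n_steps=100,
    result_dir="./results",
    experiment_name="demo-pretrain",
    metric_to_monitor_for_checkpoints="val_loss",
    save_top_k=2,
    save_last_checkpoint=True,
    create_tensorboard_logger=True,
)

# Pass wandb_config=None to skip Weights and Biases logging entirely.
nemo_logger = nemo_logger_factory(experiment_config, wandb_config=None)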

setup_trainer(parallel_config, training_config, callbacks=None, nsys_config=None)

Set up the trainer for model training using the specified parallel and training configurations.

Parameters:

parallel_config (ParallelConfig): Configuration for parallelism, including tensor and pipeline model parallel sizes, number of devices, and number of nodes. Required.

training_config (TrainingConfig): Configuration for training, including maximum steps, accelerator type, validation batch limit, validation check interval, and precision. Required.

callbacks (list, optional): List of callback functions to be used during training. Defaults to None, in which case the default callbacks (RichModelSummary and LearningRateMonitor) are used.

nsys_config (NsysConfig, optional): Configuration for nsys profiling. If None, profiling is disabled. Defaults to None.

Returns:

nl.Trainer: Configured trainer object ready for model training.

Source code in bionemo/llm/train.py
def setup_trainer(
    parallel_config: ParallelConfig,
    training_config: TrainingConfig,
    callbacks=None,
    nsys_config: NsysConfig | None = None,
) -> nl.Trainer:
    """Set up the trainer for model training using the specified parallel and training configurations.

    Args:
        parallel_config (ParallelConfig): Configuration for parallelism, including tensor and pipeline model parallel sizes,
                                          number of devices, and number of nodes.
        training_config (TrainingConfig): Configuration for training, including maximum steps, accelerator type,
                                          validation batch limit, validation check interval, and precision.
        callbacks (list, optional): List of callback functions to be used during training. Defaults to None,
                                    in which case default callbacks (RichModelSummary and LearningRateMonitor) are used.
        nsys_config (NsysConfig, optional): Configuration for nsys profiling. If None, profiling is disabled.

    Returns:
        nl.Trainer: Configured trainer object ready for model training.
    """
    strategy = nl.MegatronStrategy(
        tensor_model_parallel_size=parallel_config.tensor_model_parallel_size,
        pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size,
        ddp="megatron",
        find_unused_parameters=True,
        ckpt_include_optimizer=True,
        # NOTE: there are issues related to async that may occur, most recently observed due to duplicate filenames.
        ckpt_async_save=True,
        ckpt_parallel_load=True,
    )
    if callbacks is None:
        callbacks = [
            RichModelSummary(max_depth=4),
            LearningRateMonitor(),
        ]

    if training_config.include_perplexity:
        callbacks.append(PerplexityLoggingCallback())

    if training_config.gc_interval > 0:
        callbacks.append(
            nl_callbacks.GarbageCollectionCallback(
                gc_interval_train=training_config.gc_interval, gc_interval_val=training_config.gc_interval
            )
        )

    if nsys_config:
        if nsys_config.end_step is None:
            nsys_config.end_step = training_config.max_steps
        callbacks.append(
            nl_callbacks.NsysCallback(
                start_step=nsys_config.start_step,
                end_step=nsys_config.end_step,
                ranks=nsys_config.ranks,
                gen_shape=True,
            )
        )

    trainer = nl.Trainer(
        devices=parallel_config.num_devices,
        max_steps=training_config.max_steps,
        accelerator=training_config.accelerator,
        strategy=strategy,
        limit_val_batches=training_config.limit_val_batches,
        val_check_interval=training_config.val_check_interval,
        num_nodes=parallel_config.num_nodes,
        callbacks=callbacks,
        plugins=nl.MegatronMixedPrecision(precision=training_config.precision),
    )
    return trainer
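
A usage sketch, assuming ParallelConfig and TrainingConfig accept the fields referenced in the body above as constructor keywords (the import path for the config classes is likewise an assumption):

from bionemo.llm.train import NsysConfig, setup_trainer
# Assumed import path for the config classes; adjust to your BioNeMo version.
from bionemo.llm.run.config_models import ParallelConfig, TrainingConfig

parallel_config = ParallelConfig(
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    num_devices=8,
    num_nodes=1,
)
training_config = TrainingConfig(
    max_steps=1000,
    accelerator="gpu",
    limit_val_batches=4,
    val_check_interval=100,
    precision="bf16-mixed",
)

# Profile steps 100-120 with nsys; omit nsys_config (or pass None) to disable profiling.
trainer = setup_trainer(
    parallel_config,
    training_config,
    nsys_config=NsysConfig(start_step=100, end_step=120),
)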

train(bionemo_exposed_model_config, data_config, parallel_config, training_config, optim_config, experiment_config, wandb_config, nsys_config=None, resume_if_exists=True)

Train a BioNemo model using the provided configurations. Uses the ExposedModelConfig and DataConfig as the primary variants for this method.

Parameters:

bionemo_exposed_model_config (ExposedModelConfig): Configuration for the exposed BioNemo model. Required.

data_config (DataConfig[DataModuleT]): Configuration for the data module. Required.

parallel_config (ParallelConfig): Configuration for parallel training. Required.

training_config (TrainingConfig): Configuration for training parameters. Required.

optim_config (OptimizerSchedulerConfig): Configuration for the optimizer and scheduler. Required.

experiment_config (ExperimentConfig): Configuration for the experiment. Required.

wandb_config (Optional[WandbConfig]): Configuration for Weights and Biases logging. Required.

nsys_config (Optional[NsysConfig], optional): Configuration for nsys profiling. If None, profiling is disabled. Defaults to None.

resume_if_exists (bool, optional): Flag to resume training if a checkpoint exists. Defaults to True.
Source code in bionemo/llm/train.py
def train(
    bionemo_exposed_model_config: ExposedModelConfig,
    data_config: DataConfig[DataModuleT],
    parallel_config: ParallelConfig,
    training_config: TrainingConfig,
    optim_config: OptimizerSchedulerConfig,
    experiment_config: ExperimentConfig,
    wandb_config: Optional[WandbConfig],
    nsys_config: Optional[NsysConfig] = None,
    resume_if_exists: bool = True,
):
    """Train a BioNemo model using the provided configurations. Uses the ExposedModelConfig and DataConfig as the primary variants for this method.

    Args:
        bionemo_exposed_model_config (ExposedModelConfig): Configuration for the exposed BioNemo model.
        data_config (DataConfig[DataModuleT]): Configuration for the data module.
        parallel_config (ParallelConfig): Configuration for parallel training.
        training_config (TrainingConfig): Configuration for training parameters.
        optim_config (OptimizerSchedulerConfig): Configuration for the optimizer and scheduler.
        experiment_config (ExperimentConfig): Configuration for the experiment.
        wandb_config (Optional[WandbConfig]): Configuration for Weights and Biases logging.
        nsys_config (Optional[NsysConfig], optional): Configuration for nsys profiling. If None, profiling is disabled.
        resume_if_exists (bool, optional): Flag to resume training if a checkpoint exists. Defaults to True.
    """
    bionemo_model_config = bionemo_exposed_model_config.exposed_to_internal_bionemo_model_config()
    pathlib.Path(data_config.result_dir).mkdir(parents=True, exist_ok=True)

    if experiment_config.save_every_n_steps != training_config.val_check_interval:
        logging.warning("Mutating training_config.save_every_n_steps to be equal to val_check_interval.")
        experiment_config.save_every_n_steps = training_config.val_check_interval

    global_batch_size = infer_global_batch_size(
        micro_batch_size=data_config.micro_batch_size,
        num_nodes=parallel_config.num_nodes,
        devices=parallel_config.num_devices,
        accumulate_grad_batches=parallel_config.accumulate_grad_batches,
        tensor_model_parallel_size=parallel_config.tensor_model_parallel_size,
        pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size,
    )

    data: DataModuleT = data_config.construct_data_module(global_batch_size)
    # TODO BioBertDataModule or BioBertTokenizer abstractions. We know all DataModuleT in this case has data.tokenizer,
    # although this constraint is not documented.

    # TODO: need an abstraction for LrSchedulerConfig
    if optim_config.lr_scheduler == "cosine":
        lr_scheduler = CosineAnnealingScheduler(
            max_steps=training_config.max_steps if optim_config.max_steps is None else optim_config.max_steps,
            min_lr=optim_config.lr / 100,
            warmup_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_rampup_frac)),
            interval=optim_config.interval,
            monitor=optim_config.monitor,
            constant_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_hold_frac)),
        )
    elif optim_config.lr_scheduler == "warmup_anneal":
        lr_scheduler = WarmupAnnealDecayHoldScheduler(
            warmup_steps=optim_config.warmup_steps,
            max_steps=training_config.max_steps if optim_config.max_steps is None else optim_config.max_steps,
            max_lr=optim_config.lr,
            min_lr=optim_config.lr / 10.0,
            anneal_percentage=0.10,
        )
    else:
        raise NotImplementedError(f"Scheduler {optim_config.lr_scheduler} not implemented.")

    optimizer = MegatronOptimizerModule(
        config=OptimizerConfig(
            lr=optim_config.lr,
            optimizer=optim_config.optimizer,
            use_distributed_optimizer=True,
            fp16=bionemo_model_config.fp16,
            bf16=bionemo_model_config.bf16,
        ),
        lr_scheduler=lr_scheduler,
    )

    model: BionemoLightningModule = biobert_lightning_module(
        config=bionemo_model_config, tokenizer=data.tokenizer, optimizer=optimizer
    )
    trainer: nl.Trainer = setup_trainer(parallel_config, training_config, nsys_config=nsys_config)
    nemo_logger: nl.NeMoLogger = nemo_logger_factory(experiment_config, wandb_config=wandb_config)

    llm.train(
        model=model,
        data=data,
        trainer=trainer,
        log=nemo_logger,
        resume=resume.AutoResume(
            resume_if_exists=resume_if_exists,
            resume_ignore_no_checkpoint=True,
        ),
    )
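
Putting the pieces together, a schematic call is shown below. It is a sketch rather than a runnable script: concrete ExposedModelConfig and DataConfig subclasses are model-specific (for example, the ESM-2 and Geneformer recipes provide their own), so the two placeholder variables stand in for instances built elsewhere, and the OptimizerSchedulerConfig fields are assumptions based on the attributes read in the body above.

from bionemo.llm.train import train
# Assumed import path; adjust to your BioNeMo version.
from bionemo.llm.run.config_models import OptimizerSchedulerConfig

train(
    bionemo_exposed_model_config=my_exposed_model_config,  # placeholder: an ExposedModelConfig subclass instance
    data_config=my_data_config,                            # placeholder: a DataConfig[DataModuleT] subclass instance
    parallel_config=parallel_config,                       # from the setup_trainer sketch above
    training_config=training_config,
    optim_config=OptimizerSchedulerConfig(lr=4e-4, lr_scheduler="cosine"),
    experiment_config=experiment_config,                   # from the nemo_logger_factory sketch above
    wandb_config=None,                                     # pass a WandbConfig to enable W&B logging
    nsys_config=None,                                      # profiling disabled
    resume_if_exists=True,                                 # resume from the latest checkpoint in result_dir if present
)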