LR scheduler

SchedulerOutput

Bases: TypedDict

Output of the scheduler method.

Source code in bionemo/llm/model/lr_scheduler.py
class SchedulerOutput(TypedDict):
    """Output of the scheduler method."""

    optimizer: MegatronOptimizerModule
    lr_scheduler: dict
    monitor: str
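
The example below is a minimal sketch of the dict shape that SchedulerOutput describes. A TypedDict adds no runtime behavior; it only names the expected keys for type checkers. The plain torch optimizer and ConstantLR scheduler are illustrative stand-ins, not the Megatron classes referenced in the annotations.

import torch

# Illustrative stand-ins; real usage passes a MegatronOptimizerModule and a
# WarmupAnnealDecayHold instance (see WarmupAnnealDecayHoldScheduler.scheduler below).
optimizer = torch.optim.AdamW(torch.nn.Linear(4, 4).parameters(), lr=4e-4)
output = {
    "optimizer": optimizer,
    "lr_scheduler": {
        "scheduler": torch.optim.lr_scheduler.ConstantLR(optimizer),
        "interval": "step",
        "frequency": 1,
    },
    "monitor": "val_loss",
}
print(sorted(output.keys()))  # ['lr_scheduler', 'monitor', 'optimizer']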

WarmupAnnealDecayHold

Bases: _LRScheduler

Warmup Anneal Decay Hold learning rate scheduler.

Source code in bionemo/llm/model/lr_scheduler.py
class WarmupAnnealDecayHold(_LRScheduler):
    """Warmup Anneal Decay Hold learning rate scheduler."""

    def __init__(
        self,
        optimizer: MegatronOptimizerModule,
        *,
        warmup_steps: Optional[int] = None,
        max_steps: Optional[int] = None,
        max_lr: Optional[float] = None,
        min_lr: float = 4e-5,
        anneal_percentage: float = 0.10,
        last_epoch: int = -1,
    ) -> None:
        """Initializes the WarmupAnnealDecayHold learning rate scheduler.

        Args:
            optimizer: Optimizer to apply the learning rate scheduler.
            warmup_steps (int): Number of steps for the linear warm-up.
            max_steps (int): Total number of training steps.
            max_lr (float): Peak learning rate to be achieved after warm-up.
            min_lr (float): Minimum learning rate.
            anneal_percentage (float): Percentage of the max_lr to hold after decay.
            last_epoch (int): The index of the last epoch.
        """
        self.warmup_steps = warmup_steps
        self.max_steps = max_steps
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.anneal_percentage = anneal_percentage
        self.last_epoch = last_epoch

        for group in optimizer.param_groups:
            group.setdefault("initial_lr", max_lr)

        super(WarmupAnnealDecayHold, self).__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        """Get the learning rate at the current step."""
        step_num = self.last_epoch
        if step_num < self.warmup_steps:
            lr = self.min_lr + (self.max_lr - self.min_lr) * step_num / self.warmup_steps
        else:
            decay_steps = self.max_steps - self.warmup_steps
            lr = self.max_lr * (1 - (step_num - self.warmup_steps) / decay_steps)
            lr = max(lr, self.max_lr * self.anneal_percentage)

        return [lr for _ in self.optimizer.param_groups]
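
As the source above shows, the schedule has three phases: a linear warm-up from min_lr to max_lr over warmup_steps, a linear decay from max_lr over the remaining steps, and a hold at anneal_percentage * max_lr once the decay reaches that floor. The snippet below is a minimal usage sketch; the toy model, AdamW optimizer, and step counts are illustrative assumptions, relying only on the fact that the class subclasses torch.optim.lr_scheduler._LRScheduler and reads optimizer.param_groups.

import torch

model = torch.nn.Linear(8, 8)  # toy model for illustration
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-4)

lr_scheduler = WarmupAnnealDecayHold(
    optimizer,
    warmup_steps=100,
    max_steps=1_000,
    max_lr=4e-4,
    min_lr=4e-5,
    anneal_percentage=0.10,
)

for step in range(1_000):
    optimizer.step()       # normally preceded by a forward/backward pass
    lr_scheduler.step()
    if step in (0, 99, 549, 999):
        # Prints the warm-up, peak, mid-decay, and held learning rates.
        print(step, lr_scheduler.get_last_lr())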

__init__(optimizer, *, warmup_steps=None, max_steps=None, max_lr=None, min_lr=4e-05, anneal_percentage=0.1, last_epoch=-1)

Initializes the WarmupAnnealDecayHold learning rate scheduler.

Parameters:

Name               Type                     Description                                        Default
optimizer          MegatronOptimizerModule  Optimizer to apply the learning rate scheduler.    required
warmup_steps       int                      Number of steps for the linear warm-up.            None
max_steps          int                      Total number of training steps.                    None
max_lr             float                    Peak learning rate to be achieved after warm-up.   None
min_lr             float                    Minimum learning rate.                             4e-05
anneal_percentage  float                    Percentage of the max_lr to hold after decay.      0.1
last_epoch         int                      The index of the last epoch.                       -1
Source code in bionemo/llm/model/lr_scheduler.py
def __init__(
    self,
    optimizer: MegatronOptimizerModule,
    *,
    warmup_steps: Optional[int] = None,
    max_steps: Optional[int] = None,
    max_lr: Optional[float] = None,
    min_lr: float = 4e-5,
    anneal_percentage: float = 0.10,
    last_epoch: int = -1,
) -> None:
    """Initializes the WarmupAnnealDecayHold learning rate scheduler.

    Args:
        optimizer: Optimizer to apply the learning rate scheduler.
        warmup_steps (int): Number of steps for the linear warm-up.
        max_steps (int): Total number of training steps.
        max_lr (float): Peak learning rate to be achieved after warm-up.
        min_lr (float): Minimum learning rate.
        anneal_percentage (float): Percentage of the max_lr to hold after decay.
        last_epoch (int): The index of the last epoch.
    """
    self.warmup_steps = warmup_steps
    self.max_steps = max_steps
    self.max_lr = max_lr
    self.min_lr = min_lr
    self.anneal_percentage = anneal_percentage
    self.last_epoch = last_epoch

    for group in optimizer.param_groups:
        group.setdefault("initial_lr", max_lr)

    super(WarmupAnnealDecayHold, self).__init__(optimizer, last_epoch)

get_lr()

Get the learning rate at the current step.

Source code in bionemo/llm/model/lr_scheduler.py
def get_lr(self) -> List[float]:
    """Get the learning rate at the current step."""
    step_num = self.last_epoch
    if step_num < self.warmup_steps:
        lr = self.min_lr + (self.max_lr - self.min_lr) * step_num / self.warmup_steps
    else:
        decay_steps = self.max_steps - self.warmup_steps
        lr = self.max_lr * (1 - (step_num - self.warmup_steps) / decay_steps)
        lr = max(lr, self.max_lr * self.anneal_percentage)

    return [lr for _ in self.optimizer.param_groups]
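
Note that min_lr only sets the starting point of the warm-up ramp; the value held after the decay is anneal_percentage * max_lr, which happens to equal min_lr under the defaults (0.1 * 4e-4 = 4e-5). The snippet below reproduces the formula above with illustrative settings to show the three phases numerically.

# Stand-alone re-implementation of the formula above, with illustrative values.
warmup_steps, max_steps = 100, 1_000
max_lr, min_lr, anneal_percentage = 4e-4, 4e-5, 0.10

def lr_at(step_num: int) -> float:
    if step_num < warmup_steps:
        # Linear warm-up from min_lr to max_lr.
        return min_lr + (max_lr - min_lr) * step_num / warmup_steps
    # Linear decay from max_lr, floored at anneal_percentage * max_lr.
    decay_steps = max_steps - warmup_steps
    lr = max_lr * (1 - (step_num - warmup_steps) / decay_steps)
    return max(lr, max_lr * anneal_percentage)

for s in (0, 50, 100, 550, 950):
    print(s, f"{lr_at(s):.2e}")
# 0   -> 4.00e-05 (start of warm-up)
# 50  -> 2.20e-04 (halfway through warm-up)
# 100 -> 4.00e-04 (peak, end of warm-up)
# 550 -> 2.00e-04 (halfway through the decay)
# 950 -> 4.00e-05 (held at anneal_percentage * max_lr)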

WarmupAnnealDecayHoldScheduler

Bases: LRSchedulerModule

Warmup Policy Learning Rate Scheduler.

Source code in bionemo/llm/model/lr_scheduler.py
class WarmupAnnealDecayHoldScheduler(LRSchedulerModule):
    """Warmup Policy Learning Rate Scheduler."""

    def __init__(
        self,
        warmup_steps: int = 2000,
        max_steps: int = 500_000,
        max_lr: float = 4e-4,
        min_lr: float = 4e-5,
        anneal_percentage: float = 0.10,
        interval: str = "step",
        frequency: int = 1,
        monitor: str = "val_loss",
    ) -> None:
        """Initializes the WarmupAnnealDecayHoldScheduler."""
        super().__init__()
        self.warmup_steps = warmup_steps
        self.max_steps = max_steps
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.anneal_percentage = anneal_percentage
        self.interval = interval
        self.frequency = frequency
        self.monitor = monitor

    def scheduler(self, model: MegatronBioBertModel, optimizer: MegatronOptimizerModule) -> SchedulerOutput:
        """Returns the scheduler output."""
        lr_scheduler = WarmupAnnealDecayHold(
            optimizer,
            warmup_steps=self.warmup_steps,
            max_steps=self.max_steps,
            max_lr=self.max_lr,
            min_lr=self.min_lr,
            anneal_percentage=self.anneal_percentage,
        )
        return {
            "optimizer": optimizer,
            # REQUIRED: The scheduler instance
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                # `interval` is the unit of the scheduler's step size: 'epoch'
                # updates the scheduler on epoch end, whereas 'step' updates it
                # after an optimizer update.
                "interval": self.interval,
                # How many epochs/steps should pass between calls to
                # `scheduler.step()`. 1 corresponds to updating the learning
                # rate after every epoch/step.
                "frequency": self.frequency,
            },
            # Metric to monitor for schedulers like `ReduceLROnPlateau`
            "monitor": self.monitor,
        }
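
The sketch below shows how the module turns an optimizer into the Lightning-style configuration described by SchedulerOutput. The plain torch AdamW optimizer and the `model=None` placeholder are illustrative assumptions; in real BioNeMo training the method receives the Megatron model and optimizer module named in the signature.

import torch

optimizer = torch.optim.AdamW(torch.nn.Linear(8, 8).parameters(), lr=4e-4)

scheduler_module = WarmupAnnealDecayHoldScheduler(
    warmup_steps=2_000,
    max_steps=500_000,
    max_lr=4e-4,
)
output = scheduler_module.scheduler(model=None, optimizer=optimizer)

print(sorted(output.keys()))                               # ['lr_scheduler', 'monitor', 'optimizer']
print(type(output["lr_scheduler"]["scheduler"]).__name__)  # WarmupAnnealDecayHold
print(output["lr_scheduler"]["interval"], output["lr_scheduler"]["frequency"])  # step 1
print(output["monitor"])                                   # val_loss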

__init__(warmup_steps=2000, max_steps=500000, max_lr=0.0004, min_lr=4e-05, anneal_percentage=0.1, interval='step', frequency=1, monitor='val_loss')

Initializes the WarmupAnnealDecayHoldScheduler.

Source code in bionemo/llm/model/lr_scheduler.py
def __init__(
    self,
    warmup_steps: int = 2000,
    max_steps: int = 500_000,
    max_lr: float = 4e-4,
    min_lr: float = 4e-5,
    anneal_percentage: float = 0.10,
    interval: str = "step",
    frequency: int = 1,
    monitor: str = "val_loss",
) -> None:
    """Initializes the WarmupAnnealDecayHoldScheduler."""
    super().__init__()
    self.warmup_steps = warmup_steps
    self.max_steps = max_steps
    self.max_lr = max_lr
    self.min_lr = min_lr
    self.anneal_percentage = anneal_percentage
    self.interval = interval
    self.frequency = frequency
    self.monitor = monitor

scheduler(model, optimizer)

Returns the scheduler output.

Source code in bionemo/llm/model/lr_scheduler.py
def scheduler(self, model: MegatronBioBertModel, optimizer: MegatronOptimizerModule) -> SchedulerOutput:
    """Returns the scheduler output."""
    lr_scheduler = WarmupAnnealDecayHold(
        optimizer,
        warmup_steps=self.warmup_steps,
        max_steps=self.max_steps,
        max_lr=self.max_lr,
        min_lr=self.min_lr,
        anneal_percentage=self.anneal_percentage,
    )
    return {
        "optimizer": optimizer,
        # REQUIRED: The scheduler instance
        "lr_scheduler": {
            "scheduler": lr_scheduler,
            # `interval` is the unit of the scheduler's step size: 'epoch'
            # updates the scheduler on epoch end, whereas 'step' updates it
            # after an optimizer update.
            "interval": self.interval,
            # How many epochs/steps should pass between calls to
            # `scheduler.step()`. 1 corresponds to updating the learning
            # rate after every epoch/step.
            "frequency": self.frequency,
        },
        # Metric to monitor for schedulers like `ReduceLROnPlateau`
        "monitor": self.monitor,
    }