Skip to content

Commit a4e83d5

Browse files
committed
fix: correct help text for on_demand_checkpointing sync point count
1 parent 25752c1 commit a4e83d5

1 file changed

Lines changed: 5 additions & 5 deletions

File tree

src/instructlab/training/main_ds.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,11 +1207,11 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
1207 1207
default=False,
1208 1208
help=(
1209 1209
"Enable on-demand full-state checkpointing triggered by Unix signals. "
1210-
"When enabled, workers check for a trigger file in /dev/shm at five "
1211-
"synchronization points per step (before/after each minibatch forward "
1212-
"and backward pass, and before/after the optimizer step) and collectively "
1213-
"save a distributed checkpoint before exiting. Designed for OpenShift AI / "
1214-
"KubeFlow preemption handling."
1210+
"When enabled, workers check for a trigger file in /dev/shm at "
1211+
"multiple synchronization points per step (before/after each "
1212+
"minibatch forward and backward pass, and before/after the optimizer "
1213+
"step) and collectively save a distributed checkpoint before exiting. "
1214+
"Designed for OpenShift AI / KubeFlow preemption handling."
1215 1215
),
1216 1216
)
1217 1217
parser.add_argument(

0 commit comments

Comments
 (0)