
Commit 8ee323a

Merge branch 'main' into 'main'
Fix: Update OneLogger Instrumentation Points for Optimizer Init

See merge request ADLR/megatron-lm!3698

2 parents 472ecf1 + f7de536

2 files changed: +3 -3 lines

megatron/training/one_logger_utils.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 
 from .global_vars import get_one_logger, get_args
 
-_one_logger_utils_version = "1.1.0-mlm"
+_one_logger_utils_version = "1.2.0-mlm"
 
 
 def get_timestamp_in_ms():

megatron/training/training.py

Lines changed: 2 additions & 2 deletions
@@ -836,14 +836,12 @@ def pretrain(
 
     # Model, optimizer, and learning rate.
     timers('model-and-optimizer-setup', log_level=0).start(barrier=True)
-    app_metrics['app_build_optimizer_start_time'] = one_logger_utils.get_timestamp_in_ms()
     model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
         model_provider, model_type, checkpointing_context=checkpointing_context
     )
 
     timers('model-and-optimizer-setup').stop()
     print_datetime('after model, optimizer, and learning rate ' 'scheduler are built')
-    app_metrics['app_build_optimizer_finish_time'] = one_logger_utils.get_timestamp_in_ms()
     config = get_model_config(model[0])
 
     # Data stuff.
@@ -1234,6 +1232,7 @@ def setup_model_and_optimizer(
     model = get_model(model_provider_func, model_type)
     unwrapped_model = unwrap_model(model)
 
+    one_logger and one_logger.log_metrics({"app_build_optimzer_start_time": one_logger_utils.get_timestamp_in_ms()})
     kwargs = {}
     for f in dataclasses.fields(OptimizerConfig):
         if hasattr(args, f.name):
@@ -1252,6 +1251,7 @@ def setup_model_and_optimizer(
         default_skip_embedding_weight_decay=args.embedding_init_method_std is not None,
     )
     opt_param_scheduler = get_optimizer_param_scheduler(optimizer)
+    one_logger and one_logger.log_metrics({"app_build_optimzer_finish_time": one_logger_utils.get_timestamp_in_ms()})
 
     if args.moe_use_upcycling:
         torch.distributed.barrier()
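
In short, the commit moves the optimizer-build timing metrics out of pretrain() and places them directly around the optimizer construction inside setup_model_and_optimizer(), using OneLogger's guarded log_metrics() call. Below is a minimal sketch of that pattern, not the committed code: get_one_logger(), log_metrics(), and one_logger_utils.get_timestamp_in_ms() are the helpers visible in the diff, while build_optimizer is a hypothetical stand-in for the real optimizer-construction code.

# Minimal sketch (assumed usage, not the committed code) of the instrumentation
# pattern added above. get_one_logger() and one_logger_utils.get_timestamp_in_ms()
# are the Megatron-LM helpers shown in the diff; build_optimizer is a hypothetical
# stand-in for the optimizer-construction code inside setup_model_and_optimizer().
from megatron.training import one_logger_utils
from megatron.training.global_vars import get_one_logger


def timed_optimizer_build(build_optimizer):
    one_logger = get_one_logger()  # None when OneLogger is not enabled

    # `one_logger and ...` makes the log_metrics call a no-op when the logger is None.
    # The metric keys mirror the ones added in the diff above, spelling included.
    one_logger and one_logger.log_metrics(
        {"app_build_optimzer_start_time": one_logger_utils.get_timestamp_in_ms()}
    )
    optimizer = build_optimizer()
    one_logger and one_logger.log_metrics(
        {"app_build_optimzer_finish_time": one_logger_utils.get_timestamp_in_ms()}
    )
    return optimizer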
