
Commit d338252

Merge branch 'yueshen/support_llama4_hf_mlm_import' into 'main'

Support Llama4 HF checkpoint to MLM checkpoint

See merge request ADLR/megatron-lm!3731

2 parents: 10e3163 + 10493df

File tree: 7 files changed, +86 -2 lines
examples/post_training/modelopt/conf/arguments.sh

Lines changed: 2 additions & 2 deletions

@@ -50,7 +50,7 @@ fi

 if [ -z ${ETP} ]; then
     ETP=${TP}
-    printf "${MLM_WARNING} Variable ${PURPLE}TP${WHITE} not set! (default: ${ETP})\n"
+    printf "${MLM_WARNING} Variable ${PURPLE}ETP${WHITE} not set! (default: ${ETP})\n"
 fi

 if [ -z ${EP} ]; then

@@ -70,7 +70,7 @@ fi


 if [ -z ${LAUNCH_SCRIPT} ]; then
-    LAUNCH_SCRIPT="torchrun --nproc_per_node=$((TP * EP * PP * DP))"
+    LAUNCH_SCRIPT="torchrun --nproc_per_node=$((ETP * EP * PP * DP))"
 fi

 # Install TensorRT Model Optimizer if haven't.
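
The second hunk fixes the node-local world size for MoE runs: the launcher now multiplies in expert tensor parallelism (ETP) rather than plain TP. A minimal Python sketch of the same arithmetic (hypothetical helper, not part of the repo):

    # Mirrors the corrected LAUNCH_SCRIPT: the node-local process count is the
    # product of the parallelism factors.
    def nproc_per_node(etp: int, ep: int, pp: int, dp: int) -> int:
        return etp * ep * pp * dp

    # Example: ETP=1, EP=8, PP=1, DP=1 launches torchrun --nproc_per_node=8
    assert nproc_per_node(etp=1, ep=8, pp=1, dp=1) == 8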

examples/post_training/modelopt/conf/meta-llama/Llama-4-Maverick-17B-128E-Instruct.sh

Lines changed: 1 addition & 0 deletions
@@ -49,4 +49,5 @@ MODEL_ARGS=" \
     --rotary-interleaved \
     --no-rope-freq 4 \
     --export-moe-apply-probs-on-input \
+    --padded-vocab-size 202048 \
 "

examples/post_training/modelopt/conf/meta-llama/Llama-4-Scout-17B-16E-Instruct.sh

Lines changed: 1 addition & 0 deletions
@@ -50,4 +50,5 @@ MODEL_ARGS=" \
     --no-bias-swiglu-fusion \
     --export-qk-l2-norm \
     --export-moe-apply-probs-on-input \
+    --padded-vocab-size 202048 \
 "
New file: Qwen2.5-0.5B config

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+#!/bin/bash
+
+if [ -z ${HF_MODEL_CKPT} ]; then
+    HF_MODEL_CKPT=Qwen/Qwen2.5-0.5B
+    TOKENIZER_MODEL=Qwen/Qwen2.5-0.5B
+else
+    TOKENIZER_MODEL=${HF_MODEL_CKPT}
+fi
+
+MODEL_ARGS=" \
+    --save-interval 100000 \
+    --micro-batch-size 1 \
+    --bf16 \
+    --no-masked-softmax-fusion \
+    --disable-bias-linear \
+    --add-qkv-bias \
+    --position-embedding-type rope \
+    --no-rope-fusion \
+    --normalization RMSNorm \
+    --swiglu \
+    --num-layers 24 \
+    --hidden-size 896 \
+    --ffn-hidden-size 4864 \
+    --num-attention-heads 14 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --kv-channels 64 \
+    --seq-length 4096 \
+    --max-position-embeddings 32768 \
+    --tokenizer-type HuggingFaceTokenizer \
+    --padded-vocab-size 151936 \
+    --make-vocab-size-divisible-by 1 \
+    --use-mcore-models \
+    --rotary-percent 1.0 \
+    --rotary-base 1000000 \
+    --no-bias-swiglu-fusion \
+"
New file: Qwen2.5-7B-Instruct config

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+#!/bin/bash
+
+if [ -z ${HF_MODEL_CKPT} ]; then
+    HF_MODEL_CKPT=Qwen/Qwen2.5-7B-Instruct
+    TOKENIZER_MODEL=Qwen/Qwen2.5-7B-Instruct
+else
+    TOKENIZER_MODEL=${HF_MODEL_CKPT}
+fi
+
+MODEL_ARGS=" \
+    --save-interval 100000 \
+    --micro-batch-size 1 \
+    --bf16 \
+    --no-masked-softmax-fusion \
+    --disable-bias-linear \
+    --add-qkv-bias \
+    --position-embedding-type rope \
+    --no-rope-fusion \
+    --normalization RMSNorm \
+    --swiglu \
+    --num-layers 28 \
+    --hidden-size 3584 \
+    --ffn-hidden-size 18944 \
+    --num-attention-heads 28 \
+    --group-query-attention \
+    --num-query-groups 4 \
+    --kv-channels 128 \
+    --seq-length 4096 \
+    --max-position-embeddings 32768 \
+    --tokenizer-type HuggingFaceTokenizer \
+    --padded-vocab-size 152064 \
+    --make-vocab-size-divisible-by 1 \
+    --use-mcore-models \
+    --rotary-percent 1.0 \
+    --rotary-base 1000000 \
+    --no-bias-swiglu-fusion \
+    --untie-embeddings-and-output-weights \
+"

examples/post_training/modelopt/convert_model.py

Lines changed: 3 additions & 0 deletions
@@ -13,6 +13,7 @@

 from megatron.core import mpu
 from megatron.core.enums import ModelType
+from megatron.core.parallel_state import destroy_model_parallel
 from megatron.post_training.arguments import add_modelopt_args
 from megatron.post_training.checkpointing import load_modelopt_checkpoint
 from megatron.post_training.model_provider import model_provider

@@ -183,3 +184,5 @@ def check_arguments():
     torch.distributed.barrier()

     save_checkpoint(1, model, None, None, 0)
+
+    destroy_model_parallel()
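
The new call at the end presumably releases the tensor-, pipeline- and expert-parallel process groups once the converted checkpoint is written, so each rank exits cleanly. A minimal sketch of the pattern (the conversion body is elided; only the import shown in the diff is assumed):

    from megatron.core.parallel_state import destroy_model_parallel

    def convert_and_exit(model):
        # ... load the HF weights into `model` and save the MLM checkpoint ...
        # Tear down model-parallel process groups before the interpreter exits.
        destroy_model_parallel()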

megatron/training/arguments.py

Lines changed: 4 additions & 0 deletions
@@ -2451,6 +2451,10 @@ def _add_tokenizer_args(parser):
     group = parser.add_argument_group(title='tokenizer')
     group.add_argument('--vocab-size', type=int, default=None,
                        help='Size of vocab before EOD or padding.')
+    group.add_argument('--padded-vocab-size', type=int, default=None,
+                       help='Vocabulary size of the model (padded to be divisible by '
+                            'tensor model parallel size). If not provided, it will be '
+                            'automatically calculated from vocab-size.')
     group.add_argument('--vocab-file', type=str, default=None,
                        help='Path to the vocab file.')
     group.add_argument('--merge-file', type=str, default=None,
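
When --padded-vocab-size is omitted, the padded size is derived from --vocab-size so the embedding table splits evenly across tensor-parallel ranks. A rough sketch of that calculation, following the help text above rather than the library's exact code path:

    def padded_vocab_size(vocab_size: int, divisible_by: int, tp_size: int) -> int:
        # Round the raw vocab up to the next multiple of
        # make-vocab-size-divisible-by * tensor-model-parallel-size.
        multiple = divisible_by * tp_size
        return ((vocab_size + multiple - 1) // multiple) * multiple

    # The new configs instead pin the value explicitly (202048 for the Llama 4
    # configs, 151936 / 152064 for the Qwen2.5 configs) together with
    # --make-vocab-size-divisible-by 1, so no extra padding is introduced.
    assert padded_vocab_size(151936, 1, 1) == 151936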
