
Commit 2fbd646

Merge branch 'mla_down_proj_telinear' into 'main'

perf(MLA): MLA down proj switch back to TELinear

See merge request ADLR/megatron-lm!3576

2 parents 7af09be + 720c8b4

File tree

10 files changed: +213 −160 lines


megatron/core/extensions/transformer_engine_spec_provider.py

Lines changed: 5 additions & 0 deletions

@@ -8,6 +8,7 @@
     TEColumnParallelLinear,
     TEDotProductAttention,
     TELayerNormColumnParallelLinear,
+    TELinear,
     TENorm,
     TERowParallelGroupedLinear,
     TERowParallelLinear,
@@ -23,6 +24,10 @@
 class TESpecProvider(BackendSpecProvider):
     """A protocol for providing the submodules used in Spec building."""
 
+    def linear(self) -> type:
+        """Which linear module TE backend uses"""
+        return TELinear
+
     def column_parallel_linear(self) -> type:
         """Which column parallel linear module TE backend uses"""
         return TEColumnParallelLinear
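
For readers unfamiliar with the spec-provider layer: `linear()` is a new accessor alongside `column_parallel_linear()`, so spec-building code can ask the backend for a plain, non-sharded linear class. Below is a minimal, self-contained sketch of that pattern; the `Fake*` classes and the `Protocol` definition are illustrative stand-ins, not Megatron-LM's real types.

```python
# Minimal sketch of the spec-provider pattern this commit extends.
# The Fake* classes are illustrative stand-ins, not real Megatron-LM types.
from typing import Protocol


class BackendSpecProvider(Protocol):
    def linear(self) -> type: ...
    def column_parallel_linear(self) -> type: ...


class FakeTELinear: ...                # stand-in for TELinear (weight replicated)
class FakeTEColumnParallelLinear: ...  # stand-in for TEColumnParallelLinear (output-sharded)


class FakeTESpecProvider:
    """Stand-in for TESpecProvider: maps abstract submodule slots to classes."""

    def linear(self) -> type:
        # New hook added by this commit: the plain TE linear layer.
        return FakeTELinear

    def column_parallel_linear(self) -> type:
        return FakeTEColumnParallelLinear


backend = FakeTESpecProvider()
# The MLA down projections now request the plain linear class ...
assert backend.linear() is FakeTELinear
# ... while other projections keep using the column-parallel class.
assert backend.column_parallel_linear() is FakeTEColumnParallelLinear
```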

megatron/core/models/gpt/gpt_layer_specs.py

Lines changed: 2 additions & 2 deletions

@@ -141,9 +141,9 @@ def get_gpt_layer_with_transformer_engine_spec(
             params={"attn_mask_type": AttnMaskType.causal},
             submodules=MLASelfAttentionSubmodules(
                 linear_q_proj=backend.column_parallel_linear(),
-                linear_q_down_proj=backend.column_parallel_linear(),
+                linear_q_down_proj=backend.linear(),
                 linear_q_up_proj=linear_q_up_proj,
-                linear_kv_down_proj=backend.column_parallel_linear(),
+                linear_kv_down_proj=backend.linear(),
                 linear_kv_up_proj=linear_kv_up_proj,
                 core_attention=backend.core_attention(),
                 linear_proj=backend.row_parallel_linear(),
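
In effect, with the TE backend the two down-projection slots now resolve to `TELinear` instead of `TEColumnParallelLinear`. A reduced, self-contained sketch of that before/after follows; the dataclass below carries only the two fields touched here, and the classes are stand-ins rather than the real `MLASelfAttentionSubmodules` and TE layers.

```python
# Reduced sketch of the two spec fields changed above; stand-in classes only.
from dataclasses import dataclass


class TELinearStandIn: ...                # stands in for TELinear
class TEColumnParallelLinearStandIn: ...  # stands in for TEColumnParallelLinear


@dataclass
class MLADownProjSlots:
    linear_q_down_proj: type
    linear_kv_down_proj: type


# Before this commit, both slots came from backend.column_parallel_linear():
before = MLADownProjSlots(TEColumnParallelLinearStandIn, TEColumnParallelLinearStandIn)
# After this commit, both slots come from the new backend.linear() hook:
after = MLADownProjSlots(TELinearStandIn, TELinearStandIn)
```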

megatron/core/transformer/multi_latent_attention.py

Lines changed: 42 additions & 3 deletions

@@ -15,6 +15,7 @@
     apply_rotary_pos_emb,
 )
 from megatron.core.process_groups_config import ModelCommProcessGroups
+from megatron.core.tensor_parallel.layers import ColumnParallelLinear
 from megatron.core.tensor_parallel.mappings import (
     gather_from_sequence_parallel_region,
     gather_from_tensor_model_parallel_region,
@@ -36,6 +37,16 @@
     fused_apply_mla_rope_for_q = None
 
 
+try:
+    from megatron.core.extensions.transformer_engine import TEColumnParallelLinear, TELinear
+    from megatron.core.post_training.modelopt.layers import Linear
+
+    HAVE_TE = True
+except ImportError:
+    TEColumnParallelLinear, TELinear, Linear = None, None, None
+    HAVE_TE = False
+
+
 @dataclass
 class MLASelfAttentionSubmodules:
     """Submodules for the MLA self-attention layer."""
@@ -282,6 +293,17 @@ def __init__(
             )
 
         else:
+            q_down_proj_kwargs = {}
+            if submodules.linear_q_down_proj in [TELinear]:
+                q_down_proj_kwargs['parallel_mode'] = 'duplicated'
+            elif submodules.linear_q_down_proj in [
+                Linear,
+                TEColumnParallelLinear,
+                ColumnParallelLinear,
+            ]:
+                q_down_proj_kwargs['gather_output'] = False
+            else:
+                raise ValueError(f"Unsupported linear_q_down_proj: {submodules.linear_q_down_proj}")
 
             self.linear_q_down_proj = build_module(
                 submodules.linear_q_down_proj,
@@ -291,9 +313,10 @@ def __init__(
                 init_method=self.config.init_method,
                 bias=False,
                 skip_bias_add=False,
-                gather_output=False,
                 is_expert=False,
                 tp_comm_buffer_name='q_down_proj',
+                skip_weight_param_allocation=False,
+                **q_down_proj_kwargs,
             )
 
             self.linear_q_up_proj = build_module(
@@ -309,6 +332,18 @@ def __init__(
                 tp_comm_buffer_name='q_up_proj',
             )
 
+            kv_down_proj_kwargs = {}
+            if submodules.linear_kv_down_proj in [TELinear]:
+                kv_down_proj_kwargs['parallel_mode'] = 'duplicated'
+            elif submodules.linear_kv_down_proj in [
+                Linear,
+                TEColumnParallelLinear,
+                ColumnParallelLinear,
+            ]:
+                kv_down_proj_kwargs['gather_output'] = False
+            else:
+                raise ValueError(f"Unsupported linear_kv_down_proj: {submodules.linear_kv_down_proj}")
+
             self.linear_kv_down_proj = build_module(
                 submodules.linear_kv_down_proj,
                 self.config.hidden_size,
@@ -317,9 +352,10 @@ def __init__(
                 init_method=self.config.init_method,
                 bias=False,
                 skip_bias_add=False,
-                gather_output=False,
                 is_expert=False,
                 tp_comm_buffer_name='kv_down_proj',
+                skip_weight_param_allocation=False,
+                **kv_down_proj_kwargs,
             )
 
             self.linear_kv_up_proj = build_module(
@@ -453,7 +489,10 @@ def get_query_key_value_tensors(
         kv_compressed, k_pos_emb = torch.split(
             kv_combined, [self.config.kv_lora_rank, self.config.qk_pos_emb_head_dim], dim=-1
         )
-        if parallel_state.get_tensor_model_parallel_world_size() > 1:
+        if (
+            parallel_state.get_tensor_model_parallel_world_size() > 1
+            and self.config.sequence_parallel
+        ):
             # k_pos_emb: [s, b, qk_pos_emb_head_dim]
             k_pos_emb = gather_from_sequence_parallel_region(k_pos_emb)

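The per-class keyword dispatch added above is the core of the change: `TELinear` is built with `parallel_mode='duplicated'` (the down-projection weight is replicated on every tensor-parallel rank), while the column-parallel variants keep `gather_output=False`. Below is a self-contained sketch of that dispatch, using stand-in classes; the real code also accepts the ModelOpt `Linear` class in the column-parallel branch.

```python
# Self-contained sketch of the kwargs dispatch used for the MLA down
# projections above. Stand-in classes replace the real TE/Megatron layers.
class TELinear: ...
class TEColumnParallelLinear: ...
class ColumnParallelLinear: ...


def down_proj_kwargs(linear_cls: type) -> dict:
    """Pick the extra build_module kwargs for an MLA down projection."""
    if linear_cls in [TELinear]:
        # Plain TE linear: keep a full (duplicated) copy of the weight per rank.
        return {'parallel_mode': 'duplicated'}
    elif linear_cls in [TEColumnParallelLinear, ColumnParallelLinear]:
        # Column-parallel layers shard the output dim; keep shards local.
        return {'gather_output': False}
    raise ValueError(f"Unsupported down projection class: {linear_cls}")


assert down_proj_kwargs(TELinear) == {'parallel_mode': 'duplicated'}
assert down_proj_kwargs(ColumnParallelLinear) == {'gather_output': False}
```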
megatron/training/arguments.py

Lines changed: 11 additions & 0 deletions

@@ -1135,6 +1135,13 @@ def core_transformer_config_from_args(args, config_class=None):
     else:
         kw_args['num_query_groups'] = None
     kw_args['config_logger_dir'] = args.config_logger_dir
+    if args.rope_type is None:
+        # Pop 'rope_type' to let the config class use the default value.
+        kw_args.pop('rope_type', None)
+    else:
+        assert (args.multi_latent_attention or args.rope_type == 'rope'), (
+            f'Common attention only support rope_type="rope", but got {args.rope_type}.'
+        )
 
     if len(args.cp_comm_type) == 1:
         kw_args['cp_comm_type'] = args.cp_comm_type[0]
@@ -1884,6 +1891,10 @@ def _add_training_args(parser):
                        help='Disable rope fusion, the fusion is available '
                        'only when using megatron-core.',
                        dest='apply_rope_fusion')
+    group.add_argument('--rope-type', type=str, default=None,
+                       choices=['rope', 'yarn'],
+                       help='Type of rope to use. Note that MLA takes yarn by default, '
+                       'and common attention takes rope by default.')
     group.add_argument('--cross-entropy-loss-fusion', action='store_true',
                        help='Enabled fusion of cross entropy loss calculation.',
                        dest='cross_entropy_loss_fusion')
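
Usage-wise, the new `--rope-type` flag stays unset by default (so the config class picks its own default: yarn for MLA, rope otherwise) and is validated against `--multi-latent-attention`. A standalone sketch of that flag and its check follows, using a throwaway `argparse` parser rather than Megatron-LM's real argument builder.

```python
# Standalone sketch of the new --rope-type flag and its validation.
# This is a throwaway parser, not Megatron-LM's _add_training_args().
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--rope-type', type=str, default=None,
                    choices=['rope', 'yarn'],
                    help='MLA defaults to yarn; common attention defaults to rope.')
parser.add_argument('--multi-latent-attention', action='store_true')

args = parser.parse_args(['--multi-latent-attention', '--rope-type', 'yarn'])

kw_args = {'rope_type': args.rope_type}
if args.rope_type is None:
    # Unset: drop the key so the config class falls back to its own default.
    kw_args.pop('rope_type', None)
else:
    # yarn is only valid together with multi-latent attention.
    assert args.multi_latent_attention or args.rope_type == 'rope', (
        f'Common attention only supports rope_type="rope", got {args.rope_type}.'
    )
```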
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.90224, "5": 10.9121, "10": 10.89882, "15": 10.90014, "20": 10.87364, "25": 10.86175, "30": 10.79053, "35": 10.76848, "40": 10.63331, "45": 10.54116, "50": 10.54543}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 22791116.0, "5": 22778568.0, "10": 22982992.0, "15": 22885372.0, "20": 22758452.0, "25": 22883116.0, "30": 22694696.0, "35": 22851780.0, "40": 22721894.0, "45": 22738960.0, "50": 22968936.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 384918016.0, "5": 384918016.0, "10": 384918016.0, "15": 384918016.0, "20": 384918016.0, "25": 384918016.0, "30": 384918016.0, "35": 384918016.0, "40": 384918016.0, "45": 384918016.0, "50": 384918016.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1123523072.0, "5": 1245691392.0, "10": 1245691392.0, "15": 1245691392.0, "20": 1245691392.0, "25": 1245691392.0, "30": 1245691392.0, "35": 1245691392.0, "40": 1245691392.0, "45": 1245691392.0, "50": 1245691392.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 9.95327, "5": 0.107, "10": 0.1037, "15": 0.10008, "20": 0.09966, "25": 0.09698, "30": 0.09982, "35": 0.09784, "40": 0.09998, "45": 0.09728, "50": 0.10112}}}
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92337, "5": 10.92089, "10": 10.92192, "15": 10.92351, "20": 10.90031, "25": 10.87827, "30": 10.81423, "35": 10.78865, "40": 10.65927, "45": 10.56875, "50": 10.55421}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 22791636.0, "5": 22778528.0, "10": 22983232.0, "15": 22886400.0, "20": 22758358.0, "25": 22883742.0, "30": 22695256.0, "35": 22851572.0, "40": 22721680.0, "45": 22738904.0, "50": 22968272.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 384918016.0, "5": 384918016.0, "10": 384918016.0, "15": 384918016.0, "20": 384918016.0, "25": 384918016.0, "30": 384918016.0, "35": 384918016.0, "40": 384918016.0, "45": 384918016.0, "50": 384918016.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1123523072.0, "5": 1245691392.0, "10": 1245691392.0, "15": 1245691392.0, "20": 1245691392.0, "25": 1245691392.0, "30": 1245691392.0, "35": 1245691392.0, "40": 1245691392.0, "45": 1245691392.0, "50": 1245691392.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 9.95327, "5": 0.107, "10": 0.1037, "15": 0.10008, "20": 0.09966, "25": 0.09698, "30": 0.09982, "35": 0.09784, "40": 0.09998, "45": 0.09728, "50": 0.10112}}}

tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed/golden_values_dev_dgx_h100.json

Lines changed: 33 additions & 33 deletions

@@ -4,35 +4,35 @@
     "end_step": 50,
     "step_interval": 5,
     "values": {
-      "1": 11.11942,
-      "5": 9.28112,
-      "10": 8.96767,
-      "15": 7.98924,
-      "20": 7.80173,
-      "25": 7.63303,
-      "30": 7.57275,
-      "35": 7.16214,
-      "40": 7.49418,
-      "45": 7.13663,
-      "50": 6.97223
+      "1": 11.04747,
+      "5": 9.52402,
+      "10": 9.0596,
+      "15": 8.04431,
+      "20": 7.90653,
+      "25": 7.67312,
+      "30": 7.64496,
+      "35": 7.21326,
+      "40": 7.54337,
+      "45": 7.18518,
+      "50": 7.03308
     }
   },
   "num-zeros": {
     "start_step": 1,
     "end_step": 50,
     "step_interval": 5,
     "values": {
-      "1": 38802576.0,
-      "5": 240322608.0,
-      "10": 627841536.0,
-      "15": 579569024.0,
-      "20": 658931008.0,
-      "25": 509733120.0,
-      "30": 445364352.0,
-      "35": 561495552.0,
-      "40": 311616704.0,
-      "45": 420418656.0,
-      "50": 199242224.0
+      "1": 38802572.0,
+      "5": 252883888.0,
+      "10": 731676032.0,
+      "15": 708564416.0,
+      "20": 989209984.0,
+      "25": 827443712.0,
+      "30": 756766080.0,
+      "35": 709348352.0,
+      "40": 588409600.0,
+      "45": 517940384.0,
+      "50": 409992032.0
     }
   },
   "mem-allocated-bytes": {
@@ -76,17 +76,17 @@
     "end_step": 50,
     "step_interval": 5,
     "values": {
-      "1": 11.04374,
-      "5": 9.86351,
-      "10": 9.02642,
-      "15": 7.99309,
-      "20": 7.4113,
-      "25": 7.73904,
-      "30": 7.48829,
-      "35": 7.54205,
-      "40": 7.94269,
-      "45": 7.3323,
-      "50": 6.83748
+      "1": 11.10137,
+      "5": 9.92778,
+      "10": 9.06984,
+      "15": 8.03354,
+      "20": 7.45652,
+      "25": 7.77087,
+      "30": 7.52221,
+      "35": 7.54715,
+      "40": 7.94738,
+      "45": 7.32562,
+      "50": 6.85517
     }
   },
   "iteration-time": {

tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json

Lines changed: 33 additions & 33 deletions

@@ -4,35 +4,35 @@
     "end_step": 50,
     "step_interval": 5,
     "values": {
-      "1": 11.12115,
-      "5": 9.48776,
-      "10": 9.06639,
-      "15": 8.04478,
-      "20": 7.81622,
-      "25": 7.64405,
-      "30": 7.59737,
-      "35": 7.17911,
-      "40": 7.50704,
-      "45": 7.15152,
-      "50": 6.9914
+      "1": 11.0475,
+      "5": 9.47263,
+      "10": 8.90522,
+      "15": 7.94285,
+      "20": 7.7696,
+      "25": 7.60471,
+      "30": 7.56115,
+      "35": 7.14613,
+      "40": 7.47799,
+      "45": 7.11821,
+      "50": 6.96092
     }
   },
   "num-zeros": {
     "start_step": 1,
     "end_step": 50,
     "step_interval": 5,
     "values": {
-      "1": 38802568.0,
-      "5": 290825856.0,
-      "10": 731989952.0,
-      "15": 630001984.0,
-      "20": 677850432.0,
-      "25": 585432256.0,
-      "30": 750669888.0,
-      "35": 618214784.0,
-      "40": 531952480.0,
-      "45": 313537728.0,
-      "50": 394303040.0
+      "1": 38802664.0,
+      "5": 221567312.0,
+      "10": 735002624.0,
+      "15": 611135296.0,
+      "20": 590465536.0,
+      "25": 542043712.0,
+      "30": 429887840.0,
+      "35": 467241280.0,
+      "40": 380798464.0,
+      "45": 329247616.0,
+      "50": 284175040.0
     }
   },
   "mem-allocated-bytes": {
@@ -76,17 +76,17 @@
     "end_step": 50,
     "step_interval": 5,
     "values": {
-      "1": 11.03802,
-      "5": 9.88912,
-      "10": 9.02538,
-      "15": 8.00927,
-      "20": 7.41448,
-      "25": 7.73636,
-      "30": 7.48773,
-      "35": 7.5439,
-      "40": 7.93894,
-      "45": 7.32524,
-      "50": 6.83572
+      "1": 11.10067,
+      "5": 9.99203,
+      "10": 8.95639,
+      "15": 7.95116,
+      "20": 7.37498,
+      "25": 7.71218,
+      "30": 7.46442,
+      "35": 7.5167,
+      "40": 7.91951,
+      "45": 7.30491,
+      "50": 6.82535
     }
   },
   "iteration-time": {

tests/functional_tests/test_cases/moe/gpt3_mr_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph/golden_values_dev_dgx_h100.json

Lines changed: 22 additions & 22 deletions

@@ -4,35 +4,35 @@
     "end_step": 50,
     "step_interval": 5,
     "values": {
-      "1": 10.96592,
-      "5": 9.91128,
-      "10": 9.79015,
-      "15": 9.01941,
-      "20": 8.83745,
-      "25": 8.65462,
-      "30": 8.67364,
-      "35": 8.08232,
-      "40": 8.36432,
-      "45": 8.09963,
-      "50": 7.81534
+      "1": 10.94995,
+      "5": 9.9238,
+      "10": 9.85512,
+      "15": 9.01582,
+      "20": 8.83018,
+      "25": 8.62061,
+      "30": 8.65266,
+      "35": 8.06408,
+      "40": 8.34095,
+      "45": 8.08321,
+      "50": 7.79855
     }
   },
   "num-zeros": {
     "start_step": 1,
     "end_step": 50,
     "step_interval": 5,
     "values": {
-      "1": 19403588.0,
-      "5": 106158376.0,
-      "10": 167686976.0,
-      "15": 197075792.0,
-      "20": 198976032.0,
-      "25": 270640864.0,
-      "30": 224341904.0,
-      "35": 246167936.0,
-      "40": 174723744.0,
-      "45": 125710632.0,
-      "50": 156288624.0
+      "1": 19403880.0,
+      "5": 142219280.0,
+      "10": 125211168.0,
+      "15": 264681776.0,
+      "20": 217875968.0,
+      "25": 236053968.0,
+      "30": 266788816.0,
+      "35": 243068672.0,
+      "40": 166847584.0,
+      "45": 159913152.0,
+      "50": 165693360.0
     }
   },
   "mem-allocated-bytes": {
