Commit 2a97a16

Merge branch 'mblaz/mlp-glu-oom-fix' into 'main'

Fix DCP OOM during SwiGLU load

See merge request ADLR/megatron-lm!3642

2 parents: 2fbd646 + fad83fc
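
For context (a sketch, not part of the commit): with a gated MLP, Megatron-Core stores the fused fc1 weight with the two GLU projections stacked along the first dimension; apply_swiglu_sharded_factory splits that weight into two halves for checkpointing, and its merge_fn concatenates them back during load. That torch.cat is the allocation this merge request guards against OOM. A rough illustration with made-up shapes:

import torch

# Illustrative shapes only; the real weight is (2 * ffn_hidden_size, hidden_size).
ffn_hidden, hidden = 8, 4
fc1_weight = torch.randn(2 * ffn_hidden, hidden)

# Roughly what the factory's build_fn exposes: the two halves as separate shards.
w, v = torch.chunk(fc1_weight, 2, dim=0)

# Roughly what merge_fn does on load: concatenate the halves back into one tensor.
# The cat needs a fresh buffer as large as both halves combined, which is what
# can exceed GPU memory during checkpoint load.
merged = torch.cat([w, v])
assert torch.equal(merged, fc1_weight)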

File tree

3 files changed (+71, -9 lines)

megatron/core/dist_checkpointing/strategies/torch.py

Lines changed: 2 additions & 1 deletion
@@ -374,7 +374,8 @@ def _unwrap_pyt_sharded_tensor(sh_ten: TorchShardedTensor) -> List[torch.Tensor]
             ten = ten.view(-1)
         else:
             for _ in range(mcore_sh_ten.prepend_axis_num):
-                ten = ten.squeeze(0)
+                assert ten.size(0) == 1
+                ten = ten[0]  # NOTE: ten.squeeze(0) uses more memory for FP8 tensors
         ret_tensors.append(ten)
     return ret_tensors
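
The NOTE above concerns FP8 tensor subclasses, where squeeze() is reported to allocate extra memory; for plain dense tensors, indexing with [0] and squeeze(0) return the same view. A minimal sketch (plain torch, hypothetical shapes) of the prepend-axis unwrapping the loop performs:

import torch

# Simulate a local shard saved with two prepended singleton axes
# (what prepend_axis_num describes for the mcore sharded tensor).
prepend_axis_num = 2
ten = torch.arange(12, dtype=torch.float32).reshape(1, 1, 3, 4)

for _ in range(prepend_axis_num):
    assert ten.size(0) == 1  # the guard added by this commit
    ten = ten[0]             # same view as ten.squeeze(0) for dense tensors

assert ten.shape == (3, 4)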

megatron/core/transformer/mlp.py

Lines changed: 16 additions & 2 deletions
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-
+import gc
+import logging
 import warnings
 from dataclasses import dataclass
 from typing import Optional, Union
@@ -34,6 +35,9 @@
 HAVE_TE = False


+logger = logging.getLogger(__name__)
+
+
 # pylint: disable=missing-class-docstring
 @dataclass
 class MLPSubmodules:
@@ -311,7 +315,17 @@ def sh_ten_build_fn(

     def sh_ten_merge_fn(sub_state_dict):
         with torch.no_grad():
-            return torch.cat(sub_state_dict)
+            try:
+                return torch.cat(sub_state_dict)
+            except (RuntimeError, torch.cuda.OutOfMemoryError) as e:
+                logger.warning(
+                    f"CUDA OutOfMemoryError encountered during tensors merging."
+                    f" Switching to CPU merge. (Error: {e})"
+                )
+                merged_sub_state_dict = torch.cat([t.cpu() for t in sub_state_dict])
+                gc.collect()
+                torch.cuda.empty_cache()
+                return merged_sub_state_dict

     return ShardedTensorFactory(
         original_sh_ten.key,
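
The hunk above boils down to a "concatenate on GPU, fall back to CPU on OOM" pattern. A self-contained sketch of the same idea, with a hypothetical helper name (cat_with_cpu_fallback is not part of the Megatron API); the RuntimeError in the except tuple also covers PyTorch builds where a CUDA OOM surfaces as a plain RuntimeError:

import gc
import logging
from typing import List

import torch

logger = logging.getLogger(__name__)


def cat_with_cpu_fallback(tensors: List[torch.Tensor]) -> torch.Tensor:
    """Concatenate tensors on their current device; retry on CPU if CUDA runs out of memory."""
    try:
        return torch.cat(tensors)
    except (RuntimeError, torch.cuda.OutOfMemoryError) as e:
        logger.warning("torch.cat ran out of GPU memory, retrying on CPU (%s)", e)
        merged = torch.cat([t.cpu() for t in tensors])
        # Release cached CUDA blocks left over from the failed GPU allocation attempt.
        gc.collect()
        torch.cuda.empty_cache()
        return merged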

tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py

Lines changed: 53 additions & 6 deletions
@@ -1,4 +1,6 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+import inspect
+import logging

 import pytest
 import torch
@@ -13,7 +15,7 @@
 )
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
-from megatron.core.transformer.mlp import MLP
+from megatron.core.transformer.mlp import MLP, apply_swiglu_sharded_factory
 from megatron.core.transformer.transformer_config import TransformerConfig
 from tests.unit_tests.dist_checkpointing import TempNamedDir
 from tests.unit_tests.test_utilities import Utils
@@ -61,11 +63,10 @@ def test_parallel_reconfiguration_e2e(self, tmp_path_dist_ckpt, src_tp_pp, dest_
         """Test module saving and loading with different TP/PP"""
         Utils.initialize_model_parallel(*src_tp_pp)

-        with TempNamedDir(
-            tmp_path_dist_ckpt / 'test_mlp_glu_reconfiguration_model_A'
-        ) as ckpt_dir_A, TempNamedDir(
-            tmp_path_dist_ckpt / 'test_mlp_glu_reconfiguration_model_B'
-        ) as ckpt_dir_B:
+        with (
+            TempNamedDir(tmp_path_dist_ckpt / 'test_mlp_glu_reconfiguration_model_A') as ckpt_dir_A,
+            TempNamedDir(tmp_path_dist_ckpt / 'test_mlp_glu_reconfiguration_model_B') as ckpt_dir_B,
+        ):
             # Save checkpoint A
             mlp_A = initialize_mlp()
             save(mlp_A.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_A)
@@ -87,3 +88,49 @@ def test_parallel_reconfiguration_e2e(self, tmp_path_dist_ckpt, src_tp_pp, dest_
         state_dict_B = load_plain_tensors(ckpt_dir_B)
         diffs = diff(state_dict_A, state_dict_B)
         assert not any(map(bool, diffs)), diffs
+
+    def test_oom_is_handled(self, caplog):
+        Utils.initialize_model_parallel(Utils.world_size, 1)
+        dtype = torch.bfloat16
+
+        # Compute free memory in bytes
+        device = torch.cuda.current_device()
+        allocated = torch.cuda.memory_allocated(device)
+        total = torch.cuda.get_device_properties(device).total_memory
+        free = total - allocated
+
+        # We should create two tensors which take up between 50% and 100% of free memory,
+        # so that the torch.cat tries to allocate twice as much and OOMs.
+        expected_local_num_bytes = free * 0.6
+
+        local_num_elems = expected_local_num_bytes // torch._utils._element_size(dtype)
+        local_num_elems = int(local_num_elems // 1024 * 1024)
+        assert local_num_elems % 1024 == 0
+
+        local_w_plus_v_shape = (local_num_elems // 512, 512)
+        local_w_or_v_shape = (local_num_elems // 1024, 512)
+
+        fc1_weight_sh_ten = ShardedTensor.from_rank_offsets(
+            'a',
+            torch.ones(local_w_plus_v_shape, device='cuda', dtype=dtype),
+            (0, Utils.rank, Utils.world_size),
+        )
+        fc1_factory = apply_swiglu_sharded_factory(fc1_weight_sh_ten, ())
+        sharded_state_dict = fc1_factory.build()
+        assert len(sharded_state_dict) == 2
+        assert sharded_state_dict[0].data.shape == local_w_or_v_shape
+        # NOTE: with singleton_local_shards=True this assert would fail - global shape is
+        # `(Utils.world_size * local_w_or_v_shape[0], local_w_or_v_shape[1])`
+        assert sharded_state_dict[0].global_shape[-2:] == (
+            Utils.world_size * local_w_plus_v_shape[0],
+            local_w_or_v_shape[1],
+        )
+
+        # Checkpoint load replaces ShardedTensors with tensors.
+        # Load happens in-place, so we can just use the same tensors.
+        loaded_state_dict = [sh_ten.data for sh_ten in sharded_state_dict]
+
+        # The critical part that should OOM:
+        with caplog.at_level(logging.WARNING):
+            fc1_factory.merge_fn(loaded_state_dict)
+        assert "CUDA OutOfMemoryError encountered during tensors merging" in caplog.text
