
Commit 918b247

Make room for transformer changes requiring attn_mask fn
Summary: Latest changes in transformers 4.53 require that custom attention functions have a corresponding mask-generation function.
Test Plan: CI won't pass without these changes on top of 4.53.
Reviewers:
Subscribers:
Tasks:
Tags:
1 parent afafb97 commit 918b247
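For context, a minimal sketch of the pattern that transformers >= 4.53 expects and that this commit accommodates: a custom attention function registered through AttentionInterface should have a mask-generation function registered under the same key through AttentionMaskInterface. The key name and the fallback implementation below are illustrative only, not part of this commit.

# Sketch only: registration pattern assumed by transformers >= 4.53.
# "my_custom_sdpa" and its body are hypothetical placeholders.
import torch.nn.functional as F

from transformers.integrations.executorch import sdpa_mask_without_vmap
from transformers.masking_utils import AttentionMaskInterface
from transformers.modeling_utils import AttentionInterface


def my_custom_sdpa(module, query, key, value, attention_mask, **kwargs):
    # Hypothetical custom attention; here it simply defers to PyTorch SDPA.
    out = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask)
    return out, None


# Register the attention function and its mask function under the same name,
# then select it via the model config, e.g.:
#   model.config._attn_implementation = "my_custom_sdpa"
AttentionInterface.register("my_custom_sdpa", my_custom_sdpa)
AttentionMaskInterface.register("my_custom_sdpa", sdpa_mask_without_vmap)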

6 files changed (+42, -17 lines)

install_dev.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@

 def install_torch_nightly_deps():
     """Install torch related dependencies from pinned nightly"""
-    EXECUTORCH_NIGHTLY_VERSION = "dev20250620"
+    EXECUTORCH_NIGHTLY_VERSION = "dev20250625"
     TORCHAO_NIGHTLY_VERSION = "dev20250620"
     # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/install_requirements.py#L74
     TORCH_NIGHTLY_VERSION = "dev20250601"
@@ -34,7 +34,7 @@ def install_dep_from_source():
             "-m",
             "pip",
             "install",
-            "git+https://github.com/huggingface/transformers@51f94ea06d19a6308c61bbb4dc97c40aabd12bad#egg=transformers",  # v4.52.4
+            "git+https://github.com/huggingface/transformers@37367c7d9fd23401c26e79a2b26253ab2d1b7236#egg=transformers",
         ]
     )
     subprocess.check_call(

optimum/executorch/attentions/custom_kv_cache.py

Lines changed: 2 additions & 2 deletions
@@ -213,7 +213,7 @@ def __init__(
         for layer_idx in range(config.num_hidden_layers):
             # newer version of transfomer has is_sliding defined
             # for HybridCache
-            if self.is_sliding_list[layer_idx]:
+            if self.is_sliding[layer_idx]:
                 # This is a sliding window layer
                 layer_cache = CustomRingKVCache(
                     max_batch_size=self.max_batch_size,
@@ -388,7 +388,7 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
     for i in range(len(module.cache.kv_cache)):
         setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache)
         setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache)
-        if module.cache.is_sliding_list[i]:
+        if module.cache.is_sliding[i]:
             # Register cache_positions as buffer for sliding window layers
             # This prevents it from being traced as a constant
             module.register_buffer(

optimum/exporters/executorch/integrations.py

Lines changed: 22 additions & 7 deletions
@@ -38,13 +38,33 @@ class CausalLMExportableModule(torch.nn.Module):
     This module ensures that the exported model is compatible with ExecuTorch.
     """

-    def __init__(self, model, use_custom_kv_cache=False):
+    def __init__(self, model, use_custom_kv_cache=False, use_custom_sdpa=False):
         super().__init__()
         self.model = model
         self.config = model.config
         self.use_custom_kv_cache = use_custom_kv_cache
+        self.use_custom_sdpa = use_custom_sdpa
         self.metadata = save_config_to_constant_methods(model.config, model.generation_config)

+    def _register_attention_mask_for_4_53(self, exportable_module: torch.nn.Module):
+        if is_transformers_version(">=", "4.53.0.dev0"):
+            from transformers.integrations.executorch import sdpa_mask_without_vmap
+            from transformers.masking_utils import AttentionMaskInterface
+            from transformers.modeling_utils import AttentionInterface
+
+            _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module)
+            if self.use_custom_sdpa:
+                if self.use_custom_kv_cache:
+                    AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache)
+                    AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap)
+                    # Manually set the attention implementation to custom_sdpa_ring_kv_cache
+                    # This handles both regular sdpa and one for sliding window/local attention
+                    exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache"
+                else:
+                    # Manually set the attention implementation to custom_sdpa_ring_kv_cache
+                    # This handles both regular sdpa and one for sliding window/local attention
+                    exportable_module.model.model.config._attn_implementation = "custom_sdpa"
+
     def export(self, input_ids=None, cache_position=None) -> Dict[str, ExportedProgram]:
         example_input_ids = input_ids if input_ids is not None else torch.tensor([[1]], dtype=torch.long)
         example_cache_position = cache_position if cache_position is not None else torch.tensor([0], dtype=torch.long)
@@ -57,12 +77,7 @@ def export(self, input_ids=None, cache_position=None) -> Dict[str, ExportedProgr
             max_batch_size = 1
             max_cache_len = 4094
             exportable_module = TorchExportableModuleForDecoderOnlyLM(self.model, max_batch_size, max_cache_len)
-
-            from transformers.modeling_utils import AttentionInterface
-
-            _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module)
-            AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache)
-            exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache"
+            self._register_attention_mask_for_4_53(exportable_module)
             if self.use_custom_kv_cache:
                 from optimum.executorch.attentions.custom_kv_cache import (
                     replace_with_et_custom_kv_cache,

optimum/exporters/executorch/recipes/xnnpack.py

Lines changed: 4 additions & 1 deletion
@@ -97,7 +97,10 @@ def _lower_to_executorch(

     exported_progs = model.export()

-    if model.config._attn_implementation == "custom_sdpa":
+    if (
+        model.config._attn_implementation == "custom_sdpa"
+        or model.config._attn_implementation == "custom_sdpa_ring_kv_cache"
+    ):
         # Sanity check to make sure the exported program contains the custom sdpa operator.
         if not any(
             node.op == "call_function" and "custom_sdpa" in str(node.target)
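As a reading aid, a minimal sketch of the sanity check this recipe now applies to both custom SDPA variants; the helper name is hypothetical, and the node test mirrors the diff above.

# Sketch: confirm an exported program contains a custom_sdpa call before lowering.
def contains_custom_sdpa(exported_program) -> bool:
    return any(
        node.op == "call_function" and "custom_sdpa" in str(node.target)
        for node in exported_program.graph_module.graph.nodes
    )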

optimum/exporters/executorch/tasks/causal_lm.py

Lines changed: 2 additions & 1 deletion
@@ -58,6 +58,7 @@ def load_causal_lm_model(model_name_or_path: str, **kwargs) -> CausalLMExportabl
     use_custom_kv_cache = kwargs.get("use_custom_kv_cache", False)
     attn_implementation = kwargs.get("attn_implementation", "custom_sdpa" if use_custom_sdpa else "sdpa")
     cache_implementation = kwargs.get("cache_implementation", "static")
+    use_custom_sdpa = use_custom_sdpa or attn_implementation == "custom_sdpa"
     max_length = kwargs.get("max_length", 2048)
     config = kwargs.get("config", None)

@@ -126,4 +127,4 @@ def load_causal_lm_model(model_name_or_path: str, **kwargs) -> CausalLMExportabl
 
     unwrap_tensor_subclass(eager_model)
 
-    return CausalLMExportableModule(eager_model, use_custom_kv_cache)
+    return CausalLMExportableModule(eager_model, use_custom_kv_cache, use_custom_sdpa)
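A minimal usage sketch of how either kwarg now reaches CausalLMExportableModule; the invocation and model id are hypothetical placeholders, not taken from this commit.

# Sketch: both flags flow into CausalLMExportableModule after this change.
module = load_causal_lm_model(
    "some-org/some-causal-lm",  # placeholder model id
    use_custom_sdpa=True,       # passing attn_implementation="custom_sdpa" also enables this
    use_custom_kv_cache=True,
)
exported_programs = module.export()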

tests/models/test_modeling_gemma3.py

Lines changed: 10 additions & 4 deletions
@@ -112,6 +112,10 @@ def test_gemma3_text_generation(self):
     @slow
     @pytest.mark.run_slow
     @pytest.mark.skipif(is_linux_ci, reason="OOM on linux runner")
+    @pytest.mark.skipif(
+        parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"),
+        reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0",
+    )
     def test_gemma3_text_generation_with_custom_sdpa(self):
         # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI
         # model_id = "google/gemma-3-1b-it"
@@ -181,8 +185,8 @@ def test_gemma3_text_generation_with_custom_sdpa_float16(self):
     @slow
     @pytest.mark.run_slow
     @pytest.mark.skipif(
-        parse(torchao.__version__) < parse("0.11.0.dev0"),
-        reason="Only available on torchao >= 0.11.0.dev0",
+        parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"),
+        reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0",
     )
     def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self):
         # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI
@@ -217,9 +221,11 @@ def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self):
 
         self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
 
+    @slow
+    @pytest.mark.run_slow
     @pytest.mark.skipif(
-        parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"),
-        reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0",
+        parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"),
+        reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0",
     )
     def test_gemma3_text_generation_with_custom_sdpa_kv_cache_8da4w_8we(self):
         # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI
