@@ -38,13 +38,33 @@ class CausalLMExportableModule(torch.nn.Module):
     This module ensures that the exported model is compatible with ExecuTorch.
     """
 
-    def __init__(self, model, use_custom_kv_cache=False):
+    def __init__(self, model, use_custom_kv_cache=False, use_custom_sdpa=False):
         super().__init__()
         self.model = model
         self.config = model.config
         self.use_custom_kv_cache = use_custom_kv_cache
+        self.use_custom_sdpa = use_custom_sdpa
         self.metadata = save_config_to_constant_methods(model.config, model.generation_config)
 
+    def _register_attention_mask_for_4_53(self, exportable_module: torch.nn.Module):
+        if is_transformers_version(">=", "4.53.0.dev0"):
+            from transformers.integrations.executorch import sdpa_mask_without_vmap
+            from transformers.masking_utils import AttentionMaskInterface
+            from transformers.modeling_utils import AttentionInterface
+
+            _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module)
+            if self.use_custom_sdpa:
+                if self.use_custom_kv_cache:
+                    AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache)
+                    AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap)
+                    # Manually set the attention implementation to custom_sdpa_ring_kv_cache.
+                    # This handles both regular sdpa and the variant for sliding-window/local attention.
+                    exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache"
+                else:
+                    # Manually set the attention implementation to custom_sdpa
+                    # (custom SDPA kernel without the ring KV cache).
+                    exportable_module.model.model.config._attn_implementation = "custom_sdpa"
+
     def export(self, input_ids=None, cache_position=None) -> Dict[str, ExportedProgram]:
         example_input_ids = input_ids if input_ids is not None else torch.tensor([[1]], dtype=torch.long)
         example_cache_position = cache_position if cache_position is not None else torch.tensor([0], dtype=torch.long)
@@ -57,12 +77,7 @@ def export(self, input_ids=None, cache_position=None) -> Dict[str, ExportedProgram]:
         max_batch_size = 1
         max_cache_len = 4094
         exportable_module = TorchExportableModuleForDecoderOnlyLM(self.model, max_batch_size, max_cache_len)
-
-        from transformers.modeling_utils import AttentionInterface
-
-        _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module)
-        AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache)
-        exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache"
+        self._register_attention_mask_for_4_53(exportable_module)
         if self.use_custom_kv_cache:
             from optimum.executorch.attentions.custom_kv_cache import (
                 replace_with_et_custom_kv_cache,
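
For reference, a minimal usage sketch of the new flag as it appears in this diff; the checkpoint name below is only an illustrative assumption, while the constructor arguments and export() return type follow the signatures shown above.

# Hypothetical usage sketch, not part of the diff itself.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")  # assumed example checkpoint
exportable = CausalLMExportableModule(model, use_custom_kv_cache=True, use_custom_sdpa=True)
programs = exportable.export()  # returns a Dict[str, ExportedProgram]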