
Commit fe38f14

Make room for transformers changes requiring attn_mask fn
Summary: Latest changes in transformers 4.53 require that custom attention functions have corresponding mask-generation functions.

Test Plan: CI won't pass without these changes on top of 4.53.

Reviewers:
Subscribers:
Tasks:
Tags:
1 parent e5193e1 commit fe38f14

File tree

4 files changed: +12 -6 lines changed

  optimum/executorch/attentions/custom_kv_cache.py
  optimum/exporters/executorch/integrations.py
  optimum/exporters/executorch/recipes/xnnpack.py
  tests/models/test_modeling_gemma3.py
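
For context on what this commit makes room for: starting with transformers 4.53, a custom attention function registered through AttentionInterface should have a mask-generation function registered under the same name through AttentionMaskInterface, so that mask creation is dispatched by the same implementation key. Below is a minimal sketch of that pairing; the kernel body and the key "my_custom_sdpa" are illustrative placeholders, not this repo's code.

    import torch
    from transformers import AttentionInterface
    from transformers.integrations.executorch import sdpa_mask_without_vmap
    from transformers.masking_utils import AttentionMaskInterface

    def my_custom_sdpa(module, query, key, value, attention_mask, scaling=None, dropout=0.0, **kwargs):
        # Illustrative kernel only: defer to plain SDPA with the pre-built mask
        # (GQA head repetition omitted for brevity).
        out = torch.nn.functional.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=dropout, scale=scaling
        )
        # transformers expects (batch, seq, heads, head_dim) back from attention functions.
        return out.transpose(1, 2).contiguous(), None

    # The attention kernel and its mask builder must share a key so that setting
    # config._attn_implementation = "my_custom_sdpa" resolves both.
    AttentionInterface.register("my_custom_sdpa", my_custom_sdpa)
    AttentionMaskInterface.register("my_custom_sdpa", sdpa_mask_without_vmap)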

optimum/executorch/attentions/custom_kv_cache.py

Lines changed: 2 additions & 2 deletions
@@ -213,7 +213,7 @@ def __init__(
         for layer_idx in range(config.num_hidden_layers):
             # newer version of transfomer has is_sliding defined
             # for HybridCache
-            if self.is_sliding_list[layer_idx]:
+            if self.is_sliding[layer_idx]:
                 # This is a sliding window layer
                 layer_cache = CustomRingKVCache(
                     max_batch_size=self.max_batch_size,
@@ -388,7 +388,7 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
         for i in range(len(module.cache.kv_cache)):
             setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache)
             setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache)
-            if module.cache.is_sliding_list[i]:
+            if module.cache.is_sliding[i]:
                 # Register cache_positions as buffer for sliding window layers
                 # This prevents it from being traced as a constant
                 module.register_buffer(
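
The rename above follows newer transformers, where HybridCache exposes the per-layer flags as is_sliding rather than is_sliding_list. A self-contained toy sketch, not this repo's CustomKVCache/CustomRingKVCache classes, of why the per-layer flag matters for cache sizing:

    import torch

    def build_layer_caches(num_layers, is_sliding, max_cache_len, sliding_window,
                           batch=1, heads=8, head_dim=64):
        """Sliding-window layers only ever hold `sliding_window` KV slots; global layers
        need the full `max_cache_len`. Shapes here are illustrative assumptions."""
        caches = []
        for layer_idx in range(num_layers):
            length = sliding_window if is_sliding[layer_idx] else max_cache_len
            k = torch.zeros(batch, heads, length, head_dim)
            v = torch.zeros(batch, heads, length, head_dim)
            caches.append((k, v))
        return caches

    # e.g. a Gemma-3-like layout that interleaves sliding and global attention layers
    is_sliding = [(i % 6) != 5 for i in range(12)]
    caches = build_layer_caches(12, is_sliding, max_cache_len=4096, sliding_window=512)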

optimum/exporters/executorch/integrations.py

Lines changed: 4 additions & 1 deletion
@@ -49,10 +49,12 @@ def export(self, input_ids=None, cache_position=None) -> Dict[str, ExportedProgr
         example_input_ids = input_ids if input_ids is not None else torch.tensor([[1]], dtype=torch.long)
         example_cache_position = cache_position if cache_position is not None else torch.tensor([0], dtype=torch.long)

-        if is_transformers_version(">=", "4.52.0.dev0"):
+        if is_transformers_version(">=", "4.53.0.dev0"):
             from transformers.integrations.executorch import (
                 TorchExportableModuleForDecoderOnlyLM,
+                sdpa_mask_without_vmap,
             )
+            from transformers.masking_utils import AttentionMaskInterface

             max_batch_size = 1
             max_cache_len = 4094
@@ -62,6 +64,7 @@ def export(self, input_ids=None, cache_position=None) -> Dict[str, ExportedProgr

             _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module)
             AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache)
+            AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap)
             exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache"
             if self.use_custom_kv_cache:
                 from optimum.executorch.attentions.custom_kv_cache import (
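
For reference, a minimal standalone driver for this export() path. It assumes the TorchExportableModuleForDecoderOnlyLM API as of transformers 4.53 and uses an arbitrary small decoder-only checkpoint; both are assumptions rather than anything taken from this diff beyond the placeholder inputs and max_cache_len.

    import torch
    from transformers import AutoModelForCausalLM
    from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM

    # Example checkpoint, not one used by this repo's tests.
    model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
    exportable_module = TorchExportableModuleForDecoderOnlyLM(model, max_batch_size=1, max_cache_len=4094)

    # Same placeholder inputs the diff falls back to when the caller passes none.
    exported = exportable_module.export(
        input_ids=torch.tensor([[1]], dtype=torch.long),
        cache_position=torch.tensor([0], dtype=torch.long),
    )
    print(type(exported))  # expected: a torch.export.ExportedProgram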

optimum/exporters/executorch/recipes/xnnpack.py

Lines changed: 4 additions & 1 deletion
@@ -97,7 +97,10 @@ def _lower_to_executorch(

         exported_progs = model.export()

-        if model.config._attn_implementation == "custom_sdpa":
+        if (
+            model.config._attn_implementation == "custom_sdpa"
+            or model.config._attn_implementation == "custom_sdpa_ring_kv_cache"
+        ):
             # Sanity check to make sure the exported program contains the custom sdpa operator.
             if not any(
                 node.op == "call_function" and "custom_sdpa" in str(node.target)
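
The widened condition keeps the existing sanity check in force for the ring KV cache variant as well. Roughly, the check amounts to the sketch below, built from the context lines of this hunk; exported_progs is the Dict[str, ExportedProgram] that model.export() returns.

    def uses_custom_sdpa(exported_program) -> bool:
        # Look for a call_function node whose target is the custom SDPA operator.
        return any(
            node.op == "call_function" and "custom_sdpa" in str(node.target)
            for node in exported_program.graph_module.graph.nodes
        )

    if model.config._attn_implementation in ("custom_sdpa", "custom_sdpa_ring_kv_cache"):
        for name, prog in exported_progs.items():
            assert uses_custom_sdpa(prog), f"{name} was not exported with the custom SDPA operator"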

tests/models/test_modeling_gemma3.py

Lines changed: 2 additions & 2 deletions
@@ -218,8 +218,8 @@ def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self):
         self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))

     @pytest.mark.skipif(
-        parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"),
-        reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0",
+        parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"),
+        reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0",
     )
     def test_gemma3_text_generation_with_custom_sdpa_kv_cache_8da4w_8we(self):
         # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI
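
The tightened gate reuses the usual packaging-based skip pattern. A minimal sketch of it as a reusable marker, assuming parse here is packaging.version.parse:

    import pytest
    import torchao
    import transformers
    from packaging.version import parse

    requires_new_stack = pytest.mark.skipif(
        parse(transformers.__version__) < parse("4.53.0.dev0") or parse(torchao.__version__) < parse("0.11.0"),
        reason="Only available on transformers >= 4.53.0.dev0 and torchao >= 0.11.0",
    )

    @requires_new_stack
    def test_something():
        ...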
