try:
    from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
        CustomKVCache,
+        CustomRingKVCache,
    )
except ImportError:
    raise ImportError("ExecutorTorch is not installed. Please install it to use CustomKVCache.")

+try:
+    from transformers.cache_utils import HybridCache
+except ImportError:
+    # If transformers is not installed, raise an ImportError
+    raise ImportError("transformers is not installed. Please install it to use HybridCache.")
+

class ETCustomStaticCache(StaticCache):
    """
@@ -55,7 +65,7 @@ def __init__(
        assert device is None or device == "cpu", "Device must be None or 'cpu'"

        # Create a list of CustomKVCache instances, one per layer
-        kv_cache_list = []
+        self.kv_cache = torch.nn.ModuleList()
        for _ in range(config.num_hidden_layers):
            layer_cache = CustomKVCache(
                max_batch_size=self.max_batch_size,
@@ -64,8 +74,7 @@ def __init__(
                head_dim=self.head_dim,
                dtype=dtype,
            )
-            kv_cache_list.append(layer_cache)
-        self.kv_cache = torch.nn.ModuleList(kv_cache_list)
+            self.kv_cache.append(layer_cache)

    def update(
        self,
@@ -180,6 +189,135 @@ def from_legacy_cache(
        )


+# Need to figure out if I have to inherit from HybridCache or StaticCache
+class ETCustomHybridCache(HybridCache):
+    """
+    Custom Hybrid KV Cache implementation for ExecutorTorch that inherits from Hugging Face's HybridCache
+    but uses ExecutorTorch's CustomKVCache for global layers and CustomRingKVCache for sliding window layers.
+    """
+
+    def __init__(
+        self,
+        config,
+        max_batch_size: int,
+        max_cache_len: Optional[int] = None,
+        device: Union[torch.device, str, None] = None,
+        dtype: torch.dtype = torch.float32,
+        layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
+    ):
+        super().__init__(
+            config=config,
+            max_batch_size=max_batch_size,
+            max_cache_len=max_cache_len,
+            device=device,
+            dtype=dtype,
+            layer_device_map=layer_device_map,
+        )
+
+        # Make sure layer_device_map is None
+        assert layer_device_map is None
+        assert device is None or device == "cpu", "Device must be None or 'cpu'"
+
+        self.cache_position = None
+        # Create a list of cache instances, one per layer
+        # Use CustomKVCache for global layers and CustomRingKVCache for sliding window layers
+        self.kv_cache = torch.nn.ModuleList()
+        for layer_idx in range(config.num_hidden_layers):
+            # Newer versions of transformers define is_sliding
+            # for HybridCache
+            if self.is_sliding_list[layer_idx]:
+                # This is a sliding window layer
+                layer_cache = CustomRingKVCache(
+                    max_batch_size=self.max_batch_size,
+                    max_context_length=self.sliding_window_len,
+                    n_heads=self.num_key_value_heads,
+                    head_dim=self.head_dim,
+                    dtype=dtype,
+                )
+            else:
+                layer_cache = CustomKVCache(
+                    max_batch_size=self.max_batch_size,
+                    max_context_length=self.max_cache_len,
+                    n_heads=self.num_key_value_heads,
+                    head_dim=self.head_dim,
+                    dtype=dtype,
+                )
+            self.kv_cache.append(layer_cache)
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`
+        using ExecutorTorch's CustomKVCache or CustomRingKVCache depending on the layer type.
+
+        Args:
+            key_states (`torch.Tensor`):
+                The new key states to cache. Shape: [batch_size, n_heads, seq_len, head_dim]
+            value_states (`torch.Tensor`):
+                The new value states to cache. Shape: [batch_size, n_heads, seq_len, head_dim]
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache update.
+
+        Returns:
+            A tuple containing the updated key and value states.
+        """
+        assert cache_kwargs is not None
+
+        # Get cache position from cache_kwargs (used by HybridCache)
+        cache_position = cache_kwargs.get("cache_position")
+        assert cache_position is not None
+        assert isinstance(cache_position, torch.Tensor)
+        self.cache_position = cache_position
+
+        # Get the cache instance for this layer (either CustomKVCache or CustomRingKVCache)
+        layer_cache = self.kv_cache[layer_idx]
+
+        # Use the cache's update method
+        # Both CustomKVCache and CustomRingKVCache have the same update interface
+        k_out, v_out = layer_cache.update(
+            input_pos=cache_position,
+            k_val=key_states,
+            v_val=value_states,
+        )
+
+        return k_out, v_out
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        if layer_idx is None:
+            layer_idx = 0
+
+        # For CustomRingKVCache, we need to handle the sequence length differently
+        layer_cache = self.kv_cache[layer_idx]
+        if self.is_sliding[layer_idx]:
+            # CustomRingKVCache has a cache_positions_manager which maintains the
+            # cache position for each slot in the kv cache. We return the max
+            # position + 1 to indicate the max position seen so far. Not sure if
+            # that's the correct interpretation of sequence length.
+            return layer_cache.cache_positions_manager.cache_positions.max().item() + 1
+        return (layer_cache.k_cache[0, :, 0].any(dim=-1)).sum()
+
+    def get_layer_cache(self, layer_idx: int):
+        """
+        Get the cache for a specific layer. This method is dynamo-traceable.
+
+        Args:
+            layer_idx (int): The layer index
+
+        Returns:
+            The cache instance for the specified layer (CustomKVCache or CustomRingKVCache)
+        """
+        return self.kv_cache[layer_idx]
+
+
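A minimal usage sketch of the class above, assuming a hybrid-attention (sliding + global) config such as Gemma 2's; the model id, sizes, and random tensors below are illustrative only and not part of this change:

# Hedged example: exercises the update()/get_seq_length() interface documented above.
import torch
from transformers import AutoConfig

config = AutoConfig.from_pretrained("google/gemma-2-2b")  # assumed hybrid (sliding + global) model
cache = ETCustomHybridCache(
    config=config,
    max_batch_size=1,
    max_cache_len=128,
    device="cpu",
    dtype=torch.float32,
)

# One decode step: key/value states are [batch_size, n_heads, seq_len, head_dim],
# and cache_kwargs must carry the cache_position tensor, as update() asserts.
cache_position = torch.tensor([0])
for layer_idx in range(config.num_hidden_layers):
    k = torch.randn(1, config.num_key_value_heads, 1, config.head_dim)
    v = torch.randn(1, config.num_key_value_heads, 1, config.head_dim)
    k_out, v_out = cache.update(k, v, layer_idx, cache_kwargs={"cache_position": cache_position})

print(cache.get_seq_length(0))  # tokens seen so far for layer 0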

def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype):
    """
    Replace all KV caches in the module with ETCustomStaticCache.
@@ -192,22 +330,6 @@ def replace_with_et_custom_kv_cache(module, config, generation_config, cache_dty
    Returns:
        The modified module
    """
-    # Ensure custom ops are registered
-    try:
-        op = torch.ops.llama.update_cache
-        assert op is not None
-    except Exception:
-        try:
-            from executorch.extension.llm.custom_ops import custom_ops  # noqa: F401
-
-            op = torch.ops.llama.update_cache
-            assert op is not None
-        except ImportError:
-            raise ImportError(
-                "ExecutorTorch custom operations are not available. "
-                "Please install executorch with custom operations support."
-            )
-
    # Recursively replace KV caches
    return _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dtype)

@@ -223,33 +345,73 @@ def _replace_with_et_custom_kv_cache(module, config, generation_config, cache_dt
    Returns:
        The modified module
    """
-    assert hasattr(module, "static_cache")
-    assert isinstance(
-        module.static_cache, StaticCache
-    ), "Only StaticCache transform is supported. Hybrid cache with local global attention is not yet supported"
-    # TODO: Add replace_cache to exported module
-    # in transformer's executorch.py
-    if getattr(module, "replace_cache", None) is not None:
-        static_cache = ETCustomStaticCache(
-            config=config,
-            max_batch_size=generation_config.cache_config.batch_size,
-            max_cache_len=generation_config.cache_config.max_cache_len,
-            device=generation_config.cache_config.device,
-            dtype=cache_dtype,
-        )
-        module.replace_cache(static_cache)
+    # Check if module has static_cache (TorchExportableModuleWithStaticCache)
+    if hasattr(module, "static_cache"):
+        assert isinstance(module.static_cache, StaticCache), f"Expected StaticCache, got {type(module.static_cache)}"
+
+        # TODO: Add replace_cache to exported module
+        # in transformer's executorch.py
+        if getattr(module, "replace_cache", None) is not None:
+            static_cache = ETCustomStaticCache(
+                config=config,
+                max_batch_size=generation_config.cache_config.batch_size,
+                max_cache_len=generation_config.cache_config.max_cache_len,
+                device=generation_config.cache_config.device,
+                dtype=cache_dtype,
+            )
+            module.replace_cache(static_cache)
+        else:
+            module.static_cache = ETCustomStaticCache(
+                config=config,
+                max_batch_size=generation_config.cache_config.batch_size,
+                max_cache_len=generation_config.cache_config.max_cache_len,
+                device=generation_config.cache_config.device,
+                dtype=cache_dtype,
+            )
+            # Don't know why we need to do this even though
+            # CustomKVCache registers the attributes
+            for i in range(len(module.static_cache.kv_cache)):
+                setattr(module, f"key_cache_{i}", module.static_cache.kv_cache[i].k_cache)
+                setattr(module, f"value_cache_{i}", module.static_cache.kv_cache[i].v_cache)
+
+    # Check if module has cache (TorchExportableModuleWithHybridCache)
+    elif hasattr(module, "cache"):
+        assert isinstance(module.cache, HybridCache), f"Expected HybridCache, got {type(module.cache)}"
+
+        # Replace with ETCustomHybridCache
+        if getattr(module, "replace_cache", None) is not None:
+            hybrid_cache = ETCustomHybridCache(
+                config=config,
+                max_batch_size=generation_config.cache_config.batch_size,
+                max_cache_len=generation_config.cache_config.max_cache_len,
+                device=generation_config.cache_config.device,
+                dtype=cache_dtype,
+            )
+            module.replace_cache(hybrid_cache)
+        else:
+            module.cache = ETCustomHybridCache(
+                config=config,
+                max_batch_size=generation_config.cache_config.batch_size,
+                max_cache_len=generation_config.cache_config.max_cache_len,
+                device=generation_config.cache_config.device,
+                dtype=cache_dtype,
+            )
+            # Register cache attributes for each layer
+            for i in range(len(module.cache.kv_cache)):
+                setattr(module, f"key_cache_{i}", module.cache.kv_cache[i].k_cache)
+                setattr(module, f"value_cache_{i}", module.cache.kv_cache[i].v_cache)
+                if module.cache.is_sliding_list[i]:
+                    # Register cache_positions as buffer for sliding window layers
+                    # This prevents it from being traced as a constant
+                    module.register_buffer(
+                        f"cache_positions_{i}",
+                        module.cache.kv_cache[i].cache_positions_manager.cache_positions,
+                        persistent=False,
+                    )
    else:
-        module.static_cache = ETCustomStaticCache(
-            config=config,
-            max_batch_size=generation_config.cache_config.batch_size,
-            max_cache_len=generation_config.cache_config.max_cache_len,
-            device=generation_config.cache_config.device,
-            dtype=cache_dtype,
+        raise ValueError(
+            "Module must have either 'static_cache' (TorchExportableModuleWithStaticCache) "
+            "or 'cache' (TorchExportableModuleWithHybridCache) attribute"
        )
-        # Dont know why we need to this even though
-        # CustomKVCache registers the attributes
-        for i in range(len(module.static_cache.kv_cache)):
-            setattr(module, f"key_cache_{i}", module.static_cache.kv_cache[i].k_cache)
-            setattr(module, f"value_cache_{i}", module.static_cache.kv_cache[i].v_cache)

    return module
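For orientation, a hedged sketch of how this entry point might be driven; `build_exportable_module` is a hypothetical placeholder for whatever constructs the TorchExportableModuleWithStaticCache / TorchExportableModuleWithHybridCache wrapper, and the `cache_config` fields simply mirror the attributes read above (`batch_size`, `max_cache_len`, `device`):

# Hedged example: assumes `module` is a wrapper exposing either a `static_cache` or a
# `cache` attribute, as checked by _replace_with_et_custom_kv_cache above.
import torch
from transformers import AutoModelForCausalLM, GenerationConfig

model = AutoModelForCausalLM.from_pretrained("my-model")  # placeholder model id
generation_config = GenerationConfig(
    use_cache=True,
    cache_implementation="static",
    # Assumed to be converted by GenerationConfig into an object with attribute access,
    # since the code above reads generation_config.cache_config.batch_size etc.
    cache_config={"batch_size": 1, "max_cache_len": 128, "device": "cpu"},
)

module = build_exportable_module(model, generation_config)  # hypothetical helper, not defined in this file
module = replace_with_et_custom_kv_cache(module, model.config, generation_config, cache_dtype=torch.float32)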