Commit 4b37326

Fix for Qwen3 embedding (#85)

Authored by Guang Yang (guangy10)
Co-authored-by: Guang Yang <[email protected]>
1 parent 4c3b18f commit 4b37326

File tree

4 files changed: +124 -4 lines changed

4 files changed

+124
-4
lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ We currently support a wide range of popular transformer models, including encod
 - [Gemma3](https://huggingface.co/google/gemma-3-1b-it): `Gemma-3-1b` and its variants *(requires `transformers >= 4.52.0`)*
 - [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B): `Llama-3.2-1B` and its variants
 - [Qwen2](https://huggingface.co/Qwen/Qwen2.5-0.5B): `Qwen2.5-0.5B` and its variants
-- [Qwen3](https://huggingface.co/Qwen/Qwen3-0.6B): `Qwen3-0.6B` and its variants
+- [Qwen3](https://huggingface.co/Qwen/Qwen3-0.6B): `Qwen3-0.6B`, `Qwen3-Embedding-0.6B` and other variants
 - [Olmo](https://huggingface.co/allenai/OLMo-1B-hf): `OLMo-1B-hf` and its variants
 - [Phi4](https://huggingface.co/microsoft/Phi-4-mini-instruct): `Phi-4-mini-instruct` and its variants
 - [Smollm](https://huggingface.co/HuggingFaceTB/SmolLM2-135M): 🤗 `SmolLM2-135M` and its variants
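
A minimal usage sketch for the newly listed checkpoint, mirroring the test added in this commit (the model ID, the `xnnpack` recipe, and the `text_generation` arguments are all taken from that test):

```python
from transformers import AutoTokenizer

from optimum.executorch import ExecuTorchModelForCausalLM

model_id = "Qwen/Qwen3-Embedding-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Export and load through the XNNPACK recipe, as the new test does.
model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="xnnpack")
print(model.text_generation(tokenizer=tokenizer, prompt="Explain gravity", max_seq_len=64))
```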

optimum/executorch/modeling.py

Lines changed: 3 additions & 2 deletions
@@ -41,6 +41,7 @@
 from ..exporters import TasksManager
 from ..exporters.executorch import main_export
+from ..exporters.executorch.utils import verify_eos_tokens_in_tokenizer
 from ..modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel
 from ..utils.file_utils import find_files_matching_pattern
 from .stats import Stats
@@ -736,9 +737,9 @@ def text_generation(
             raise ValueError(
                 f"The tokenizer's bos_token_id={self.tokenizer.bos_token_id} must be the same as the model's bos_token_id={self.bos_token_id}."
             )
-        if self.tokenizer.eos_token_id is not None and self.tokenizer.eos_token_id not in self.eos_token_ids:
+        if not verify_eos_tokens_in_tokenizer(self.eos_token_ids, self.tokenizer):
             raise ValueError(
-                f"The tokenizer's eos_token_id={self.tokenizer.eos_token_id} must match the model's eos_token_ids={self.eos_token_ids}."
+                f"The tokenizer's eos_token_id does not match the model's eos_token_ids={self.eos_token_ids}."
             )

         # Reset stats for a new generation
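
In toy form, the behavioral difference this change buys (all token IDs below are illustrative stand-ins, not values read from the real Qwen3-Embedding vocabulary):

```python
# Illustrative IDs only: suppose the exported PTE recorded an <|im_end|>-style
# EOS ID while the tokenizer reports a different eos_token_id, e.g. <|endoftext|>.
model_eos_ids = [151645]   # EOS IDs recorded in the PTE (the source of truth)
tokenizer_eos_id = 151643  # what tokenizer.eos_token_id reports

# Old check: raised whenever tokenizer.eos_token_id was set but absent from the list.
old_raises = tokenizer_eos_id is not None and tokenizer_eos_id not in model_eos_ids  # True

# New check: passes as long as any model EOS ID appears among the tokenizer's
# candidate end-of-sequence tokens (eos/pad/special tokens plus "end"-style added tokens).
candidate_eos_ids = {151643, 151645}
new_ok = any(eos_id in candidate_eos_ids for eos_id in model_eos_ids)  # True, no error
```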

optimum/exporters/executorch/utils.py

Lines changed: 47 additions & 1 deletion
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import List, Optional, Set

 import torch
 from transformers import GenerationConfig, PretrainedConfig
@@ -65,3 +65,49 @@ def save_config_to_constant_methods(

     # Combine with any additional kwargs and filter out None values
     return {k: v for k, v in {**metadata, **kwargs}.items() if v is not None}
+
+
+def verify_eos_tokens_in_tokenizer(model_eos_ids: List[int], tokenizer) -> bool:
+    """
+    Verifies that the model's EOS token IDs are present in the tokenizer's
+    set of potential end-of-sequence tokens.
+
+    Args:
+        model_eos_ids: A list of EOS token IDs recorded in the PTE file (the source of truth).
+        tokenizer: The Hugging Face tokenizer instance to check.
+
+    Returns:
+        True if at least one model EOS ID is found among the tokenizer's potential
+        EOS tokens, False otherwise.
+    """
+    if not model_eos_ids:
+        print("Warning: model_eos_ids list is empty. No verification can be performed.")
+        return True
+
+    candidate_eos_ids: Set[int] = set()
+
+    # 1. Check primary eos_token and pad_token attributes
+    if tokenizer.eos_token_id is not None:
+        candidate_eos_ids.add(tokenizer.eos_token_id)
+    if tokenizer.pad_token_id is not None:
+        candidate_eos_ids.add(tokenizer.pad_token_id)
+
+    # 2. Check all tokens listed in the special_tokens_map
+    for token_string in tokenizer.special_tokens_map.values():
+        if token_string:
+            # Use convert_tokens_to_ids for robustness
+            token_id = tokenizer.convert_tokens_to_ids(token_string)
+            if isinstance(token_id, int):
+                candidate_eos_ids.add(token_id)
+
+    # 3. Check added tokens for "end-of-X" patterns
+    for token_id, added_token in tokenizer.added_tokens_decoder.items():
+        token_str = added_token.content.lower()
+        # Heuristic to find tokens that signify an end
+        if "end" in token_str or token_str.startswith("</"):
+            candidate_eos_ids.add(token_id)
+
+    # The check: is any "true" ID present in the candidate set?
+    is_valid = any(model_id in candidate_eos_ids for model_id in model_eos_ids)
+
+    return is_valid
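
A hedged usage sketch of the new helper; the import path follows the call site added in modeling.py above, and the IDs passed in are illustrative stand-ins for what a PTE file would record:

```python
from transformers import AutoTokenizer

from optimum.exporters.executorch.utils import verify_eos_tokens_in_tokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B")
model_eos_ids = [151643, 151645]  # illustrative: EOS IDs a PTE file might record
# True as long as any recorded EOS ID shows up among the tokenizer's
# eos/pad/special tokens or its "end"-style added tokens.
print(verify_eos_tokens_in_tokenizer(model_eos_ids, tokenizer))
```

Including `pad_token_id` among the candidates covers tokenizers that alias padding to EOS, and the `added_tokens_decoder` heuristic can catch chat-style terminators such as `<|im_end|>` that may not appear in `special_tokens_map`.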
New test file

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import logging
+import os
+import unittest
+
+import pytest
+import torchao
+import transformers
+from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from packaging.version import parse
+from transformers import AutoTokenizer
+from transformers.testing_utils import slow
+
+from optimum.executorch import ExecuTorchModelForCausalLM
+
+from ..utils import check_causal_lm_output_quality
+
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+class ExecuTorchModelIntegrationTest(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skipif(
+        parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"),
+        reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0",
+    )
+    def test_qwen3_embedding_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
+        model_id = "Qwen/Qwen3-Embedding-0.6B"
+        prompt = "Explain gravity"
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            attn_implementation="custom_sdpa",
+            use_custom_kv_cache=True,
+            **{"qlinear": True, "qembedding": True},
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt=prompt,
+            max_seq_len=64,
+        )
+        logging.info(f"\nGenerated text:\n\t{generated_text}")
+        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
+
+        # Free memory before loading eager for quality check
+        del model
+        del tokenizer
+        gc.collect()
+
+        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
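
Since the test is gated by both `@slow` and `pytest.mark.run_slow`, running it locally should look something like `RUN_SLOW=1 pytest -m run_slow -k test_qwen3_embedding_text_generation` (the `RUN_SLOW` environment variable is what `transformers.testing_utils.slow` checks; the exact invocation is an assumption, not taken from this commit).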

0 commit comments
