Commit a2de3dc

Author: Guang Yang (committed)
Message: add support for embedding quantization
Parent: 4756ed7

File tree: 9 files changed (+152, -41 lines)


.github/workflows/test_models.yml

Lines changed: 3 additions & 2 deletions
@@ -50,14 +50,15 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies for ExecuTorch
+      # Consolidate torchao nightly version once https://github.com/pytorch/ao/issues/2157 is fixed
       run: |
         if [ "${{ matrix.executorch-version }}" == "nightly" ]; then
-          export NIGHTLY_VERSION=dev20250422
+          export NIGHTLY_VERSION=dev20250501
           pip install executorch==0.7.0.${NIGHTLY_VERSION} \
             torch==2.8.0.${NIGHTLY_VERSION} \
             torchvision==0.22.0.${NIGHTLY_VERSION} \
             torchaudio==2.6.0.${NIGHTLY_VERSION} \
-            torchao==0.11.0.${NIGHTLY_VERSION} \
+            torchao==0.11.0.dev20250422 \
             --extra-index-url "https://download.pytorch.org/whl/nightly/cpu"
         else
           pip install executorch==${{ matrix.executorch-version }}

README.md

Lines changed: 3 additions & 3 deletions
@@ -129,7 +129,7 @@ generated_text = model.text_generation(
 print(generated_text)
 ```

-## Supported Models and Backend
+## Supported Models

 **Optimum-ExecuTorch** currently supports the following transformer models:

@@ -174,9 +174,9 @@ We currently support a wide range of popular transformer models, including encod

 *📌 Note: This list is continuously expanding. As we continue to expand support, more models will be added.*

-**Supported Backend:**
+## Supported Optimizations

-Currently, **Optimum-ExecuTorch** supports only the [XNNPACK Backend](https://pytorch.org/executorch/main/backends-xnnpack.html) for efficient execution on mobile CPUs. We currently support Post-Training Quantization (PTQ) for linear layers using int8 dynamic per-token activations and int4 grouped per-channel weights (`8da4w`).
+Currently, **Optimum-ExecuTorch** supports the [XNNPACK Backend](https://pytorch.org/executorch/main/backends-xnnpack.html) with [custom SDPA](https://github.com/pytorch/executorch/blob/a4322c71c3a97e79e0454a8223db214b010f1193/extension/llm/README.md?plain=1#L40) for efficient execution on mobile CPUs. We currently support Post-Training Quantization (PTQ) for linear layers using int8 dynamic per-token activations and int4 grouped per-channel weights (`8da4w`), and int8 channelwise embedding quantization.

 For a comprehensive overview of all backends supported by ExecuTorch, please refer to the [ExecuTorch Backend Overview](https://pytorch.org/executorch/main/backends-overview.html).
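To make the new options concrete, the snippet below mirrors the updated tests in this commit and loads a model with both linear (`8da4w`) and embedding (int8 channelwise) quantization enabled. It is a sketch only; the model id and generation settings are illustrative.

from transformers import AutoTokenizer
from optimum.executorch import ExecuTorchModelForCausalLM

model_id = "HuggingFaceTB/SmolLM2-135M"  # example model, also used in the updated tests

# Export to ExecuTorch via the XNNPACK recipe with custom SDPA plus both quantization options
model = ExecuTorchModelForCausalLM.from_pretrained(
    model_id,
    recipe="xnnpack",
    attn_implementation="custom_sdpa",
    qlinear=True,      # int8 dynamic per-token activations + int4 grouped weights (8da4w)
    qembedding=True,   # int8 channelwise embedding quantization
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
print(model.text_generation(tokenizer=tokenizer, prompt="My favourite condiment is ", max_seq_len=64))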

optimum/commands/export/executorch.py

Lines changed: 13 additions & 6 deletions
@@ -58,11 +58,16 @@ def parse_args_executorch(parser):
         help="For decoder-only models to use custom sdpa with static kv cache to boost performance. Defaults to False.",
     )
     required_group.add_argument(
-        "-qmode",
-        "--quantization_mode",
+        "--qlinear",
         required=False,
-        choices=["8da4w"],
-        help="Quantization recipe to use. Defaults to None.",
+        action="store_true",
+        help="Quantization config for linear layers. If set, defaults to '8da4w' w/ groupsize 32.",
+    )
+    required_group.add_argument(
+        "--qembedding",
+        required=False,
+        action="store_true",
+        help="Quantization config for embedding. If set, defaults to int8 channelwise.",
     )


@@ -79,8 +84,10 @@ def run(self):
         kwargs = {}
         if self.args.use_custom_sdpa:
             kwargs["use_custom_sdpa"] = self.args.use_custom_sdpa
-        if self.args.quantization_mode:
-            kwargs["quantization_mode"] = self.args.quantization_mode
+        if self.args.qlinear:
+            kwargs["qlinear"] = self.args.qlinear
+        if self.args.qembedding:
+            kwargs["qembedding"] = self.args.qembedding

         main_export(
             model_name_or_path=self.args.model,
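With these arguments in place, a command-line export might look like the sketch below. The `optimum-cli export executorch` subcommand shape, task name, and output directory are assumptions based on the existing CLI; only `--use_custom_sdpa`, `--qlinear`, and `--qembedding` are taken from this file.

optimum-cli export executorch \
  --model HuggingFaceTB/SmolLM2-135M \
  --task text-generation \
  --recipe xnnpack \
  --use_custom_sdpa \
  --qlinear \
  --qembedding \
  --output_dir ./smollm2_8da4w_8we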

optimum/executorch/modeling.py

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@
 from transformers.utils import is_offline_mode

 from executorch.extension.pybindings.portable_lib import ExecuTorchModule, _load_for_executorch
+from executorch.kernels import quantized  # noqa

 from ..exporters import TasksManager
 from ..exporters.executorch import main_export

optimum/exporters/executorch/recipes/xnnpack.py

Lines changed: 9 additions & 4 deletions
@@ -15,9 +15,11 @@
 import logging
 from typing import Dict, Union

+from packaging.version import parse
 from tabulate import tabulate
 from torch.export import ExportedProgram

+from executorch import version as executorch_version
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
 from executorch.devtools.backend_debug import get_delegation_info
 from executorch.exir import (
@@ -62,6 +64,12 @@ def _lower_to_executorch(
     metadata=None,
 ) -> Dict[str, ExecutorchProgram]:
     et_progs = {}
+    backend_config_dict = {
+        "extract_delegate_segments": True,
+    }
+    if parse(executorch_version.__version__).base_version > "0.6.0":
+        backend_config_dict["do_quant_fusion_and_const_prop"] = True
+
     for pte_name, exported_program in exported_programs.items():
         et_progs[pte_name] = to_edge_transform_and_lower(
             exported_program,
@@ -71,10 +79,7 @@ def _lower_to_executorch(
             ),
             constant_methods=metadata,
         ).to_executorch(
-            config=ExecutorchBackendConfig(
-                do_quant_fusion_and_const_prop=True,
-                extract_delegate_segments=True,
-            ),
+            config=ExecutorchBackendConfig(**backend_config_dict),
         )
         logging.debug(f"\nExported program for {pte_name}.pte: {exported_program}")
         logging.debug(
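A quick sketch of what the version gate above evaluates to, using an illustrative version string (not taken from this commit): `base_version` drops any dev/pre-release suffix, and the comparison against "0.6.0" is a plain string comparison, which is sufficient for the 0.x versions involved here.

from packaging.version import parse

# Hypothetical nightly-style version string; base_version strips the ".dev..." suffix.
v = parse("0.7.0.dev20250501")
print(v.base_version)            # "0.7.0"
print(v.base_version > "0.6.0")  # True -> do_quant_fusion_and_const_prop gets enabled

# Note this is a lexicographic string comparison: it holds for "0.7.0" vs "0.6.0",
# but would not generalize to, e.g., "0.10.0" > "0.6.0" (which is False as strings).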

optimum/exporters/executorch/tasks/causal_lm.py

Lines changed: 36 additions & 13 deletions
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import logging
+
 import torch
 import torchao
 from packaging.version import parse
@@ -57,14 +59,12 @@ def load_causal_lm_model(model_name_or_path: str, **kwargs) -> CausalLMExportabl
     cache_implementation = kwargs.get("cache_implementation", "static")
     max_length = kwargs.get("max_length", 2048)
     config = kwargs.get("config", None)
-    quantization_mode = kwargs.get("quantization_mode", None)

     eager_model = AutoModelForCausalLM.from_pretrained(
         model_name_or_path,
         device_map=device,
         torch_dtype=dtype,
         config=config,
-        # quantization_config=quantization_config,
         attn_implementation=attn_implementation,
         generation_config=GenerationConfig(
             use_cache=True,
@@ -77,24 +77,47 @@ def load_causal_lm_model(model_name_or_path: str, **kwargs) -> CausalLMExportabl
         ),
     )

-    if quantization_mode == "8da4w":
+    # TODO: Move quantization recipe out for better composability.
+    # TODO: Should switch to `TorchAoConfig` once the quant issue on final lm_head layer is fixed.
+    qlinear_config = kwargs.get("qlinear", None)
+    qembedding_config = kwargs.get("qembedding", None)
+    if qlinear_config or qembedding_config:
+        # TODO: Update torchao to use 0.11.0 once released
         if parse(torchao.__version__) < parse("0.11.0.dev0"):
             raise RuntimeError("Quantization 8da4w requires torchao >= 0.11.0. Please upgrade torchao.")

-        from torchao.quantization.granularity import PerGroup
+        from torchao.quantization.granularity import PerAxis, PerGroup
         from torchao.quantization.quant_api import (
             Int8DynamicActivationIntxWeightConfig,
+            IntxWeightOnlyConfig,
+            quantize_,
         )
+        from torchao.utils import unwrap_tensor_subclass

-        # TODO: Should switch to TorchAoConfig once the quant issue on final lm_head layer is fixed.
-        linear_config = Int8DynamicActivationIntxWeightConfig(
-            weight_dtype=torch.int4,
-            weight_granularity=PerGroup(64),
-        )
+        if qembedding_config:
+            logging.info("Quantizing embedding layers.")
+            # TODO: Should switch to `AOPerModuleConfig` once fix for tied weights is available.
+            embedding_config = IntxWeightOnlyConfig(
+                weight_dtype=torch.int8,
+                granularity=PerAxis(0),
+            )
+            quantize_(
+                eager_model,
+                embedding_config,
+                lambda m, fqn: isinstance(m, torch.nn.Embedding),
+            )

-        torchao.quantize_(
-            eager_model,
-            linear_config,
-        )
+        if qlinear_config:
+            logging.info("Quantizing linear layers.")
+            linear_config = Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=torch.int4,
+                weight_granularity=PerGroup(32),
+            )
+            quantize_(
+                eager_model,
+                linear_config,
+            )
+
+        unwrap_tensor_subclass(eager_model)

     return CausalLMExportableModule(eager_model)
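The quantization recipe above can be exercised in isolation. The following is a minimal sketch using the same torchao APIs on a toy module (it assumes torchao >= 0.11.0.dev0, as required by the version check in this file; the toy dimensions are arbitrary).

import torch
import torch.nn as nn
from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import (
    Int8DynamicActivationIntxWeightConfig,
    IntxWeightOnlyConfig,
    quantize_,
)
from torchao.utils import unwrap_tensor_subclass

# Toy module with one embedding and one linear layer (illustrative only)
toy = nn.Sequential(nn.Embedding(1000, 64), nn.Linear(64, 64))

# int8 weight-only, per-channel (axis 0) quantization applied to embedding modules
quantize_(
    toy,
    IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)),
    lambda m, fqn: isinstance(m, torch.nn.Embedding),
)

# 8da4w: int8 dynamic per-token activations + int4 weights with group size 32,
# applied to linear layers (quantize_'s default filter targets nn.Linear)
quantize_(
    toy,
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4,
        weight_granularity=PerGroup(32),
    ),
)

# Flatten torchao tensor subclasses so the quantized model can go through torch.export
unwrap_tensor_subclass(toy)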

tests/models/test_modeling_gemma3.py

Lines changed: 41 additions & 2 deletions
@@ -181,10 +181,48 @@ def test_gemma3_text_generation_with_custom_sdpa_8da4w(self):
         # model_id = "google/gemma-3-1b-it"
         model_id = "unsloth/gemma-3-1b-it"
         prompt = "Write a poem about a machine learning."
+
+        # ExecuTorch model + custom sdpa + 8da4w linear quantization
+        kwargs = {"qlinear": True}
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            attn_implementation="custom_sdpa",
+            **kwargs,
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        kwargs = {"quantize": "8da4w"}
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt=prompt,
+            max_seq_len=64,
+        )
+        logging.info(f"\nGenerated text:\n\t{generated_text}")
+        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids

-        # ExecuTorch model + custom sdpa + float16
+        # Free memory before loading eager for quality check
+        del model
+        del tokenizer
+        gc.collect()
+
+        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skipif(
+        parse(torchao.__version__) < parse("0.11.0.dev0"),
+        reason="Only available on torchao >= 0.11.0.dev0",
+    )
+    def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self):
+        # TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI
+        # model_id = "google/gemma-3-1b-it"
+        model_id = "unsloth/gemma-3-1b-it"
+        prompt = "Write a poem about a machine learning."
+
+        # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization
+        kwargs = {"qlinear": True, "qembedding": True}
         model = ExecuTorchModelForCausalLM.from_pretrained(
             model_id,
             recipe="xnnpack",
@@ -194,6 +232,7 @@ def test_gemma3_text_generation_with_custom_sdpa_8da4w(self):
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

+        tokenizer = AutoTokenizer.from_pretrained(model_id)
         generated_text = model.text_generation(
             tokenizer=tokenizer,
             prompt=prompt,

tests/models/test_modeling_qwen3.py

Lines changed: 4 additions & 4 deletions
@@ -145,13 +145,13 @@ def test_qwen3_text_generation_with_custom_sdpa_float16(self):
         parse(torchao.__version__) < parse("0.11.0.dev0"),
         reason="Only available on torchao >= 0.11.0.dev0",
     )
-    def test_qwen3_text_generation_with_custom_sdpa_8da4w(self):
+    def test_qwen3_text_generation_with_custom_sdpa_8da4w_8we(self):
         model_id = "Qwen/Qwen3-0.6B"
         prompt = "Give me a short introduction to large language model."
         tokenizer = AutoTokenizer.from_pretrained(model_id)

-        # ExecuTorch model + custom sdpa
-        kwargs = {"quantize": "8da4w"}
+        # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization
+        kwargs = {"qlinear": True, "qembedding": True}
         model = ExecuTorchModelForCausalLM.from_pretrained(
             model_id,
             recipe="xnnpack",
@@ -163,7 +163,7 @@ def test_qwen3_text_generation_with_custom_sdpa_8da4w(self):
         generated_text = model.text_generation(
             tokenizer=tokenizer,
             prompt=prompt,
-            max_seq_len=64,
+            max_seq_len=128,
         )
         logging.info(f"\nGenerated text:\n\t{generated_text}")
         generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids

tests/models/test_modeling_smollm.py

Lines changed: 42 additions & 7 deletions
@@ -82,8 +82,6 @@ def test_smollm_text_generation(self):
     def test_smollm_text_generation_with_custom_sdpa(self):
         model_id = "HuggingFaceTB/SmolLM2-135M"
         prompt = "My favourite condiment is "
-        max_seq_len = 32
-        tokenizer = AutoTokenizer.from_pretrained(model_id)

         # ExecuTorch model + custom sdpa
         model = ExecuTorchModelForCausalLM.from_pretrained(
@@ -94,10 +92,11 @@ def test_smollm_text_generation_with_custom_sdpa(self):
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

+        tokenizer = AutoTokenizer.from_pretrained(model_id)
         generated_text = model.text_generation(
             tokenizer=tokenizer,
             prompt=prompt,
-            max_seq_len=max_seq_len,
+            max_seq_len=32,
         )
         logging.info(f"\nGenerated text:\n\t{generated_text}")
         generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
@@ -118,11 +117,46 @@ def test_smollm_text_generation_with_custom_sdpa(self):
     def test_smollm_text_generation_with_custom_sdpa_8da4w(self):
         model_id = "HuggingFaceTB/SmolLM2-135M"
         prompt = "My favourite condiment is "
-        max_seq_len = 32
+
+        # ExecuTorch model + custom sdpa + 8da4w linear quantization
+        kwargs = {"qlinear": True}
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            attn_implementation="custom_sdpa",
+            **kwargs,
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
         tokenizer = AutoTokenizer.from_pretrained(model_id)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt=prompt,
+            max_seq_len=64,
+        )
+        logging.info(f"\nGenerated text:\n\t{generated_text}")
+        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids

-        # ExecuTorch model + custom sdpa
-        kwargs = {"quantize": "8da4w"}
+        # Free memory before loading eager for quality check
+        del model
+        del tokenizer
+        gc.collect()
+
+        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skipif(
+        parse(torchao.__version__) < parse("0.11.0.dev0"),
+        reason="Only available on torchao >= 0.11.0.dev0",
+    )
+    def test_smollm_text_generation_with_custom_sdpa_8da4w_8we(self):
+        model_id = "HuggingFaceTB/SmolLM2-135M"
+        prompt = "My favourite condiment is "
+
+        # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization
+        kwargs = {"qlinear": True, "qembedding": True}
         model = ExecuTorchModelForCausalLM.from_pretrained(
             model_id,
             recipe="xnnpack",
@@ -132,10 +166,11 @@ def test_smollm_text_generation_with_custom_sdpa_8da4w(self):
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

+        tokenizer = AutoTokenizer.from_pretrained(model_id)
         generated_text = model.text_generation(
             tokenizer=tokenizer,
             prompt=prompt,
-            max_seq_len=max_seq_len,
+            max_seq_len=64,
         )
         logging.info(f"\nGenerated text:\n\t{generated_text}")
         generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
