Support force tokens to % of total experts during calibration #910
base: main
@@ -906,6 +906,7 @@ def quantize_main(
          model_type,
          QUANT_CFG_CHOICES,
          KV_QUANT_CFG_CHOICES,
+         args.moe_calib_experts_ratio,
      )

      # Exclude MTP layers from quantization if detected (e.g., GLM-4.7's layer 92)
@@ -1126,6 +1127,15 @@ def parse_args() -> argparse.Namespace:
              "(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
          ),
      )
+     parser.add_argument(
+         "--moe_calib_experts_ratio",
+         type=float,
+         default=1.0 / 4,
+         help=(
+             "Percentage of experts to calibrate during forward pass. Only used for MOE models. "
+             "This is used to reduce the number of experts to calibrate during forward pass. "
+         ),
+     )

      return parser.parse_args()
Comment on lines +1130 to +1138

Contributor

The default of `1.0 / 4` silently changes calibration behavior. Since the default is 0.25, every MoE calibration run uses ratio-based routing even when the flag is not passed. Consider defaulting to `None` so that the existing all-experts calibration remains the default:

      parser.add_argument(
          "--moe_calib_experts_ratio",
          type=float,
-         default=1.0 / 4,
+         default=None,
          help=(
-             "Percentage of experts to calibrate during forward pass. Only used for MOE models. "
-             "This is used to reduce the number of experts to calibrate during forward pass. "
+             "Ratio of experts to calibrate during forward pass (0, 1]. Only used for MOE models. "
+             "Default behavior routes to all experts if not specified. "
+             "Example: 0.25 calibrates 25%% of experts. "
          ),
      )
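As a complement to the suggestion above, the (0, 1] constraint could also be enforced at parse time rather than deep inside the forward pass. A minimal sketch, assuming an argparse setup like the one in this hunk; the validator name `ratio_in_unit_interval` is hypothetical and not part of the PR:

```python
import argparse


def ratio_in_unit_interval(value: str) -> float:
    """Hypothetical helper: parse a float and require it to lie in (0, 1]."""
    ratio = float(value)
    if not 0.0 < ratio <= 1.0:
        raise argparse.ArgumentTypeError(
            f"moe_calib_experts_ratio must be in (0, 1], got {ratio}"
        )
    return ratio


parser = argparse.ArgumentParser()
parser.add_argument(
    "--moe_calib_experts_ratio",
    type=ratio_in_unit_interval,
    default=None,  # None keeps the all-experts calibration path, as the reviewer suggests
    help="Ratio of experts to calibrate during forward pass (0, 1]. Only used for MoE models.",
)

print(parser.parse_args(["--moe_calib_experts_ratio", "0.25"]))
# Namespace(moe_calib_experts_ratio=0.25)
```

This rejects invalid values with a clear CLI error instead of asserting later during calibration.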
modelopt/torch/quantization/plugins/huggingface.py
@@ -458,8 +458,9 @@ def _setup(self):
        elif hasattr(self, "experts") and hasattr(self.experts, "num_experts"):
            num_experts = self.experts.num_experts

-       self.expert_token_count = torch.zeros(num_experts, dtype=torch.long, device="cpu")
+       self.expert_token_count = torch.zeros(num_experts, dtype=torch.long, device="cuda")
Contributor
Hardcoded `device="cuda"` assumes a CUDA device is available. Line 461 allocates `expert_token_count` directly on `"cuda"`, which breaks CPU-only runs and can place the counter on the wrong device when the module lives on another GPU.

Infer the device from the gate module's parameters instead:

Proposed fix

-       self.expert_token_count = torch.zeros(num_experts, dtype=torch.long, device="cuda")
+       device = next(self.gate.parameters()).device if hasattr(self, "gate") else "cuda"
+       self.expert_token_count = torch.zeros(num_experts, dtype=torch.long, device=device)

Alternatively, defer allocation to the first forward pass to avoid device placement assumptions.
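A minimal sketch of the lazy-allocation alternative mentioned above, assuming the counter can be created inside the gate forward hook where the routing logits' device is known. The class and method names are illustrative, not the PR's code:

```python
import torch


class _LazyExpertCounter:
    """Illustrative stand-in for the MoE wrapper; only the counting logic is sketched."""

    def __init__(self, num_experts: int, top_k: int):
        self.num_experts = num_experts
        self.top_k = top_k
        self.expert_token_count = None  # allocated lazily on the logits' device

    def gate_forward_hook(self, logits: torch.Tensor) -> None:
        if self.expert_token_count is None:
            # First call: allocate on whatever device the router logits live on.
            self.expert_token_count = torch.zeros(
                self.num_experts, dtype=torch.long, device=logits.device
            )
        _, indices = torch.topk(logits.float(), self.top_k, dim=-1)
        counts = torch.bincount(indices.reshape(-1), minlength=self.num_experts)
        self.expert_token_count += counts.to(self.expert_token_count.device)


counter = _LazyExpertCounter(num_experts=8, top_k=2)
counter.gate_forward_hook(torch.randn(4, 8))  # works on CPU; CUDA logits would work the same way
print(counter.expert_token_count)
```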
        self._count_expert_tokens = False
+       self._moe_calib_experts_ratio = None

        if num_experts == 0:
            warnings.warn(
@@ -483,36 +484,50 @@ def _gate_forward_hook(self, module, input, output):
        logits = output if not isinstance(output, tuple) else output[0]
        top_k = self.gate.top_k if hasattr(self.gate, "top_k") else self.top_k
        _, indices = torch.topk(logits.float(), top_k, dim=-1)
-       counts = torch.bincount(
-           indices.reshape(-1).cpu(), minlength=len(self.expert_token_count)
-       )
-       self.expert_token_count += counts
+       counts = torch.bincount(indices.reshape(-1), minlength=len(self.expert_token_count))
+       self.expert_token_count += counts.to(self.expert_token_count.device)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        is_calib = any(getattr(m, "_if_calib", False) for m in self.experts.modules())
-       if is_calib:
-           self._count_expert_tokens = is_calib
+       if is_calib and self._moe_calib_experts_ratio:
+           self._count_expert_tokens = True
+           assert 0 < self._moe_calib_experts_ratio <= 1, (
+               "moe_calib_experts_ratio must be between 0 and 1"
+           )
            # If any of the experts are in calibration mode, we will forward all tokens to all experts
            # This is used only for calibration, we need to re-calculate the actual outputs again using
            # the original top_k
            if TRANSFORMERS_VERSION_GE_5_0:
                assert hasattr(self, "gate") and hasattr(self.gate, "top_k")
                original_top_k = self.gate.top_k
-               self.gate.top_k = self.gate.num_experts
+               self.gate.top_k = round(self.gate.num_experts * self._moe_calib_experts_ratio)
+               assert self.gate.top_k >= original_top_k, (
+                   f"moe_calib_experts_ratio {self._moe_calib_experts_ratio},"
+                   f" calib top_k {self.gate.top_k} smaller than original"
+                   f" top_k {original_top_k}"
+               )
Comment on lines +504 to +509

Contributor

The assertion can fail for valid small ratios. If `round(self.gate.num_experts * self._moe_calib_experts_ratio)` comes out smaller than the original `top_k`, the assertion fires and calibration aborts even though the ratio itself is legal. The assertion message blames the ratio-based calib `top_k` for being smaller than the original `top_k`, but clamping to the original value is friendlier than crashing.

Proposed fix (transformers >= 5.0 path)

-               self.gate.top_k = round(self.gate.num_experts * self._moe_calib_experts_ratio)
-               assert self.gate.top_k >= original_top_k, (
-                   f"moe_calib_experts_ratio {self._moe_calib_experts_ratio},"
-                   f" calib top_k {self.gate.top_k} smaller than original"
-                   f" top_k {original_top_k}"
-               )
+               self.gate.top_k = max(
+                   round(self.gate.num_experts * self._moe_calib_experts_ratio),
+                   original_top_k,
+               )

The same applies to the transformers < 5.0 path at lines 516–525.
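A small standalone illustration of the failure mode and the clamp, using hypothetical numbers (8 experts, an original top_k of 2, a requested ratio of 0.1):

```python
# Hypothetical numbers: a small ratio pushes the calib top_k below the model's real top_k.
num_experts = 8
original_top_k = 2
ratio = 0.1  # user asks for 10% of experts

calib_top_k = round(num_experts * ratio)          # round(0.8) == 1
assert_would_fail = calib_top_k < original_top_k  # True: 1 < 2, so the PR's assert fires

# The reviewer's max() clamp falls back to the real top_k instead of crashing.
clamped_top_k = max(round(num_experts * ratio), original_top_k)  # 2

print(calib_top_k, assert_would_fail, clamped_top_k)  # 1 True 2
```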
                super().forward(hidden_states)
                self.gate.top_k = original_top_k
            else:
                # Path for transformers < 5.0
                original_top_k = self.top_k
                if hasattr(self, "num_experts"):
-                   self.top_k = self.num_experts
+                   self.top_k = round(self.num_experts * self._moe_calib_experts_ratio)
                elif hasattr(self, "experts"):
-                   self.top_k = self.experts.num_experts
+                   self.top_k = round(self.experts.num_experts * self._moe_calib_experts_ratio)
                else:
                    raise ValueError(f"Could not find num_experts in module {self}")
+               assert self.top_k >= original_top_k, (
+                   f"moe_calib_experts_ratio {self._moe_calib_experts_ratio},"
+                   f" calib top_k {self.top_k} smaller than original"
+                   f" top_k {original_top_k}"
+               )
                super().forward(hidden_states)
                self.top_k = original_top_k
-           # Enable counting only for the real-routing forward during calibration
-           self._count_expert_tokens = is_calib
+           self._count_expert_tokens = False
+       else:
+           self._count_expert_tokens = True
        output = super().forward(hidden_states)
        self._count_expert_tokens = False
        return output
Comment on lines 490 to 533

Contributor
Clarify whether all-experts calibration should remain the default during quantization. The class docstring promises "During calibration, we forward all tokens to all experts so that all experts see sufficient tokens to calibrate" (line 445), but this behavior only activates when `_moe_calib_experts_ratio` is explicitly set; with its default of `None`, calibration falls through to normal routing. Additionally, the else block at lines 529-530 enables token counting for both inference (`is_calib` is False) and calibration runs without a ratio, so counts accumulate outside of the intended case.

Either set a default ratio (e.g., 1.0 for all experts) when entering calibration mode, or update the docstring to clarify that expanded-expert forwarding requires explicit configuration.
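A minimal sketch of the first option (fall back to a ratio of 1.0 when calibration starts without an explicit setting), using a hypothetical helper rather than the PR's actual control flow:

```python
# Standalone sketch: default to all experts when calibrating without an explicit ratio.
def effective_calib_ratio(is_calib: bool, configured_ratio: float | None) -> float | None:
    """Hypothetical helper: decide what ratio the forward pass should use."""
    if not is_calib:
        return None  # normal routing, no expansion
    # Fall back to 1.0 (all experts) so calibration matches the docstring's promise.
    return configured_ratio if configured_ratio is not None else 1.0


print(effective_calib_ratio(False, None))  # None -> normal inference routing
print(effective_calib_ratio(True, None))   # 1.0  -> all experts during calibration
print(effective_calib_ratio(True, 0.25))   # 0.25 -> user-requested subset
```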
Crash when `algorithm` is `None`. The code will crash when `moe_calib_experts_ratio` is truthy (the CLI default is 0.25) and the quantization config has `"algorithm": None` (e.g., mxfp8, mxfp6, mxfp4, mxint8, w4a8_mxfp4_fp8). At line 243, the else branch attempts `None["moe_calib_experts_ratio"] = ...`, raising a `TypeError: 'NoneType' object is not subscriptable`. Any user running a None-algorithm format (e.g., `--qformat mxfp8`) with the CLI default will immediately hit this crash.

The assignment needs a guard for the None-algorithm case. Alternatively, only inject the ratio when the model is actually an MoE model, or change the CLI default to `None` and only inject when explicitly provided.
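One possible shape for such a guard, sketched against a config dict whose "algorithm" entry may be `None` (as the comment describes for mxfp8-style formats); the dict contents here are illustrative, not the repository's actual config handling:

```python
# Hypothetical config shapes for illustration only.
quant_cfg = {"quant_cfg": {"*weight_quantizer": {"num_bits": 8}}, "algorithm": None}
moe_calib_experts_ratio = 0.25  # e.g., the CLI value

# Only inject the ratio if there is an algorithm dict to attach it to;
# a None algorithm (mxfp8-style formats) is left untouched instead of crashing.
algorithm = quant_cfg.get("algorithm")
if moe_calib_experts_ratio is not None and isinstance(algorithm, dict):
    algorithm["moe_calib_experts_ratio"] = moe_calib_experts_ratio

print(quant_cfg)  # unchanged when algorithm is None
```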