diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 744238656..b7047213b 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,6 +8,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 
 - User does not need to manually register MOE modules to cover experts calibration coverage in PTQ workflow.
 - ``hf_ptq.py`` now saves the quantization summary and moe expert token count table to the export directory.
+- Add ``--moe_calib_experts_ratio`` flag to ``hf_ptq.py`` to specify the ratio of experts that calibration tokens are routed to during the forward pass, improving expert coverage during calibration. Defaults to 1/4 of all experts.
 - Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md `_ for usage.
 
 0.42 (2026-02-xx)
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index d8bff7ba2..ccc594612 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -201,6 +201,7 @@ def build_quant_cfg(
     model_type,
     quant_cfg_choices,
     kv_quant_cfg_choices,
+    moe_calib_experts_ratio,
 ) -> dict[str, Any]:
     quant_cfg = {}
     assert qformat in quant_cfg_choices, (
@@ -232,6 +233,15 @@ def build_quant_cfg(
             getattr(mtq, kv_quant_cfg_choices[kv_cache_qformat])["quant_cfg"],
         )
 
+    if moe_calib_experts_ratio:
+        if isinstance(quant_cfg["algorithm"], str):
+            quant_cfg["algorithm"] = {
+                "method": quant_cfg["algorithm"],
+                "moe_calib_experts_ratio": moe_calib_experts_ratio,
+            }
+        else:
+            quant_cfg["algorithm"]["moe_calib_experts_ratio"] = moe_calib_experts_ratio
+
     # Gemma 7B has accuracy regression using alpha 1. We set 0.5 instead.
     if model_type == "gemma" and "int8_sq" in qformat:
         quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": 0.5}
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index d7aadf994..7d3db93f3 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -906,6 +906,7 @@ def quantize_main(
             model_type,
             QUANT_CFG_CHOICES,
             KV_QUANT_CFG_CHOICES,
+            args.moe_calib_experts_ratio,
         )
 
         # Exclude MTP layers from quantization if detected (e.g., GLM-4.7's layer 92)
@@ -1126,6 +1127,15 @@ def parse_args() -> argparse.Namespace:
             "(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
         ),
     )
+    parser.add_argument(
+        "--moe_calib_experts_ratio",
+        type=float,
+        default=1.0 / 4,
+        help=(
+            "Ratio of experts (0 < ratio <= 1) that calibration tokens are routed to during the "
+            "forward pass. Only used for MoE models; reduces calibration cost vs. routing tokens to all experts."
+        ),
+    )
 
     return parser.parse_args()
diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py
index a5ba465b1..dc3574868 100644
--- a/modelopt/torch/export/moe_utils.py
+++ b/modelopt/torch/export/moe_utils.py
@@ -48,7 +48,7 @@ def save_expert_token_count_table(model: nn.Module, output_dir: str | Path | None
         "th, td { border: 1px solid #ccc; padding: 4px 8px; text-align: right; }",
         "th { background: #f0f0f0; }",
         "</style>",
-        "<table><caption>Expert Token Counts (per MoE layer)</caption>",
+        "<table><caption>Expert Calib Token Counts (per MoE layer)</caption>",
         "<tr><th>Layer/Expert</th>",
     ]
     html_parts.extend(f"<th>{i}</th>" for i in range(num_experts))
diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
index 5d95ffe5f..8f53b68ab 100644
--- a/modelopt/torch/quantization/config.py
+++ b/modelopt/torch/quantization/config.py
@@ -1091,6 +1091,16 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig):
         title="This field specifies the name of the calibration algorithm. If None, no calibration is performed.",
     )
 
+    moe_calib_experts_ratio: float | None = ModeloptField(
+        default=None,
+        title="Ratio of experts to route calibration tokens to during the forward pass.",
+        description=(
+            "If specified, tokens are forwarded to this fraction of the experts during the"
+            " calibration pass. This forward pass is used only for calibration and does not"
+            " affect actual inference."
+        ),
+    )
+
 
 class MaxCalibConfig(QuantizeAlgorithmConfig):
     """The config for max calibration algorithm.
diff --git a/modelopt/torch/quantization/mode.py b/modelopt/torch/quantization/mode.py
index 1f3346ea9..e0bd58b6b 100644
--- a/modelopt/torch/quantization/mode.py
+++ b/modelopt/torch/quantization/mode.py
@@ -225,6 +225,12 @@ def wrapped_calib_func(
         # For backward compatibility
        kwargs["algorithm"] = method
 
+    moe_calib_experts_ratio = kwargs.pop("moe_calib_experts_ratio", None)
+    if moe_calib_experts_ratio is not None:
+        for module in model.modules():
+            if hasattr(module, "_moe_calib_experts_ratio"):
+                module._moe_calib_experts_ratio = moe_calib_experts_ratio
+
     if func is not None:
         # Call the function with forward_loop as a separate argument
         func(model, forward_loop=forward_loop, **kwargs)
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index aa274ea7e..69f83c372 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -458,8 +458,11 @@ def _setup(self):
         elif hasattr(self, "experts") and hasattr(self.experts, "num_experts"):
             num_experts = self.experts.num_experts
 
-        self.expert_token_count = torch.zeros(num_experts, dtype=torch.long, device="cpu")
+        self.expert_token_count = torch.zeros(
+            num_experts, dtype=torch.long, device=next(self.parameters()).device
+        )
         self._count_expert_tokens = False
+        self._moe_calib_experts_ratio = None
 
         if num_experts == 0:
             warnings.warn(
@@ -483,36 +486,47 @@ def _gate_forward_hook(self, module, input, output):
         logits = output if not isinstance(output, tuple) else output[0]
         top_k = self.gate.top_k if hasattr(self.gate, "top_k") else self.top_k
         _, indices = torch.topk(logits.float(), top_k, dim=-1)
-        counts = torch.bincount(
-            indices.reshape(-1).cpu(), minlength=len(self.expert_token_count)
-        )
-        self.expert_token_count += counts
+        counts = torch.bincount(indices.reshape(-1), minlength=self.expert_token_count.shape[0])
+        self.expert_token_count += counts.to(self.expert_token_count.device)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         is_calib = any(getattr(m, "_if_calib", False) for m in self.experts.modules())
-        if is_calib:
+        self._count_expert_tokens = is_calib
+        if is_calib and self._moe_calib_experts_ratio:
+            assert 0 < self._moe_calib_experts_ratio <= 1, (
+                "moe_calib_experts_ratio must be between 0 and 1"
+            )
-            # If any of the experts are in calibration mode, we will forward all tokens to all experts
-            # This is used only for calibration, we need to re-calculate the actual outputs again using
-            # the original top_k
+            # If any expert is in calibration mode, widen the routing so tokens reach a larger
+            # fraction of the experts (given by moe_calib_experts_ratio). This forward is used
+            # only for calibration; the actual output is re-computed below with the original top_k.
             if TRANSFORMERS_VERSION_GE_5_0:
                 assert hasattr(self, "gate") and hasattr(self.gate, "top_k")
                 original_top_k = self.gate.top_k
-                self.gate.top_k = self.gate.num_experts
+                self.gate.top_k = max(
+                    original_top_k, round(self.gate.num_experts * self._moe_calib_experts_ratio)
+                )
                 super().forward(hidden_states)
                 self.gate.top_k = original_top_k
             else:
                 # Path for transformers < 5.0
                 original_top_k = self.top_k
                 if hasattr(self, "num_experts"):
-                    self.top_k = self.num_experts
+                    self.top_k = max(
+                        original_top_k, round(self.num_experts * self._moe_calib_experts_ratio)
+                    )
                 elif hasattr(self, "experts"):
-                    self.top_k = self.experts.num_experts
+                    self.top_k = max(
+                        original_top_k,
+                        round(self.experts.num_experts * self._moe_calib_experts_ratio),
+                    )
                 else:
                     raise ValueError(f"Could not find num_experts in module {self}")
                 super().forward(hidden_states)
                 self.top_k = original_top_k
-        # Enable counting only for the real-routing forward during calibration
-        self._count_expert_tokens = is_calib
+            # Tokens were already counted on the widened-routing forward above; do not double
+            # count them on the real-routing forward below.
+            self._count_expert_tokens = False
         output = super().forward(hidden_states)
         self._count_expert_tokens = False
         return output
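
Usage note: the sketch below mirrors the build_quant_cfg() change above and shows how
moe_calib_experts_ratio ends up inside quant_cfg["algorithm"] before mtq.quantize() runs.
The base config (FP8_DEFAULT_CFG) and the commented-out quantize call are illustrative
placeholders, not part of this diff.

# Illustrative sketch only; assumes FP8_DEFAULT_CFG as the base config.
import copy

import modelopt.torch.quantization as mtq

quant_cfg = copy.deepcopy(mtq.FP8_DEFAULT_CFG)
moe_calib_experts_ratio = 0.25  # value passed via --moe_calib_experts_ratio

if isinstance(quant_cfg["algorithm"], str):
    # e.g. "max" becomes {"method": "max", "moe_calib_experts_ratio": 0.25}
    quant_cfg["algorithm"] = {
        "method": quant_cfg["algorithm"],
        "moe_calib_experts_ratio": moe_calib_experts_ratio,
    }
else:
    quant_cfg["algorithm"]["moe_calib_experts_ratio"] = moe_calib_experts_ratio

# model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)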
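
The routing change in plugins/huggingface.py widens the calibration-only forward to
max(original_top_k, round(num_experts * ratio)) experts. A minimal sketch of that
arithmetic, with hypothetical expert counts:

# Minimal sketch of the calibration-time routing width; the expert counts below are
# hypothetical examples, not taken from any particular model.
def calib_top_k(num_experts: int, original_top_k: int, ratio: float) -> int:
    assert 0 < ratio <= 1, "moe_calib_experts_ratio must be between 0 and 1"
    return max(original_top_k, round(num_experts * ratio))

print(calib_top_k(num_experts=64, original_top_k=8, ratio=0.25))  # 16 experts during calibration
print(calib_top_k(num_experts=8, original_top_k=2, ratio=0.25))   # 2 (never below the real top_k)
print(calib_top_k(num_experts=64, original_top_k=8, ratio=1.0))   # 64 (previous behavior: all experts)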