From 530a255d490c1a322c97f0182e829346336c1436 Mon Sep 17 00:00:00 2001 From: Anirudh Swaminathan Date: Wed, 18 Feb 2026 15:15:45 -0800 Subject: [PATCH] =?UTF-8?q?Update=20Intel=C2=AE=20optimum=20passes=20to=20?= =?UTF-8?q?latest=20optimum?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update Intel® compression and optimum conversion passes to utilize latest optimum version APIs - [x] Add unit tests for this change. - [x] Make sure all tests can pass. - [x] Update documents if necessary. - [x] Lint and apply fixes to your code by running `lintrunner -a` - [x] Is this a user-facing change? If yes, give a description of this change to be included in the release notes. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../features/ihv-integration/openvino.md | 8 +- olive/olive_config.json | 6 +- olive/passes/openvino/compression.py | 702 ++++-------------- olive/passes/openvino/encapsulation.py | 173 +---- olive/passes/openvino/optimum_intel.py | 475 ++++++------ olive/passes/openvino/ov_utils.py | 370 +++++++++ olive/passes/openvino/quantization.py | 7 +- .../openvino/test_openvino_compression.py | 147 +++- .../test_openvino_optimum_conversion.py | 24 +- test/requirements-test.txt | 4 +- test/utils.py | 21 + 11 files changed, 977 insertions(+), 960 deletions(-) create mode 100644 olive/passes/openvino/ov_utils.py diff --git a/docs/source/features/ihv-integration/openvino.md b/docs/source/features/ihv-integration/openvino.md index 6dea33684e..373ea127ea 100644 --- a/docs/source/features/ihv-integration/openvino.md +++ b/docs/source/features/ihv-integration/openvino.md @@ -23,18 +23,18 @@ pip install olive-ai[openvino] ### Option 2: Install OpenVINO Runtime and OpenVINO Development Tools from Pypi ```bash -pip install openvino>=2025.3.0 -pip install nncf>=2.18.0 +pip install openvino>=2025.4.1 +pip install nncf>=2.19.0 pip install onnxruntime-openvino ``` ### Install Optimum Intel® for 
Generative AI Workloads ```bash -pip install optimum[openvino]<=1.24.0 +pip install optimum[openvino]>=2.1.0 ``` -More detailed instructions are available at [Optimum Intel® Installation Instructions](https://huggingface.co/docs/optimum/main/en/intel/installation) +More detailed instructions are available at [Optimum Installation Instructions](https://huggingface.co/docs/optimum/installation) and at [Optimum Intel® Installation Instructions](https://huggingface.co/docs/optimum/main/en/intel/installation) ## Model Conversion diff --git a/olive/olive_config.json b/olive/olive_config.json index 77debc1818..2748c39101 100644 --- a/olive/olive_config.json +++ b/olive/olive_config.json @@ -646,10 +646,10 @@ "diffusers": [ "accelerate>=0.30.0", "peft", "diffusers" ], "nvmo": [ "nvidia-modelopt[onnx]" ], "openvino": [ - "openvino>=2025.3.0", - "nncf>=2.18.0", + "openvino>=2025.4.1", + "nncf>=2.19.0", "numpy<2.0", - "optimum[openvino]<=1.24", + "optimum[openvino]>=2.1.0", "onnxruntime-openvino" ], "optimum": [ "optimum" ], diff --git a/olive/passes/openvino/compression.py b/olive/passes/openvino/compression.py index 689b11892a..93cfc40a9e 100644 --- a/olive/passes/openvino/compression.py +++ b/olive/passes/openvino/compression.py @@ -7,238 +7,26 @@ from copy import deepcopy from functools import partial from pathlib import Path -from typing import Callable, Optional, Union - -from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from typing import Any, Callable, Optional, Union from olive.common.config_utils import validate_config -from olive.common.utils import StrEnumBase from olive.data.config import DataConfig from olive.hardware.accelerator import AcceleratorSpec, Device from olive.model.handler import CompositeModelHandler, HfModelHandler, ONNXModelHandler, OpenVINOModelHandler from olive.passes import Pass +from olive.passes.openvino.ov_utils import ( + IgnoreScopeTypeEnum, + OVOptimumLibrary, + _convert_to_enum, + _validate_enum_value, + create_genai_config, 
+ infer_library_name, +) from olive.passes.pass_config import BasePassConfig, ParamCategory, PassConfigParam, get_user_script_data_config logger = logging.getLogger(__name__) -class IgnoreScopeTypeEnum(StrEnumBase): - NAMES = "names" - TYPES = "types" - PATTERNS = "patterns" - - -class OVOptimumLibrary(StrEnumBase): - TRANSFORMERS = "transformers" - DIFFUSERS = "diffusers" - TIMM = "timm" - SENTENCE_TRANSFORMERS = "sentence_transformers" - OPEN_CLIP = "open_clip" - - -def infer_task( - task, - model_name_or_path, - subfolder: str = "", - revision: Optional[str] = None, - cache_dir: str = HUGGINGFACE_HUB_CACHE, - token: Optional[Union[bool, str]] = None, - library_name: Optional[str] = None, -): - try: - from optimum.exporters import TasksManager - except Exception as e: - raise ImportError("Unable to import optimum packages:", e) from None - - try: - from requests.exceptions import ConnectionError as RequestsConnectionError - except Exception as e: - raise ImportError("Unable to import ConnectionError packages:", e) from None - - task = TasksManager.map_from_synonym(task) - if task == "auto": - if library_name == "open_clip": - task = "zero-shot-image-classification" - else: - try: - task = TasksManager._infer_task_from_model_name_or_path( # pylint: disable=W0212 - model_name_or_path=model_name_or_path, - subfolder=subfolder, - revision=revision, - cache_dir=cache_dir, - token=token, - library_name=library_name, - ) - except KeyError as e: - raise KeyError( - f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) from None - except RequestsConnectionError as e: - raise RequestsConnectionError( - f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. 
Detailed error: {e}" - ) from None - return task - - -def maybe_load_preprocessors( - src_name_or_path: Union[str, Path], subfolder: str = "", trust_remote_code: bool = False -) -> list: - try: - from transformers import AutoFeatureExtractor, AutoImageProcessor, AutoProcessor, AutoTokenizer - except Exception as e: - raise ImportError("Unable to import transformers packages: ", e) from None - - preprocessors = [] - try: - preprocessors.append( - AutoTokenizer.from_pretrained(src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code) - ) - except Exception as e: - logger.warning("Could not load tokenizer using specified model ID or path.\n Exception: %s", e) - - try: - preprocessors.append( - AutoProcessor.from_pretrained(src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code) - ) - except Exception as e: - logger.warning("Could not load processor using specified model ID or path.\n Exception: %s", e) - - try: - preprocessors.append( - AutoFeatureExtractor.from_pretrained( - src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code - ) - ) - except Exception as e: - logger.warning("Could not load feature extractor using specified model ID or path.\n Exception: %s", e) - - try: - preprocessors.append( - AutoImageProcessor.from_pretrained( - src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code - ) - ) - except Exception as e: - logger.warning("Could not load image processor using specified model ID or path.\n Exception: %s", e) - - return preprocessors - - -def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None): - try: - from optimum.exporters.openvino.convert import export_tokenizer - except Exception as e: - raise ImportError("Unable to import optimum Intel® package:", e) from None - - try: - from transformers import PreTrainedTokenizerBase - except Exception as e: - raise ImportError("Unable to import transformers packages:", e) from None - 
- try: - from optimum.intel.utils.import_utils import is_openvino_tokenizers_available - except Exception as e: - raise ImportError("openvino tokenizers unavailable :", e) from None - - if is_openvino_tokenizers_available(): - if library_name != "diffusers" and preprocessors: - tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None) - if tokenizer: - try: - export_tokenizer(tokenizer, output, task=task) - except Exception as exception: - logger.warning( - "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer models won't be generated. Exception: %s", - exception, - ) - elif model: - for tokenizer_name in ("tokenizer", "tokenizer_2", "tokenizer_3"): - tokenizer = getattr(model, tokenizer_name, None) - if tokenizer: - export_tokenizer(tokenizer, output / tokenizer_name, task=task) - else: - logger.warning("Tokenizer won't be converted.") - - -def _validate_enum_value(value, enum_class: type, param_name: str) -> tuple[bool, str]: - """Validate that a value can be converted to an enum (case-insensitive). - - Args: - value: The value to validate (None, string, or already enum). - enum_class: The enum class to validate against. - param_name: Name of the parameter for error messages. - - Returns: - Tuple of (is_valid, error_message). error_message is empty if valid. - - """ - if value is None or isinstance(value, enum_class): - return True, "" - - if not isinstance(value, str): - return False, f"{param_name} '{value}' is not a valid string or {enum_class.__name__} enum." 
- - lookup_key = value.lower() - - # Try matching by enum.value first (case-insensitive) - value_map = {m.value.lower(): m for m in enum_class} - if lookup_key in value_map: - return True, "" - - # Try matching by enum.name (case-insensitive) - name_map = {m.name.lower(): m for m in enum_class} - if lookup_key in name_map: - return True, "" - - # Validation failed - valid_values = sorted(set([m.value for m in enum_class] + [m.name for m in enum_class])) - return False, f"{param_name} '{value}' is not supported. Supported values are: {', '.join(valid_values)}." - - -def _convert_to_enum(value, enum_class: type, param_name: str): - """Convert a value to an enum if needed (case-insensitive). - - Accepts: - - None (returns None) - - Enum instances of the correct type (returns as-is) - - Strings matching enum.value (case-insensitive) - - Strings matching enum.name (case-insensitive) - - Args: - value: The value to convert (None, string, or already enum). - enum_class: The enum class to convert to. - param_name: Name of the parameter for error messages. - - Returns: - The enum value, or None if input was None. - - Raises: - ValueError: If conversion fails. - - """ - if value is None or isinstance(value, enum_class): - return value - - if not isinstance(value, str): - raise ValueError(f"{param_name} '{value}' is not a valid string or {enum_class.__name__} enum.") - - lookup_key = value.lower() - - # Try matching by enum.value first (case-insensitive) - value_map = {m.value.lower(): m for m in enum_class} - if lookup_key in value_map: - return value_map[lookup_key] - - # Try matching by enum.name (case-insensitive) - name_map = {m.name.lower(): m for m in enum_class} - if lookup_key in name_map: - return name_map[lookup_key] - - # Conversion failed - valid_values = sorted(set([m.value for m in enum_class] + [m.name for m in enum_class])) - raise ValueError(f"{param_name} '{value}' is not supported. 
Supported values are: {', '.join(valid_values)}.") - - def _convert_compress_config_enums(compress_config: dict) -> dict: """Convert compress_config enum values from strings to enum instances. @@ -651,14 +439,79 @@ def _get_advanced_compression_params(config): return advanced_params + def _apply_compression( + self, + model_to_compress: Any, + config: type[BasePassConfig], + output_model_path: str, + tokenizer: Optional[Any] = None, + ) -> Any: + """Apply NNCF weight compression to a model. + + Args: + model_to_compress: The model object to compress (OpenVINO model or ONNX model). + config: The pass configuration. + output_model_path: Path where the output model will be saved. + tokenizer: Optional tokenizer for dataset transform (used in HF path). + + Returns: + The compressed model object from nncf.compress_weights(). + + Raises: + ImportError: If nncf is not installed. + + """ + try: + import nncf + from nncf.onnx.quantization.backend_parameters import BackendParameters + except ImportError: + raise ImportError("Please install olive-ai[openvino] to use OpenVINO NNCF") from None + + # get the weight compression dataset + compression_dataset = self._get_nncf_dataset(config, tokenizer) + + # get the extra params + extra_params = self._get_extra_params(config) + + # local copy of compress_config and ensure enum values are converted + # (handles case where validate_config was bypassed, e.g., in unit tests) + compress_config = deepcopy(config.compress_config) if config.compress_config else {} + compress_config = _convert_compress_config_enums(compress_config) + + # append extra params to compress config + compress_config.update(extra_params) + + # get nncf.AdvancedCompressionParameters if any + advanced_params = None + adv_par = self._get_advanced_compression_params(config) + if adv_par is not None: + # Handle external_dir for backend_params - add output path at runtime + if adv_par.get("_external_dir") is not None: + # Create or update backend_params with external 
data dir + if adv_par.get("backend_params") is None: + adv_par["backend_params"] = {BackendParameters.EXTERNAL_DATA_DIR: output_model_path} + else: + adv_par["backend_params"][BackendParameters.EXTERNAL_DATA_DIR] = output_model_path + # Remove the temporary _external_dir key + adv_par.pop("_external_dir") + + advanced_params = nncf.AdvancedCompressionParameters(**adv_par) + + # perform weight compression + return nncf.compress_weights( + model_to_compress, dataset=compression_dataset, advanced_parameters=advanced_params, **compress_config + ) + def _run_for_config( self, - model: Union[HfModelHandler, ONNXModelHandler], + model: Union[HfModelHandler, ONNXModelHandler, OpenVINOModelHandler], config: type[BasePassConfig], output_model_path: str, ) -> Union[OpenVINOModelHandler, ONNXModelHandler, CompositeModelHandler]: - if not isinstance(model, (HfModelHandler, ONNXModelHandler)): - raise TypeError("OpenVINOWeightCompression pass can only be applied to Hugging Face or ONNX models") + if not isinstance(model, (HfModelHandler, ONNXModelHandler, OpenVINOModelHandler)): + raise TypeError( + "OpenVINOWeightCompression pass can only be applied to Hugging Face, ONNX, or OpenVINO models" + ) if config.reuse_cache: model_name_path = Path(model.model_path) @@ -680,6 +533,8 @@ def _run_for_config( output_model = self._run_hf_pass(model, config, output_model_path) elif isinstance(model, ONNXModelHandler): output_model = self._run_onnx_pass(model, config, output_model_path) + elif isinstance(model, OpenVINOModelHandler): + output_model = self._run_openvino_pass(model, config, output_model_path) if config.reuse_cache: if os.path.exists(model_name_path): @@ -696,10 +551,7 @@ def _run_hf_pass( output_model_path: str, ) -> Union[OpenVINOModelHandler, CompositeModelHandler]: try: - import nncf - from nncf.onnx.quantization.backend_parameters import BackendParameters from optimum.exporters.openvino import main_export as export_optimum_intel - from optimum.intel.utils.modeling_utils 
import _infer_library_from_model_name_or_path except ImportError: raise ImportError( "Please install Intel® optimum[openvino] to use NNCF for weight compression on HF models" @@ -708,123 +560,34 @@ def _run_hf_pass( # local copy of extra_args extra_args = deepcopy(config.extra_args) if config.extra_args else {} - # local copy of compress_config and ensure enum values are converted - # (handles case where validate_config was bypassed, e.g., in unit tests) - compress_config = deepcopy(config.compress_config) if config.compress_config else {} - compress_config = _convert_compress_config_enums(compress_config) - # set the library name for the HF Model if extra_args.get("library") is None: - lib_name = _infer_library_from_model_name_or_path(model.model_name_or_path) - if lib_name == "sentence_transformers": - logger.warning( - "Library is not specified. " - "There are multiple possible variants: `sentence_transformers`, `transformers`. " - "`transformers` will be selected. " - "If you want to load your model with the `sentence-transformers` library instead, " - "Please set it as sentence_transformers in extra_args dictionary under 'library' key" - ) - lib_name = "transformers" - extra_args["library"] = lib_name + lib_name = infer_library_name(model.model_name_or_path) else: lib_name = extra_args["library"] - # infer task - task = infer_task(extra_args.get("task", "auto"), model.model_name_or_path, library_name=lib_name) - - # model - if lib_name == "diffusers": - try: - from diffusers import DiffusionPipeline - except ImportError: - raise ImportError("Please install diffusers to use OpenVINO with Diffusers models.") from None - - diffusers_config = DiffusionPipeline.load_config(model.model_name_or_path) - class_name = diffusers_config.get("_class_name", None) - - if class_name == "LatentConsistencyModelPipeline": - from optimum.intel import OVLatentConsistencyModelPipeline - - model_cls = OVLatentConsistencyModelPipeline - - elif class_name == 
"StableDiffusionXLPipeline": - from optimum.intel import OVStableDiffusionXLPipeline - - model_cls = OVStableDiffusionXLPipeline - elif class_name == "StableDiffusionPipeline": - from optimum.intel import OVStableDiffusionPipeline - - model_cls = OVStableDiffusionPipeline - elif class_name == "StableDiffusion3Pipeline": - from optimum.intel import OVStableDiffusion3Pipeline - - model_cls = OVStableDiffusion3Pipeline - elif class_name == "FluxPipeline": - from optimum.intel import OVFluxPipeline - - model_cls = OVFluxPipeline - elif class_name == "SanaPipeline": - from optimum.intel import OVSanaPipeline - - model_cls = OVSanaPipeline - else: - raise NotImplementedError(f"{class_name} isn't supported.") - - output_model = model_cls.from_pretrained( - model.model_name_or_path, export=True, load_in_8bit=False, compile=False - ) - if not extra_args.get("disable_convert_tokenizer", False): - maybe_convert_tokenizers(lib_name, output_model_path, model, task=task) - elif (task.startswith("text-generation") or "automatic-speech-recognition" in task) or ( - task == "image-text-to-text" - ): - if task.startswith("text-generation"): - from optimum.intel import OVModelForCausalLM - - model_cls = OVModelForCausalLM - elif task == "image-text-to-text": - from optimum.intel import OVModelForVisualCausalLM + # prepare extra args for export + extra_args["stateful"] = not extra_args.get("disable_stateful", False) + extra_args.pop("disable_stateful", None) + extra_args["convert_tokenizer"] = not extra_args.get("disable_convert_tokenizer", False) + extra_args.pop("disable_convert_tokenizer", None) + extra_args["library_name"] = lib_name + extra_args.pop("library", None) + + # export HF model to OpenVINO format + export_optimum_intel( + model.model_name_or_path, + output_model_path, + **extra_args, + ) - model_cls = OVModelForVisualCausalLM - else: - from optimum.intel import OVModelForSpeechSeq2Seq - - model_cls = OVModelForSpeechSeq2Seq - - output_model = model_cls.from_pretrained( - 
model.model_name_or_path, - export=True, - load_in_8bit=False, - compile=False, - stateful=not extra_args.get("disable_stateful", False), - trust_remote_code=extra_args.get("trust_remote_code", False), - variant=extra_args.get("variant", None), - cache_dir=extra_args.get("cache_dir", HUGGINGFACE_HUB_CACHE), - ) + # load the exported OpenVINO model + from optimum.intel import OVModelForCausalLM - preprocessors = maybe_load_preprocessors( - model.model_name_or_path, trust_remote_code=extra_args.get("trust_remote_code", False) - ) - if not extra_args.get("disable_convert_tokenizer", False): - maybe_convert_tokenizers(lib_name, output_model_path, preprocessors=preprocessors, task=task) - - else: - extra_args["stateful"] = not extra_args.get("disable_stateful", False) - extra_args.pop("disable_stateful", False) - extra_args["convert_tokenizer"] = not extra_args.get("disable_convert_tokenizer", False) - extra_args.pop("disable_convert_tokenizer", False) - extra_args["library_name"] = lib_name - extra_args.pop("library", None) - export_optimum_intel( - model.model_name_or_path, - output_model_path, - **extra_args, - ) + output_model = OVModelForCausalLM.from_pretrained(output_model_path, compile=False) # redirect to ONNXModelHandler if extra_args requests ONNX processing # this is also only for CausalLM models - from optimum.intel import OVModelForCausalLM - if config.extra_args and config.extra_args.get("use_onnx") and isinstance(output_model, OVModelForCausalLM): try: from optimum.onnxruntime import ORTModelForCausalLM @@ -862,38 +625,35 @@ def _run_hf_pass( ) from None tokenizer = AutoTokenizer.from_pretrained(model.model_name_or_path) - # get the weight compression dataset - compression_dataset = self._get_nncf_dataset(config, tokenizer) - - # get the extra params - extra_params = self._get_extra_params(config) + # perform weight compression using shared compression logic + output_model.model = self._apply_compression(output_model.model, config, output_model_path, 
tokenizer) - # append extra params to compress config - compress_config.update(extra_params) + # save compressed model to temp directory to avoid file locking issues, + # then copy back to output_model_path + import gc + import shutil + import tempfile - # get nncf.AdvancedCompressionParameters if any - advanced_params = None - adv_par = self._get_advanced_compression_params(config) - if adv_par is not None: - # Handle external_dir for backend_params - add output path at runtime - if adv_par.get("_external_dir") is not None: - # Create or update backend_params with external data dir - if adv_par.get("backend_params") is None: - adv_par["backend_params"] = {BackendParameters.EXTERNAL_DATA_DIR: output_model_path} - else: - adv_par["backend_params"][BackendParameters.EXTERNAL_DATA_DIR] = output_model_path - # Remove the temporary _external_dir key - adv_par.pop("_external_dir") - - advanced_params = nncf.AdvancedCompressionParameters(**adv_par) - - # perform weight compression - output_model.model = nncf.compress_weights( - output_model.model, dataset=compression_dataset, advanced_parameters=advanced_params, **compress_config - ) - - # save to output_model_path - output_model.save_pretrained(output_model_path) + temp_dir = None + try: + temp_dir = tempfile.mkdtemp(prefix="olive_ov_compress_") + output_model.save_pretrained(temp_dir) + + # release model to free file handles before copying + del output_model + gc.collect() + + # copy all files from temp_dir back to output_model_path + for item in Path(temp_dir).iterdir(): + dest = Path(output_model_path) / item.name + if item.is_file(): + shutil.copy2(item, dest) + elif item.is_dir(): + shutil.copytree(item, dest, dirs_exist_ok=True) + finally: + # clean up temp directory + if temp_dir is not None: + shutil.rmtree(temp_dir, ignore_errors=True) # check the exported components exported_models = [name.stem for name in Path(output_model_path).iterdir() if name.suffix == ".xml"] @@ -959,9 +719,7 @@ def _run_onnx_pass( 
output_model_path: str, ) -> ONNXModelHandler: try: - import nncf import onnx - from nncf.onnx.quantization.backend_parameters import BackendParameters except ImportError: raise ImportError( "Please install Intel® NNCF and ONNX to use nncf.compress_weights() on ONNX models" @@ -975,41 +733,8 @@ def _run_onnx_pass( if loaded_model.opset_import[0].version != target_opset: loaded_model = onnx.version_converter.convert_version(loaded_model, target_opset) - # local copy of compress_config and ensure enum values are converted - # (handles case where validate_config was bypassed, e.g., in unit tests) - compress_config = deepcopy(config.compress_config) if config.compress_config else {} - compress_config = _convert_compress_config_enums(compress_config) - - # get the weight compression dataset - compression_dataset = self._get_nncf_dataset(config) - - # get the extra params - extra_params = self._get_extra_params(config) - - # append extra params to compress config - compress_config.update(extra_params) - - # get nncf.AdvancedCompressionParameters if any - advanced_params = None - adv_par = self._get_advanced_compression_params(config) - if adv_par is not None: - # Handle external_dir for backend_params - add output path at runtime - if adv_par.get("_external_dir") is not None: - # Create or update backend_params with external data dir - # Note: BackendParameters is already imported from nncf.onnx.quantization.backend_parameters - if adv_par.get("backend_params") is None: - adv_par["backend_params"] = {BackendParameters.EXTERNAL_DATA_DIR: output_model_path} - else: - adv_par["backend_params"][BackendParameters.EXTERNAL_DATA_DIR] = output_model_path - # Remove the temporary _external_dir key - adv_par.pop("_external_dir") - - advanced_params = nncf.AdvancedCompressionParameters(**adv_par) - - # perform weight compression - output_model = nncf.compress_weights( - loaded_model, dataset=compression_dataset, advanced_parameters=advanced_params, **compress_config - ) + # perform 
weight compression using shared compression logic + output_model = self._apply_compression(loaded_model, config, output_model_path) # save to output_model_path model_name = Path(model.model_path).name.replace(".onnx", "_compressed.onnx") @@ -1024,147 +749,44 @@ def _run_onnx_pass( return ONNXModelHandler(model_path=output_model_path) + def _run_openvino_pass( + self, + model: OpenVINOModelHandler, + config: type[BasePassConfig], + output_model_path: str, + ) -> OpenVINOModelHandler: + """Run weight compression on an OpenVINO model. -def create_genai_config(model_name: str, output_path: str, config: type[BasePassConfig]) -> None: - """Generate the genai_config.json from the model config files. + Args: + model: The OpenVINO model handler. + config: The pass configuration. + output_model_path: Path where the output model will be saved. - This is only for Generative AI models for which the config.json and generation_config.json files exist - Arguments: - @param model_name: name of model ONNX file that is generated - @param output_path: path to the output directory where the genai_config.json file will be created - @return: None - """ - ip_conf_pth = Path(output_path) / "config.json" - - # do not create genai_config.json if config.json does not exist - if not ip_conf_pth.exists(): - return - - ip_gen_pth = Path(output_path) / "generation_config.json" - - # do not create genai_config.json if generation_config.json does not exist - if not ip_gen_pth.exists(): - return - - # Step 1: Create your data structure - genai_config = { - "model": { - "bos_token_id": -1, - "context_length": -1, - "decoder": { - "session_options": { - "log_id": "onnxruntime-genai", - "graph_optimization_level": "ORT_DISABLE_ALL", - "provider_options": [ - {"OpenVINO": {"device_type": config.target_device.upper(), "enable_causallm": "True"}} - ], - }, - "filename": "openvino_model.onnx", - "head_size": -1, - "hidden_size": -1, - "inputs": {}, - "outputs": {}, - "num_attention_heads": -1, - 
"num_hidden_layers": -1, - "num_key_value_heads": -1, - }, - "eos_token_id": -1, - "type": "", - "vocab_size": -1, - }, - "search": { - "diversity_penalty": 0.0, - "do_sample": False, - "early_stopping": True, - "length_penalty": 1.0, - "max_length": -1, - "min_length": 0, - "no_repeat_ngram_size": 0, - "num_beams": 1, - "num_return_sequences": 1, - "past_present_share_buffer": False, - "repetition_penalty": 1.0, - "temperature": 1.0, - "top_k": 1, - "top_p": 1.0, - }, - } - - import json - - with open(ip_conf_pth) as f: - src_config = json.load(f) - - with open(ip_gen_pth) as f: - src_gen_config = json.load(f) + Returns: + OpenVINOModelHandler for the compressed model. - try: - import onnx - except ImportError: - raise ImportError( - "Please install onnx to create genai_config.json for ONNX OpenVINO IR Encapsulated model" - ) from None - - model_path = Path(output_path) / model_name - model = onnx.load(model_path) - - # Get input and output tensor names - inputs = [inp.name for inp in model.graph.input] - outputs = [out.name for out in model.graph.output] - - genai_config["model"]["bos_token_id"] = src_config.get("bos_token_id", -1) - genai_config["model"]["context_length"] = src_config.get("max_position_embeddings", -1) - genai_config["model"]["decoder"]["filename"] = model_name - num_attention_heads = src_config.get("num_attention_heads", -1) - hidden_size = src_config.get("hidden_size", -1) - if ( - isinstance(num_attention_heads, int) - and isinstance(hidden_size, int) - and num_attention_heads > 0 - and hidden_size >= 0 - ): - genai_config["model"]["decoder"]["head_size"] = hidden_size // num_attention_heads - else: - if not isinstance(num_attention_heads, int): - logger.warning("num_attention_heads is not an int: %s found in src_config", num_attention_heads) - elif num_attention_heads <= 0: - logger.warning("Invalid num_attention_heads (<= 0) %s found in src_config", num_attention_heads) - if not isinstance(hidden_size, int): - logger.warning("hidden_size is 
not an int: %s found in src_config", hidden_size) - elif hidden_size < 0: - logger.warning("Invalid hidden_size (< 0) %s found in src_config", hidden_size) - logger.warning("Setting genai_config['model']['decoder']['head_size'] to -1") - genai_config["model"]["decoder"]["head_size"] = -1 - genai_config["model"]["decoder"]["hidden_size"] = src_config.get("hidden_size", -1) - - for name in inputs: - if name != "beam_idx": - genai_config["model"]["decoder"]["inputs"].update({name: name}) - - for name in outputs: - genai_config["model"]["decoder"]["outputs"].update({name: name}) - - genai_config["model"]["decoder"]["num_attention_heads"] = src_config.get("num_attention_heads", -1) - genai_config["model"]["decoder"]["num_hidden_layers"] = src_config.get("num_hidden_layers", -1) - genai_config["model"]["decoder"]["num_key_value_heads"] = src_config.get("num_key_value_heads", -1) - - eos_token_id = src_gen_config.get("eos_token_id", -1) - genai_config["model"]["eos_token_id"] = eos_token_id - pad_token_id = src_gen_config.get("pad_token_id", None) - if pad_token_id is not None: - genai_config["model"]["pad_token_id"] = pad_token_id - elif eos_token_id != -1: - genai_config["model"]["pad_token_id"] = ( - eos_token_id[0] if isinstance(eos_token_id, list) and len(eos_token_id) > 0 else eos_token_id - ) - else: - genai_config["model"]["pad_token_id"] = -1 - genai_config["model"]["type"] = src_config.get("model_type", "") - genai_config["model"]["vocab_size"] = src_config.get("vocab_size", -1) + Raises: + ImportError: If openvino is not installed. 
+ + """ + try: + import openvino as ov + except ImportError: + raise ImportError("Please install openvino to use OpenVINO weight compression") from None + + # load the OpenVINO model + core = ov.Core() + model_config = model.model_config + loaded_model = core.read_model(model_config["model"]) + + # perform weight compression using shared compression logic + compressed_model = self._apply_compression(loaded_model, config, output_model_path) - genai_config["search"]["max_length"] = src_config.get("max_position_embeddings", -1) + # save the compressed model + output_dir = Path(output_model_path) + output_dir.mkdir(parents=True, exist_ok=True) + model_name = model_config["model_name"] + output_xml_path = output_dir / f"{model_name}.xml" + ov.save_model(compressed_model, output_xml_path) - # Step 2: Write to JSON file - output_genai_config = Path(output_path) / "genai_config.json" - with open(output_genai_config, "w") as f: - json.dump(genai_config, f, indent=4) + return OpenVINOModelHandler(model_path=output_model_path) diff --git a/olive/passes/openvino/encapsulation.py b/olive/passes/openvino/encapsulation.py index f052f88b50..c8e24a2b37 100644 --- a/olive/passes/openvino/encapsulation.py +++ b/olive/passes/openvino/encapsulation.py @@ -3,11 +3,9 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- import logging -import numbers import os -from collections.abc import Mapping, MutableMapping from pathlib import Path -from typing import Any, ClassVar, Union +from typing import ClassVar, Union import onnx.helper as helper from onnx import TensorProto, save @@ -16,6 +14,7 @@ from olive.hardware.accelerator import AcceleratorSpec, Device from olive.model import ONNXModelHandler, OpenVINOModelHandler from olive.passes import Pass +from olive.passes.openvino.ov_utils import create_genai_config from olive.passes.pass_config import BasePassConfig, PassConfigParam logger = logging.getLogger(__name__) @@ -261,171 +260,3 @@ def extract_shape_list(shape, config, prefix: str = "input_0_") -> list: else: shape_list.append(-1) return shape_list - - -def _compatible_type(default_val: Any, new_val: Any) -> bool: - """Loose type check: allow ints for floats, bool as bool, etc.""" - if default_val is None: - return True - if isinstance(default_val, bool): - return isinstance(new_val, bool) - if isinstance(default_val, numbers.Real) and not isinstance(default_val, bool): - return isinstance(new_val, numbers.Real) and not isinstance(new_val, bool) - if isinstance(default_val, str): - return isinstance(new_val, str) - if isinstance(default_val, (list, tuple)): - return isinstance(new_val, (list, tuple)) - if isinstance(default_val, Mapping): - return isinstance(new_val, Mapping) - return True # fall back to permissive - - -def apply_genai_overrides( - defaults: MutableMapping[str, Any], overrides: Mapping[str, Any], *, path: str = "" -) -> MutableMapping[str, Any]: - """Recursively merge `overrides` into `defaults`.""" - for k, v in overrides.items(): - here = f"{path}.{k}" if path else k - if k not in defaults: - continue - - dv = defaults[k] - - # Recurse for dicts - if isinstance(dv, Mapping) and isinstance(v, Mapping): - apply_genai_overrides(dv, v, path=here) - continue - - # Replace lists/tuples and 
scalars - if not _compatible_type(dv, v): - logger.warning("Type mismatch at %s", here) - defaults[k] = v - return defaults - - -def create_genai_config(model_name: str, output_path: str, config: type[BasePassConfig]) -> None: - """Generate the genai_config.json from the model config files. - - This is only for Generative AI models for which the config.json and generation_config.json files exist - Arguments: - @param model_name: name of model ONNX file that is generated - @param output_path: path to the output directory where the genai_config.json file will be created - @return: None - """ - ip_conf_pth = Path(output_path) / "config.json" - - # do not create genai_config.json if config.json does not exist - if not ip_conf_pth.exists(): - return - - ip_gen_pth = Path(output_path) / "generation_config.json" - - # do not create genai_config.json if generation_config.json does not exist - if not ip_gen_pth.exists(): - return - - # Step 1: Create your data structure - genai_config = { - "model": { - "bos_token_id": -1, - "context_length": -1, - "decoder": { - "session_options": { - "log_id": "onnxruntime-genai", - "graph_optimization_level": "ORT_DISABLE_ALL", - "provider_options": [ - {"OpenVINO": {"device_type": config.target_device.upper(), "enable_causallm": "True"}} - ], - }, - "filename": "openvino_model.onnx", - "head_size": -1, - "hidden_size": -1, - "inputs": {}, - "outputs": {}, - "num_attention_heads": -1, - "num_hidden_layers": -1, - "num_key_value_heads": -1, - }, - "eos_token_id": -1, - "type": "", - "vocab_size": -1, - }, - "search": { - "diversity_penalty": 0.0, - "do_sample": False, - "early_stopping": True, - "length_penalty": 1.0, - "max_length": -1, - "min_length": 0, - "no_repeat_ngram_size": 0, - "num_beams": 1, - "num_return_sequences": 1, - "past_present_share_buffer": False, - "repetition_penalty": 1.0, - "temperature": 1.0, - "top_k": 1, - "top_p": 1.0, - }, - } - - import json - - with open(ip_conf_pth) as f: - src_config = json.load(f) - - 
with open(ip_gen_pth) as f: - src_gen_config = json.load(f) - - try: - import onnx - except ImportError: - raise ImportError( - "Please install onnx to create genai_config.json for ONNX OpenVINO IR Encapsulated model" - ) from None - - model_path = Path(output_path) / model_name - model = onnx.load(model_path) - - # Get input and output tensor names - inputs = [inp.name for inp in model.graph.input] - outputs = [out.name for out in model.graph.output] - - genai_config["model"]["bos_token_id"] = src_config.get("bos_token_id", -1) - genai_config["model"]["context_length"] = src_config.get("max_position_embeddings", -1) - genai_config["model"]["decoder"]["filename"] = model_name - genai_config["model"]["decoder"]["head_size"] = src_config.get("hidden_size", -1) // src_config.get( - "num_attention_heads", -1 - ) - genai_config["model"]["decoder"]["hidden_size"] = src_config.get("hidden_size", -1) - - for name in inputs: - if name != "beam_idx": - genai_config["model"]["decoder"]["inputs"].update({name: name}) - - for name in outputs: - genai_config["model"]["decoder"]["outputs"].update({name: name}) - - genai_config["model"]["decoder"]["num_attention_heads"] = src_config.get("num_attention_heads", -1) - genai_config["model"]["decoder"]["num_hidden_layers"] = src_config.get("num_hidden_layers", -1) - genai_config["model"]["decoder"]["num_key_value_heads"] = src_config.get("num_key_value_heads", -1) - - genai_config["model"]["eos_token_id"] = src_gen_config.get("eos_token_id", -1) - genai_config["model"]["pad_token_id"] = ( - src_gen_config["pad_token_id"] - if hasattr(src_gen_config, "pad_token_id") and src_gen_config["pad_token_id"] is not None - else src_gen_config["eos_token_id"][0] - if isinstance(src_gen_config["eos_token_id"], list) - else src_gen_config["eos_token_id"] - ) - genai_config["model"]["type"] = src_config.get("model_type", "") - genai_config["model"]["vocab_size"] = src_config.get("vocab_size", -1) - - genai_config["search"]["max_length"] = 
src_config.get("max_position_embeddings", -1) - - if isinstance(config.genai_config_override, dict): - apply_genai_overrides(genai_config, config.genai_config_override) - - # Step 2: Write to JSON file - output_genai_config = Path(output_path) / "genai_config.json" - with open(output_genai_config, "w") as f: - json.dump(genai_config, f, indent=4) diff --git a/olive/passes/openvino/optimum_intel.py b/olive/passes/openvino/optimum_intel.py index f872c14539..2105f512f2 100644 --- a/olive/passes/openvino/optimum_intel.py +++ b/olive/passes/openvino/optimum_intel.py @@ -5,7 +5,7 @@ import logging from copy import deepcopy from pathlib import Path -from typing import Optional, Union +from typing import Any, Optional, Union from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE @@ -13,55 +13,12 @@ from olive.hardware.accelerator import AcceleratorSpec, Device from olive.model import CompositeModelHandler, HfModelHandler, OpenVINOModelHandler from olive.passes import Pass +from olive.passes.openvino.ov_utils import OVOptimumLibrary, infer_library_name from olive.passes.pass_config import BasePassConfig, PassConfigParam, get_user_script_data_config logger = logging.getLogger(__name__) -def maybe_load_preprocessors( - src_name_or_path: Union[str, Path], subfolder: str = "", trust_remote_code: bool = False -) -> list: - try: - from transformers import AutoFeatureExtractor, AutoImageProcessor, AutoProcessor, AutoTokenizer - except Exception as e: - raise ImportError("Unable to import transformers packages: ", e) from None - - preprocessors = [] - try: - preprocessors.append( - AutoTokenizer.from_pretrained(src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code) - ) - except Exception: - pass - - try: - preprocessors.append( - AutoProcessor.from_pretrained(src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code) - ) - except Exception: - pass - - try: - preprocessors.append( - AutoFeatureExtractor.from_pretrained( - src_name_or_path, 
subfolder=subfolder, trust_remote_code=trust_remote_code - ) - ) - except Exception: - pass - - try: - preprocessors.append( - AutoImageProcessor.from_pretrained( - src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code - ) - ) - except Exception: - pass - - return preprocessors - - def infer_task( task, model_name_or_path, @@ -70,9 +27,10 @@ def infer_task( cache_dir: str = HUGGINGFACE_HUB_CACHE, token: Optional[Union[bool, str]] = None, library_name: Optional[str] = None, + trust_remote_code: bool = False, ): try: - from optimum.exporters import TasksManager + from optimum.exporters.tasks import TasksManager except Exception as e: raise ImportError("Unable to import optimum packages:", e) from None @@ -81,6 +39,7 @@ def infer_task( except Exception as e: raise ImportError("Unable to import ConnectionError packages:", e) from None + original_task = task task = TasksManager.map_from_synonym(task) if task == "auto": if library_name == "open_clip": @@ -96,52 +55,149 @@ def infer_task( library_name=library_name, ) except KeyError as e: - raise KeyError( - f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) from None + try: + from transformers import AutoConfig + except ImportError as ie: + raise ImportError(f"Unable to import AutoConfig from transformers: {ie}") from None + try: + config = AutoConfig.from_pretrained(model_name_or_path) + with_past_arch_list = ["MistralForCausalLM", "Zamba2ForCausalLM"] + architectures = getattr(config, "architectures", None) or [] + if any(arch in architectures for arch in with_past_arch_list): + task = "text-generation-with-past" + except Exception: + raise KeyError( + f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. 
Detailed error: {e}" + ) from None except RequestsConnectionError as e: raise RequestsConnectionError( f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" ) from None - return task + if library_name == "transformers": + try: + from transformers import AutoConfig + except ImportError as e: + raise ImportError(f"Unable to import AutoConfig from transformers: {e}") from None + config = AutoConfig.from_pretrained( + model_name_or_path, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + trust_remote_code=trust_remote_code, + ) + if hasattr(config, "export_model_type"): + model_type = config.export_model_type + else: + model_type = config.model_type + custom_architecture = model_type not in TasksManager._SUPPORTED_MODEL_TYPE # pylint: disable=W0212 + if not custom_architecture and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type( + model_type, exporter="openvino", library_name=library_name + ): + # Make -with-past the default if --task was not explicitly specified + if original_task == "auto": + task = task + "-with-past" + else: + logger.info( + "The task `%s` was manually specified, and past key values will not be reused in the decoding." 
+ " if needed, please pass `--task %s-with-past` to export using the past key values.", + task, + task, + ) + return task -def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None): - from optimum.exporters.openvino.convert import export_tokenizer +def _main_quantize( + model_name_or_path: str, + task: str, + library_name: str, + quantization_config: Union[dict, "OVQuantizationConfigBase"], # noqa: F821 + output: Path, + cache_dir: str, + trust_remote_code: bool = False, + subfolder: str = "", + revision: str = "main", + token: Optional[Union[bool, str]] = None, + model_kwargs: Optional[dict[str, Any]] = None, +): try: - from transformers import PreTrainedTokenizerBase, ProcessorMixin - except Exception as e: - raise ImportError("Unable to import transformers packages:", e) from None + from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS + from optimum.intel.utils.import_utils import is_diffusers_available + except ImportError as e: + raise ImportError("Please install Intel® optimum[openvino] to use OpenVINO Optimum Conversion") from e + + # Step 0. 
Infer task and library name if needed + original_task = task + task = infer_task( + task, + model_name_or_path, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + library_name=library_name, + trust_remote_code=trust_remote_code, + ) + if library_name is None: + library_name = infer_library_name( + model_name_or_path, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + ) - try: - from optimum.intel.utils.import_utils import is_openvino_tokenizers_available - except Exception as e: - raise ImportError("openvino tokenizers unavailable :", e) from None - - if is_openvino_tokenizers_available(): - if library_name != "diffusers" and preprocessors: - processor_chat_template = None - tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None) - if len(preprocessors) > 1: - for processor in preprocessors: - if isinstance(processor, ProcessorMixin) and hasattr(processor, "chat_template"): - processor_chat_template = processor.chat_template - if tokenizer: - try: - export_tokenizer(tokenizer, output, task=task, processor_chat_template=processor_chat_template) - except Exception as exception: - logger.warning( - "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer models won't be generated. Exception: %s", - exception, - ) - elif model: - for tokenizer_name in ("tokenizer", "tokenizer_2", "tokenizer_3"): - tokenizer = getattr(model, tokenizer_name, None) - if tokenizer: - export_tokenizer(tokenizer, output / tokenizer_name, task=task) + # Step 1. 
Obtain the correct OpenVINO model class + if library_name == "diffusers": + if not is_diffusers_available(): + raise ValueError("Export of diffusers models requires the diffusers library to be installed.") + + try: + from diffusers import DiffusionPipeline + except ImportError as e: + raise ImportError("Unable to import diffusers packages:", e) from None + + diffusers_config = DiffusionPipeline.load_config(model_name_or_path) + class_name = diffusers_config.get("_class_name", None) + ov_class_name = f"OV{class_name}" + try: + model_cls = getattr(__import__("optimum.intel", fromlist=[ov_class_name]), ov_class_name) + except (AttributeError, ImportError) as e: + raise RuntimeError(f"Wasn't able to locate OpenVINO class for {class_name} diffusion model.") from e else: - logger.warning("Tokenizer won't be converted.") + try: + model_cls_name = _HEAD_TO_AUTOMODELS[task.replace("-with-past", "")] + if library_name == "sentence_transformers": + model_cls_name = "OVSentenceTransformer" + model_cls = getattr(__import__("optimum.intel", fromlist=[model_cls_name]), model_cls_name) + except (AttributeError, ImportError, KeyError) as e: + raise RuntimeError(f"Wasn't able to locate OpenVINO class for task {original_task} ({task}).") from e + + # Step 2. Load the exported model + # Filter out keys that are explicitly passed to from_pretrained to avoid + # "got multiple values for keyword argument" TypeError + _explicit_keys = {"trust_remote_code", "cache_dir", "use_cache", "compile"} + filtered_kwargs = {k: v for k, v in (model_kwargs or {}).items() if k not in _explicit_keys} + model = model_cls.from_pretrained( + output, + compile=False, + trust_remote_code=trust_remote_code, + cache_dir=cache_dir, + use_cache=task.endswith("with-past"), + **filtered_kwargs, + ) + + # Step 3. 
Apply quantization and save the quantized model + model._apply_quantization( # pylint: disable=W0212 + quantization_config, + compile_only=False, + compile_model=False, + model_name_or_path=model_name_or_path, + trust_remote_code=trust_remote_code, + save_directory=output, + immediate_save=True, + ) class OVQuantMode(StrEnumBase): @@ -154,14 +210,6 @@ class OVQuantMode(StrEnumBase): INT4_F8E5M2 = "int4_f8e5m2" -class OVOptimumLibrary(StrEnumBase): - TRANSFORMERS = "transformers" - DIFFUSERS = "diffusers" - TIMM = "timm" - SENTENCE_TRANSFORMERS = "sentence_transformers" - OPEN_CLIP = "open_clip" - - class OVOptimumFramework(StrEnumBase): PT = "pt" TF = "tf" @@ -302,9 +350,12 @@ def _run_for_config( ) -> Union[OpenVINOModelHandler, CompositeModelHandler]: try: from optimum.exporters.openvino import main_export as export_optimum_intel - from optimum.exporters.openvino.utils import save_preprocessors - from optimum.intel.openvino.configuration import OVConfig, get_default_int4_config - from optimum.intel.utils.modeling_utils import _infer_library_from_model_name_or_path + from optimum.intel.openvino.configuration import ( + OVConfig, + _GPTOSSQuantizationConfig, + get_default_quantization_config, + ) + from optimum.intel.utils.import_utils import is_nncf_available except ImportError as e: raise ImportError("Please install Intel® optimum[openvino] to use OpenVINO Optimum Conversion") from e @@ -323,27 +374,18 @@ def _run_for_config( } ) - if model.load_kwargs and "trust_remote_code" not in extra_args: - extra_args["trust_remote_code"] = model.load_kwargs.trust_remote_code - if extra_args.get("library") is None: - lib_name = _infer_library_from_model_name_or_path(model.model_name_or_path) - if lib_name == "sentence_transformers": - logger.warning( - "Library is not specified. " - "There are multiple possible variants: `sentence_transformers`, `transformers`. " - "`transformers` will be selected. 
" - "If you want to load your model with the `sentence-transformers` library instead, " - "Please set it as sentence_transformers in extra_args dictionary under 'library' key" - ) - lib_name = "transformers" + lib_name = infer_library_name(model.model_name_or_path) else: lib_name = extra_args["library"] if config.ov_quant_config: if config.ov_quant_config.get("weight_format") is None and config.ov_quant_config.get("quant_mode") is None: ov_config = None - if not no_compression_parameter_provided(config.ov_quant_config): + if ( + not no_compression_parameter_provided(config.ov_quant_config) + or config.ov_quant_config.get("quantization_statistics_path", None) is not None + ): raise ValueError( "Some compression parameters are provided, but the weight format is not specified. " "Please provide it with weight_format key in ov_quant_config dictionary." @@ -356,139 +398,98 @@ def _run_for_config( elif config.ov_quant_config.get("weight_format") in {"fp16", "fp32"}: ov_config = OVConfig(dtype=config.ov_quant_config["weight_format"]) else: + if not is_nncf_available(): + raise ImportError("Please install nncf to use OpenVINO Optimum Conversion with quantization.") + if ( + config.ov_quant_config.get("weight_format") is not None + and config.ov_quant_config.get("quant_mode") is not None + ): + # both are provided, so raise ValueError + raise ValueError("Both weight_format and quant_mode are provided. 
Please provide only one of them.") + + default_quantization_config = get_default_quantization_config( + model.model_name_or_path, + config.ov_quant_config.get("weight_format"), + config.ov_quant_config.get("quant_mode"), + ) + if config.ov_quant_config.get("weight_format") is not None: - # For int4 quantization if no parameter is provided, then use the default config if exists + # weight compression + quant_config = prep_wc_config(config.ov_quant_config, WRAPPER_4_BIT) + if no_compression_parameter_provided(config.ov_quant_config) and config.ov_quant_config.get( + "weight_format" + ) in ["int4", "int8"]: + if default_quantization_config is not None: + quant_config = default_quantization_config + logger.info( + "Applying the default quantization config for model %s: %s", + model.model_name_or_path, + quant_config, + ) + elif config.ov_quant_config.get("weight_format") == "int4": + quant_config = WRAPPER_4_BIT + logger.info( + "Applying a default 4-bit weight compression config for model %s: %s", + model.model_name_or_path, + quant_config, + ) + if config.ov_quant_config.get("quantization_statistics_path", None) is not None: + quant_config["statistics_path"] = config.ov_quant_config.get("quantization_statistics_path") + else: if ( - no_compression_parameter_provided(config.ov_quant_config) - and config.ov_quant_config.get("weight_format") == "int4" + no_quantization_parameter_provided(config.ov_quant_config) + and default_quantization_config is not None ): - quant_config = get_default_int4_config(model.model_name_or_path) - else: - quant_config = prep_wc_config(config.ov_quant_config, WRAPPER_4_BIT) - if quant_config.get("dataset", None) is not None: - quant_config["trust_remote_code"] = config.ov_quant_config.get("trust_remote_code", False) - ov_config = OVConfig(quantization_config=quant_config) - else: - ov_config = None - if config.ov_quant_config.get("dataset", None) is None: - raise ValueError( - "Dataset is required for full quantization. 
" - "Please provide it in ov_quant_config dictionary under 'dataset' key" + quant_config = default_quantization_config + logger.info( + "Applying the default quantization config for model %s: %s", + model.model_name_or_path, + quant_config, ) - if config.ov_quant_config.get("quant_mode") in [ - "nf4_f8e4m3", - "nf4_f8e5m2", - "int4_f8e4m3", - "int4_f8e5m2", - ]: - if lib_name == "diffusers": - raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.") - wc_config = prep_wc_config(config.ov_quant_config, WRAPPER_4_BIT) - wc_dtype, q_dtype = config.ov_quant_config["quant_mode"].split("_") - wc_config["dtype"] = wc_dtype - - q_config = prep_q_config(config.ov_quant_config) - q_config["dtype"] = q_dtype - quant_config = { - "weight_quantization_config": wc_config, - "full_quantization_config": q_config, - "num_samples": self.args.num_samples, - "dataset": self.args.dataset, - "trust_remote_code": self.args.trust_remote_code, - } else: - quant_config = prep_q_config(config.ov_quant_config) - ov_config = OVConfig(quantization_config=quant_config) + if quant_config.get("dataset", None) is None: + raise ValueError( + "Dataset is required for full quantization. 
" + "Please provide it in ov_quant_config dictionary under 'dataset' key" + ) + if config.ov_quant_config.get("quant_mode") in [ + "cb4_f8e4m3", + "int4_f8e4m3", + "int4_f8e5m2", + ]: + if lib_name == "diffusers": + raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.") + wc_config = prep_wc_config(config.ov_quant_config, WRAPPER_4_BIT) + wc_dtype, q_dtype = config.ov_quant_config["quant_mode"].split("_") + wc_config["dtype"] = wc_dtype + + q_config = prep_q_config(config.ov_quant_config) + q_config["dtype"] = q_dtype + + quant_config = { + "weight_quantization_config": wc_config, + "full_quantization_config": q_config, + "num_samples": config.ov_quant_config.get("num_samples"), + "dataset": config.ov_quant_config.get("dataset"), + } + else: + if config.ov_quant_config.get("quantization_statistics_path", None) is not None: + logger.warning( + "quantization_statistics_path is only applicable for weight-only" + " quantization. It will be ignored." + ) + quant_config = prep_q_config(config.ov_quant_config) + + ov_config = OVConfig(quantization_config=quant_config) else: ov_config = None # quantization config quant_config = ov_config.quantization_config if ov_config else None - quantize_with_dataset = quant_config and getattr(quant_config, "dataset", None) is not None - task = infer_task(extra_args.get("task", "auto"), model.model_name_or_path, library_name=lib_name) - - # model - if lib_name == "diffusers" and quantize_with_dataset: - try: - from diffusers import DiffusionPipeline - except ImportError: - raise ImportError("Please install diffusers to use OpenVINO with Diffusers models.") from None - - diffusers_config = DiffusionPipeline.load_config(model.model_name_or_path) - class_name = diffusers_config.get("_class_name", None) - if class_name == "LatentConsistencyModelPipeline": - from optimum.intel import OVLatentConsistencyModelPipeline + apply_main_quantize = quant_config and not isinstance(quant_config, 
_GPTOSSQuantizationConfig) - model_cls = OVLatentConsistencyModelPipeline - - elif class_name == "StableDiffusionXLPipeline": - from optimum.intel import OVStableDiffusionXLPipeline - - model_cls = OVStableDiffusionXLPipeline - elif class_name == "StableDiffusionPipeline": - from optimum.intel import OVStableDiffusionPipeline - - model_cls = OVStableDiffusionPipeline - elif class_name == "StableDiffusion3Pipeline": - from optimum.intel import OVStableDiffusion3Pipeline - - model_cls = OVStableDiffusion3Pipeline - elif class_name == "FluxPipeline": - from optimum.intel import OVFluxPipeline - - model_cls = OVFluxPipeline - elif class_name == "SanaPipeline": - from optimum.intel import OVSanaPipeline - - model_cls = OVSanaPipeline - else: - raise NotImplementedError(f"Quantization isn't supported for class {class_name}.") - - output_model = model_cls.from_pretrained( - model.model_name_or_path, export=True, quantization_config=quant_config - ) - output_model.save_pretrained(output_model_path) - if not extra_args.get("disable_convert_tokenizer", False): - maybe_convert_tokenizers(lib_name, output_model_path, model, task=task) - elif ( - quantize_with_dataset and (task.startswith("text-generation") or "automatic-speech-recognition" in task) - ) or (task == "image-text-to-text" and quant_config is not None): - if task.startswith("text-generation"): - from optimum.intel import OVModelForCausalLM - - model_cls = OVModelForCausalLM - elif task == "image-text-to-text": - from optimum.intel import OVModelForVisualCausalLM - - model_cls = OVModelForVisualCausalLM - else: - from optimum.intel import OVModelForSpeechSeq2Seq - - model_cls = OVModelForSpeechSeq2Seq - - # In this case, to apply quantization an instance of a model class is required - output_model = model_cls.from_pretrained( - model.model_name_or_path, - export=True, - quantization_config=quant_config, - stateful=not extra_args.get("disable_stateful", False), - trust_remote_code=extra_args.get("trust_remote_code", 
False), - variant=extra_args.get("variant", None), - cache_dir=extra_args.get("cache_dir", HUGGINGFACE_HUB_CACHE), - ) - output_model.save_pretrained(output_model_path) - - preprocessors = maybe_load_preprocessors( - model.model_name_or_path, trust_remote_code=extra_args.get("trust_remote_code", False) - ) - save_preprocessors( - preprocessors, output_model.config, output_model_path, extra_args.get("trust_remote_code", False) - ) - if not extra_args.get("disable_convert_tokenizer", False): - maybe_convert_tokenizers(lib_name, output_model_path, preprocessors=preprocessors, task=task) - - else: + try: extra_args["ov_config"] = ov_config extra_args["stateful"] = not extra_args.get("disable_stateful", False) extra_args.pop("disable_stateful", False) @@ -501,6 +502,21 @@ def _run_for_config( output_model_path, **extra_args, ) + if apply_main_quantize: + _main_quantize( + model_name_or_path=model.model_name_or_path, + task=extra_args.get("task", "auto"), + library_name=lib_name, + quantization_config=quant_config, + output=Path(output_model_path), + cache_dir=config.ov_quant_config.get("cache_dir", None) if config.ov_quant_config else None, + trust_remote_code=config.ov_quant_config.get("trust_remote_code", False) + if config.ov_quant_config + else False, + model_kwargs=model.load_kwargs.__dict__ if model.load_kwargs else None, + ) + except Exception as e: + raise RuntimeError(f"OpenVINO optimum export failed: {e}") from None # check the exported components exported_models = [name.stem for name in Path(output_model_path).iterdir() if name.suffix == ".xml"] @@ -578,6 +594,8 @@ def prep_wc_config(quant_cfg, default_cfg): "lora_correction": quant_cfg.get("lora_correction", None), "dtype": quant_cfg.get("weight_format"), "backup_precision": quant_cfg.get("backup_precision"), + "statistics_path": quant_cfg.get("statistics_path", None), + "group_size_fallback": quant_cfg.get("group_size_fallback", None), } @@ -590,7 +608,6 @@ def prep_q_config(quant_cfg): "dataset": 
# -------------------------------------------------------------------------
# Copyright (c) Intel Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import json
import logging
import numbers
from collections.abc import Mapping, MutableMapping
from pathlib import Path
from typing import Any, Optional, Union

from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE

from olive.common.utils import StrEnumBase
from olive.passes.pass_config import BasePassConfig

logger = logging.getLogger(__name__)


class IgnoreScopeTypeEnum(StrEnumBase):
    # Kinds of ignored-scope selectors accepted by NNCF compression/quantization.
    NAMES = "names"
    TYPES = "types"
    PATTERNS = "patterns"


class OVOptimumLibrary(StrEnumBase):
    # Model libraries supported by the OpenVINO Optimum conversion pass.
    TRANSFORMERS = "transformers"
    DIFFUSERS = "diffusers"
    TIMM = "timm"
    SENTENCE_TRANSFORMERS = "sentence_transformers"
    OPEN_CLIP = "open_clip"


def _validate_enum_value(value, enum_class: type, param_name: str) -> tuple[bool, str]:
    """Validate that a value can be converted to an enum (case-insensitive).

    Delegates to :func:`_convert_to_enum` so that validation and conversion
    can never disagree on which inputs are accepted or on the exact error
    message text.

    Args:
        value: The value to validate (None, string, or already enum).
        enum_class: The enum class to validate against.
        param_name: Name of the parameter for error messages.

    Returns:
        Tuple of (is_valid, error_message). error_message is empty if valid.

    """
    try:
        _convert_to_enum(value, enum_class, param_name)
    except ValueError as e:
        return False, str(e)
    return True, ""


def _convert_to_enum(value, enum_class: type, param_name: str):
    """Convert a value to an enum if needed (case-insensitive).

    Accepts:
    - None (returns None)
    - Enum instances of the correct type (returns as-is)
    - Strings matching enum.value (case-insensitive)
    - Strings matching enum.name (case-insensitive)

    Args:
        value: The value to convert (None, string, or already enum).
        enum_class: The enum class to convert to.
        param_name: Name of the parameter for error messages.

    Returns:
        The enum value, or None if input was None.

    Raises:
        ValueError: If conversion fails.

    """
    if value is None or isinstance(value, enum_class):
        return value

    if not isinstance(value, str):
        raise ValueError(f"{param_name} '{value}' is not a valid string or {enum_class.__name__} enum.")

    lookup_key = value.lower()

    # Try matching by enum.value first (case-insensitive); value matches take
    # precedence over name matches.
    value_map = {m.value.lower(): m for m in enum_class}
    if lookup_key in value_map:
        return value_map[lookup_key]

    # Then try matching by enum.name (case-insensitive).
    name_map = {m.name.lower(): m for m in enum_class}
    if lookup_key in name_map:
        return name_map[lookup_key]

    # Conversion failed: list both names and values in the error message.
    valid_values = sorted({m.value for m in enum_class} | {m.name for m in enum_class})
    raise ValueError(f"{param_name} '{value}' is not supported. Supported values are: {', '.join(valid_values)}.")


def infer_library_name(
    model_name_or_path: str,
    subfolder: str = "",
    revision: Optional[str] = None,
    cache_dir: str = HUGGINGFACE_HUB_CACHE,
    token: Optional[Union[bool, str]] = None,
) -> str:
    """Infer the Optimum-Intel library name for a given model.

    Falls back to ``"transformers"`` when ``sentence_transformers`` is detected.

    Args:
        model_name_or_path: The model identifier or path.
        subfolder: The subfolder within the model repository. Defaults to "".
        revision: The specific model version to use. Defaults to None (latest version).
        cache_dir: The directory to use for caching. Defaults to HUGGINGFACE_HUB_CACHE.
        token: The huggingface token to use. Defaults to None.

    Returns:
        The inferred library name.

    Raises:
        ImportError: If the optimum.intel library cannot be imported.

    """
    try:
        from optimum.intel.utils.modeling_utils import _infer_library_from_model_name_or_path
    except ImportError as e:
        raise ImportError("Please install Intel® optimum[openvino] to use OpenVINO Optimum Conversion") from e
    library_name = _infer_library_from_model_name_or_path(
        model_name_or_path=model_name_or_path,
        subfolder=subfolder,
        revision=revision,
        cache_dir=cache_dir,
        token=token,
    )
    if library_name == "sentence_transformers":
        logger.warning(
            "Library name is not specified. There are multiple possible variants: `sentence_transformers`,"
            " `transformers`. `transformers` will be selected. If you want to load your model with the"
            " `sentence-transformers` library instead, please set --library sentence_transformers"
        )
        library_name = "transformers"
    return library_name


def _compatible_type(default_val: Any, new_val: Any) -> bool:
    """Loose type check: allow ints for floats, bool as bool, etc."""
    if default_val is None:
        return True
    # bool must be checked before Real: bool is a subclass of int.
    if isinstance(default_val, bool):
        return isinstance(new_val, bool)
    if isinstance(default_val, numbers.Real) and not isinstance(default_val, bool):
        return isinstance(new_val, numbers.Real) and not isinstance(new_val, bool)
    if isinstance(default_val, str):
        return isinstance(new_val, str)
    if isinstance(default_val, (list, tuple)):
        return isinstance(new_val, (list, tuple))
    if isinstance(default_val, Mapping):
        return isinstance(new_val, Mapping)
    return True  # fall back to permissive


def apply_genai_overrides(
    defaults: MutableMapping[str, Any], overrides: Mapping[str, Any], *, path: str = ""
) -> MutableMapping[str, Any]:
    """Recursively merge *overrides* into *defaults*.

    Only keys that already exist in *defaults* are updated. Type mismatches
    are logged as warnings but still applied.

    Args:
        defaults: The original config to be updated (modified in-place).
        overrides: The config values to override.
        path: The current path within the config (used for recursive calls).

    Returns:
        The updated config with overrides applied.

    """
    for k, v in overrides.items():
        here = f"{path}.{k}" if path else k
        # Unknown keys are silently ignored so callers cannot inject
        # settings the default schema does not define.
        if k not in defaults:
            continue

        dv = defaults[k]

        # Recurse for nested dicts so partial overrides merge instead of replace.
        if isinstance(dv, Mapping) and isinstance(v, Mapping):
            apply_genai_overrides(dv, v, path=here)
            continue

        # Replace lists/tuples and scalars; warn (but still apply) on type mismatch.
        if not _compatible_type(dv, v):
            logger.warning("Type mismatch at %s", here)
        defaults[k] = v
    return defaults


def create_genai_config(model_name: str, output_path: str, config: BasePassConfig) -> None:
    """Generate ``genai_config.json`` from model config files.

    This is only for Generative AI models for which ``config.json`` and
    ``generation_config.json`` exist in *output_path*; otherwise this is a no-op.

    Args:
        model_name: Name of the ONNX model file that was generated.
        output_path: Directory containing the model and config files.
        config: Pass configuration instance (must expose ``target_device``; may
            optionally expose ``genai_config_override``).

    Returns:
        None

    Raises:
        ImportError: If onnx is not installed.

    """
    ip_conf_pth = Path(output_path) / "config.json"

    # do not create genai_config.json if config.json does not exist
    if not ip_conf_pth.exists():
        return

    ip_gen_pth = Path(output_path) / "generation_config.json"

    # do not create genai_config.json if generation_config.json does not exist
    if not ip_gen_pth.exists():
        return

    # Step 1: build the default genai_config structure; -1/"" placeholders are
    # filled in from config.json / generation_config.json below.
    genai_config: dict[str, Any] = {
        "model": {
            "bos_token_id": -1,
            "context_length": -1,
            "decoder": {
                "session_options": {
                    "log_id": "onnxruntime-genai",
                    "graph_optimization_level": "ORT_DISABLE_ALL",
                    "provider_options": [
                        {"OpenVINO": {"device_type": config.target_device.upper(), "enable_causallm": "True"}}
                    ],
                },
                "filename": "openvino_model.onnx",
                "head_size": -1,
                "hidden_size": -1,
                "inputs": {},
                "outputs": {},
                "num_attention_heads": -1,
                "num_hidden_layers": -1,
                "num_key_value_heads": -1,
            },
            "eos_token_id": -1,
            "type": "",
            "vocab_size": -1,
        },
        "search": {
            "diversity_penalty": 0.0,
            "do_sample": False,
            "early_stopping": True,
            "length_penalty": 1.0,
            "max_length": -1,
            "min_length": 0,
            "no_repeat_ngram_size": 0,
            "num_beams": 1,
            "num_return_sequences": 1,
            "past_present_share_buffer": False,
            "repetition_penalty": 1.0,
            "temperature": 1.0,
            "top_k": 1,
            "top_p": 1.0,
        },
    }

    with open(ip_conf_pth) as f:
        src_config = json.load(f)

    with open(ip_gen_pth) as f:
        src_gen_config = json.load(f)

    try:
        import onnx
    except ImportError:
        raise ImportError(
            "Please install onnx to create genai_config.json for ONNX OpenVINO IR Encapsulated model"
        ) from None

    model_path = Path(output_path) / model_name
    model = onnx.load(model_path)

    # Get input and output tensor names from the ONNX graph.
    inputs = [inp.name for inp in model.graph.input]
    outputs = [out.name for out in model.graph.output]

    genai_config["model"]["bos_token_id"] = src_config.get("bos_token_id", -1)
    genai_config["model"]["context_length"] = src_config.get("max_position_embeddings", -1)
    genai_config["model"]["decoder"]["filename"] = model_name

    # Safe head_size computation: only divide when both values are valid ints.
    num_attention_heads = src_config.get("num_attention_heads", -1)
    hidden_size = src_config.get("hidden_size", -1)
    if (
        isinstance(num_attention_heads, int)
        and isinstance(hidden_size, int)
        and num_attention_heads > 0
        and hidden_size >= 0
    ):
        genai_config["model"]["decoder"]["head_size"] = hidden_size // num_attention_heads
    else:
        if not isinstance(num_attention_heads, int):
            logger.warning("num_attention_heads is not an int: %s found in src_config", num_attention_heads)
        elif num_attention_heads <= 0:
            logger.warning("Invalid num_attention_heads (<= 0) %s found in src_config", num_attention_heads)
        if not isinstance(hidden_size, int):
            logger.warning("hidden_size is not an int: %s found in src_config", hidden_size)
        elif hidden_size < 0:
            logger.warning("Invalid hidden_size (< 0) %s found in src_config", hidden_size)
        logger.warning("Setting genai_config['model']['decoder']['head_size'] to -1")
        genai_config["model"]["decoder"]["head_size"] = -1

    genai_config["model"]["decoder"]["hidden_size"] = src_config.get("hidden_size", -1)

    # beam_idx is managed internally by the OpenVINO stateful model and must
    # not be exposed as a genai input.
    for name in inputs:
        if name != "beam_idx":
            genai_config["model"]["decoder"]["inputs"].update({name: name})

    for name in outputs:
        genai_config["model"]["decoder"]["outputs"].update({name: name})

    genai_config["model"]["decoder"]["num_attention_heads"] = src_config.get("num_attention_heads", -1)
    genai_config["model"]["decoder"]["num_hidden_layers"] = src_config.get("num_hidden_layers", -1)
    genai_config["model"]["decoder"]["num_key_value_heads"] = src_config.get("num_key_value_heads", -1)

    eos_token_id = src_gen_config.get("eos_token_id", -1)
    genai_config["model"]["eos_token_id"] = eos_token_id
    pad_token_id = src_gen_config.get("pad_token_id", None)
    if pad_token_id is not None:
        genai_config["model"]["pad_token_id"] = pad_token_id
    elif eos_token_id != -1:
        # Fall back to eos (first entry when eos is a list) as the pad token.
        genai_config["model"]["pad_token_id"] = (
            eos_token_id[0] if isinstance(eos_token_id, list) and len(eos_token_id) > 0 else eos_token_id
        )
    else:
        genai_config["model"]["pad_token_id"] = -1

    genai_config["model"]["type"] = src_config.get("model_type", "")
    genai_config["model"]["vocab_size"] = src_config.get("vocab_size", -1)

    genai_config["search"]["max_length"] = src_config.get("max_position_embeddings", -1)

    # Apply genai_config_override if the pass config exposes it
    genai_config_override = getattr(config, "genai_config_override", None)
    if isinstance(genai_config_override, dict):
        apply_genai_overrides(genai_config, genai_config_override)

    # Step 2: Write to JSON file
    output_genai_config = Path(output_path) / "genai_config.json"
    with open(output_genai_config, "w") as f:
        json.dump(genai_config, f, indent=4)
+13,8 @@ from olive.passes.olive_pass import create_pass_from_dict from olive.passes.onnx.optimum_conversion import OptimumConversion from olive.passes.openvino.compression import OpenVINOWeightCompression -from test.utils import get_hf_model +from olive.passes.openvino.optimum_intel import OpenVINOOptimumConversion +from test.utils import get_hf_model, package_version_at_least pytestmark = pytest.mark.openvino @@ -25,6 +26,10 @@ def wikitext_2_raw_v1_test(): return datasets.load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1", split="test") +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_weight_compression_hf_to_openvino(tmp_path): # imports here import numpy as np @@ -97,6 +102,10 @@ def custom_transform_func(data, tokenizer): shutil.rmtree(hf_to_ov_model.model_path) +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_weight_compression_hf_to_openvino_multi_ignore_scope(tmp_path): # imports here import numpy as np @@ -165,6 +174,10 @@ def custom_transform_func(data, tokenizer): shutil.rmtree(hf_to_ov_model.model_path) +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_weight_compression_hf_to_onnx(tmp_path): from nncf.parameters import CompressWeightsMode from nncf.quantization.advanced_parameters import GroupSizeFallbackMode @@ -209,6 +222,10 @@ def test_openvino_weight_compression_hf_to_onnx(tmp_path): shutil.rmtree(q_dir) +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_weight_compression_hf_to_onnx_multi_ignore_scope(tmp_path): from nncf.parameters import CompressWeightsMode @@ -220,7 +237,7 @@ def test_openvino_weight_compression_hf_to_onnx_multi_ignore_scope(tmp_path): "extra_args": {"use_onnx": True, 
def _run_ov_to_ov_weight_compression(tmp_path, weight_compression_config):
    """Convert tiny-random Llama to OpenVINO IR, run OpenVINOWeightCompression on it,
    assert the compressed IR files exist, and clean up all produced artifacts.
    """
    # setup - first convert HF model to OpenVINO
    input_hf_model = get_hf_model("hf-internal-testing/tiny-random-LlamaForCausalLM")
    p_convert = create_pass_from_dict(
        OpenVINOOptimumConversion,
        {"extra_args": {"disable_convert_tokenizer": True}},
        disable_search=True,
    )

    # create output folder for conversion
    output_folder_convert = str(Path(tmp_path) / "openvino_convert")
    input_ov_model = p_convert.run(input_hf_model, output_folder_convert)

    # setup weight compression pass
    p = create_pass_from_dict(
        OpenVINOWeightCompression,
        weight_compression_config,
        disable_search=True,
        accelerator_spec=AcceleratorSpec("cpu", "OpenVINOExecutionProvider"),
    )

    # create output folder and execute
    output_folder = str(Path(tmp_path) / "openvino_wc_ov_to_ov")
    ov_to_ov_model = p.run(input_ov_model, output_folder)

    # the compressed IR must contain both the XML topology and the bin weights
    xml_file = Path(ov_to_ov_model.model_path) / "openvino_model.xml"
    bin_file = Path(ov_to_ov_model.model_path) / "openvino_model.bin"
    assert xml_file.exists()
    assert xml_file.is_file()
    assert bin_file.exists()
    assert bin_file.is_file()

    # cleanup
    shutil.rmtree(output_folder_convert)
    if Path(ov_to_ov_model.model_path).exists():
        shutil.rmtree(ov_to_ov_model.model_path)


@pytest.mark.skipif(
    not package_version_at_least("optimum", "2.1.0"),
    reason="Requires optimum >= 2.1.0",
)
def test_openvino_weight_compression_openvino_to_openvino(tmp_path):
    """Test weight compression on an OpenVINO model."""
    from nncf.parameters import CompressWeightsMode
    from nncf.quantization.advanced_parameters import GroupSizeFallbackMode

    openvino_weight_compression_config = {
        "compress_config": {"mode": CompressWeightsMode.INT4_SYM, "ratio": 1.0},
        "extra_args": {
            "advanced_compression_parameters": {
                "group_size_fallback_mode": GroupSizeFallbackMode.IGNORE,
            },
        },
    }
    _run_ov_to_ov_weight_compression(tmp_path, openvino_weight_compression_config)


@pytest.mark.skipif(
    not package_version_at_least("optimum", "2.1.0"),
    reason="Requires optimum >= 2.1.0",
)
def test_openvino_weight_compression_openvino_to_openvino_multi_ignore_scope(tmp_path):
    """Test weight compression on an OpenVINO model with multiple ignored scopes."""
    from nncf.parameters import CompressWeightsMode

    openvino_weight_compression_config = {
        "compress_config": {"mode": CompressWeightsMode.INT4_SYM, "ratio": 1.0},
        "ignored_scope": [["Gather", "Add", "MatMul"], [".*Mul.*"]],
        "ignored_scope_type": ["types", "patterns"],
    }
    _run_ov_to_ov_weight_compression(tmp_path, openvino_weight_compression_config)
olive.passes.olive_pass import create_pass_from_dict from olive.passes.openvino.optimum_intel import OpenVINOOptimumConversion -from test.utils import get_hf_model +from test.utils import get_hf_model, package_version_at_least pytestmark = pytest.mark.openvino +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_optimum_conversion_pass_convert_with_tokenizers(tmp_path): # setup input_hf_model = get_hf_model("hf-internal-testing/tiny-random-PhiForCausalLM") @@ -37,6 +41,10 @@ def test_openvino_optimum_conversion_pass_convert_with_tokenizers(tmp_path): assert bin_file.is_file() +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_optimum_conversion_pass_convert_without_tokenizers(tmp_path): # setup input_hf_model = get_hf_model("hf-internal-testing/tiny-random-PhiForCausalLM") @@ -61,6 +69,10 @@ def test_openvino_optimum_conversion_pass_convert_without_tokenizers(tmp_path): assert bin_file.is_file() +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_optimum_conversion_pass_convert_with_weight_compression(tmp_path): # setup input_hf_model = get_hf_model("hf-internal-testing/tiny-random-PhiForCausalLM") @@ -94,12 +106,16 @@ def test_openvino_optimum_conversion_pass_convert_with_weight_compression(tmp_pa assert bin_file.is_file() +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_optimum_conversion_pass_convert_with_quantization(tmp_path): # setup input_hf_model = get_hf_model("hf-internal-testing/tiny-random-clip-zero-shot-image-classification") openvino_optimum_conversion_config = { "extra_args": {"device": "npu"}, - "ov_quant_config": {"weight_format": "int8", "dataset": "auto"}, + "ov_quant_config": {"weight_format": "int8"}, } p = 
def package_version_at_least(package_name: str, min_ver: str) -> bool:
    """Return True when *package_name* is installed at version >= *min_ver*.

    Designed for ``pytest.mark.skipif`` conditions: a missing ``packaging``
    dependency, an uninstalled package, or an unparsable version string all
    yield ``False`` instead of raising during test collection.
    """
    try:
        from importlib.metadata import PackageNotFoundError
        from importlib.metadata import version as pkg_version

        from packaging.version import InvalidVersion
        from packaging.version import parse as parse_version
    except ImportError:
        # packaging (or metadata support) unavailable: report "not satisfied".
        return False

    try:
        installed = parse_version(pkg_version(package_name))
        required = parse_version(min_ver)
    except (PackageNotFoundError, InvalidVersion):
        return False
    return installed >= required