From 530a255d490c1a322c97f0182e829346336c1436 Mon Sep 17 00:00:00 2001 From: Anirudh Swaminathan Date: Wed, 18 Feb 2026 15:15:45 -0800 Subject: [PATCH] =?UTF-8?q?Update=20Intel=C2=AE=20optimum=20passes=20to=20?= =?UTF-8?q?latest=20optimum?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update Intel® compression and optimum conversion passes to utilize latest optimum version APIs - [x] Add unit tests for this change. - [x] Make sure all tests can pass. - [x] Update documents if necessary. - [x] Lint and apply fixes to your code by running `lintrunner -a` - [x] Is this a user-facing change? If yes, give a description of this change to be included in the release notes. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../features/ihv-integration/openvino.md | 8 +- olive/olive_config.json | 6 +- olive/passes/openvino/compression.py | 702 ++++-------------- olive/passes/openvino/encapsulation.py | 173 +---- olive/passes/openvino/optimum_intel.py | 475 ++++++------ olive/passes/openvino/ov_utils.py | 370 +++++++++ olive/passes/openvino/quantization.py | 7 +- .../openvino/test_openvino_compression.py | 147 +++- .../test_openvino_optimum_conversion.py | 24 +- test/requirements-test.txt | 4 +- test/utils.py | 21 + 11 files changed, 977 insertions(+), 960 deletions(-) create mode 100644 olive/passes/openvino/ov_utils.py diff --git a/docs/source/features/ihv-integration/openvino.md b/docs/source/features/ihv-integration/openvino.md index 6dea33684e..373ea127ea 100644 --- a/docs/source/features/ihv-integration/openvino.md +++ b/docs/source/features/ihv-integration/openvino.md @@ -23,18 +23,18 @@ pip install olive-ai[openvino] ### Option 2: Install OpenVINO Runtime and OpenVINO Development Tools from Pypi ```bash -pip install openvino>=2025.3.0 -pip install nncf>=2.18.0 +pip install openvino>=2025.4.1 +pip install nncf>=2.19.0 pip install onnxruntime-openvino ``` ### Install Optimum Intel® for 
Generative AI Workloads ```bash -pip install optimum[openvino]<=1.24.0 +pip install optimum[openvino]>=2.1.0 ``` -More detailed instructions are available at [Optimum Intel® Installation Instructions](https://huggingface.co/docs/optimum/main/en/intel/installation) +More detailed instructions are available at [Optimum Installation Instructions](https://huggingface.co/docs/optimum/installation) and at [Optimum Intel® Installation Instructions](https://huggingface.co/docs/optimum/main/en/intel/installation) ## Model Conversion diff --git a/olive/olive_config.json b/olive/olive_config.json index 77debc1818..2748c39101 100644 --- a/olive/olive_config.json +++ b/olive/olive_config.json @@ -646,10 +646,10 @@ "diffusers": [ "accelerate>=0.30.0", "peft", "diffusers" ], "nvmo": [ "nvidia-modelopt[onnx]" ], "openvino": [ - "openvino>=2025.3.0", - "nncf>=2.18.0", + "openvino>=2025.4.1", + "nncf>=2.19.0", "numpy<2.0", - "optimum[openvino]<=1.24", + "optimum[openvino]>=2.1.0", "onnxruntime-openvino" ], "optimum": [ "optimum" ], diff --git a/olive/passes/openvino/compression.py b/olive/passes/openvino/compression.py index 689b11892a..93cfc40a9e 100644 --- a/olive/passes/openvino/compression.py +++ b/olive/passes/openvino/compression.py @@ -7,238 +7,26 @@ from copy import deepcopy from functools import partial from pathlib import Path -from typing import Callable, Optional, Union - -from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from typing import Any, Callable, Optional, Union from olive.common.config_utils import validate_config -from olive.common.utils import StrEnumBase from olive.data.config import DataConfig from olive.hardware.accelerator import AcceleratorSpec, Device from olive.model.handler import CompositeModelHandler, HfModelHandler, ONNXModelHandler, OpenVINOModelHandler from olive.passes import Pass +from olive.passes.openvino.ov_utils import ( + IgnoreScopeTypeEnum, + OVOptimumLibrary, + _convert_to_enum, + _validate_enum_value, + create_genai_config, 
+ infer_library_name, +) from olive.passes.pass_config import BasePassConfig, ParamCategory, PassConfigParam, get_user_script_data_config logger = logging.getLogger(__name__) -class IgnoreScopeTypeEnum(StrEnumBase): - NAMES = "names" - TYPES = "types" - PATTERNS = "patterns" - - -class OVOptimumLibrary(StrEnumBase): - TRANSFORMERS = "transformers" - DIFFUSERS = "diffusers" - TIMM = "timm" - SENTENCE_TRANSFORMERS = "sentence_transformers" - OPEN_CLIP = "open_clip" - - -def infer_task( - task, - model_name_or_path, - subfolder: str = "", - revision: Optional[str] = None, - cache_dir: str = HUGGINGFACE_HUB_CACHE, - token: Optional[Union[bool, str]] = None, - library_name: Optional[str] = None, -): - try: - from optimum.exporters import TasksManager - except Exception as e: - raise ImportError("Unable to import optimum packages:", e) from None - - try: - from requests.exceptions import ConnectionError as RequestsConnectionError - except Exception as e: - raise ImportError("Unable to import ConnectionError packages:", e) from None - - task = TasksManager.map_from_synonym(task) - if task == "auto": - if library_name == "open_clip": - task = "zero-shot-image-classification" - else: - try: - task = TasksManager._infer_task_from_model_name_or_path( # pylint: disable=W0212 - model_name_or_path=model_name_or_path, - subfolder=subfolder, - revision=revision, - cache_dir=cache_dir, - token=token, - library_name=library_name, - ) - except KeyError as e: - raise KeyError( - f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) from None - except RequestsConnectionError as e: - raise RequestsConnectionError( - f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. 
Detailed error: {e}" - ) from None - return task - - -def maybe_load_preprocessors( - src_name_or_path: Union[str, Path], subfolder: str = "", trust_remote_code: bool = False -) -> list: - try: - from transformers import AutoFeatureExtractor, AutoImageProcessor, AutoProcessor, AutoTokenizer - except Exception as e: - raise ImportError("Unable to import transformers packages: ", e) from None - - preprocessors = [] - try: - preprocessors.append( - AutoTokenizer.from_pretrained(src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code) - ) - except Exception as e: - logger.warning("Could not load tokenizer using specified model ID or path.\n Exception: %s", e) - - try: - preprocessors.append( - AutoProcessor.from_pretrained(src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code) - ) - except Exception as e: - logger.warning("Could not load processor using specified model ID or path.\n Exception: %s", e) - - try: - preprocessors.append( - AutoFeatureExtractor.from_pretrained( - src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code - ) - ) - except Exception as e: - logger.warning("Could not load feature extractor using specified model ID or path.\n Exception: %s", e) - - try: - preprocessors.append( - AutoImageProcessor.from_pretrained( - src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code - ) - ) - except Exception as e: - logger.warning("Could not load image processor using specified model ID or path.\n Exception: %s", e) - - return preprocessors - - -def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None): - try: - from optimum.exporters.openvino.convert import export_tokenizer - except Exception as e: - raise ImportError("Unable to import optimum Intel® package:", e) from None - - try: - from transformers import PreTrainedTokenizerBase - except Exception as e: - raise ImportError("Unable to import transformers packages:", e) from None - 
- try: - from optimum.intel.utils.import_utils import is_openvino_tokenizers_available - except Exception as e: - raise ImportError("openvino tokenizers unavailable :", e) from None - - if is_openvino_tokenizers_available(): - if library_name != "diffusers" and preprocessors: - tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None) - if tokenizer: - try: - export_tokenizer(tokenizer, output, task=task) - except Exception as exception: - logger.warning( - "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer models won't be generated. Exception: %s", - exception, - ) - elif model: - for tokenizer_name in ("tokenizer", "tokenizer_2", "tokenizer_3"): - tokenizer = getattr(model, tokenizer_name, None) - if tokenizer: - export_tokenizer(tokenizer, output / tokenizer_name, task=task) - else: - logger.warning("Tokenizer won't be converted.") - - -def _validate_enum_value(value, enum_class: type, param_name: str) -> tuple[bool, str]: - """Validate that a value can be converted to an enum (case-insensitive). - - Args: - value: The value to validate (None, string, or already enum). - enum_class: The enum class to validate against. - param_name: Name of the parameter for error messages. - - Returns: - Tuple of (is_valid, error_message). error_message is empty if valid. - - """ - if value is None or isinstance(value, enum_class): - return True, "" - - if not isinstance(value, str): - return False, f"{param_name} '{value}' is not a valid string or {enum_class.__name__} enum." 
- - lookup_key = value.lower() - - # Try matching by enum.value first (case-insensitive) - value_map = {m.value.lower(): m for m in enum_class} - if lookup_key in value_map: - return True, "" - - # Try matching by enum.name (case-insensitive) - name_map = {m.name.lower(): m for m in enum_class} - if lookup_key in name_map: - return True, "" - - # Validation failed - valid_values = sorted(set([m.value for m in enum_class] + [m.name for m in enum_class])) - return False, f"{param_name} '{value}' is not supported. Supported values are: {', '.join(valid_values)}." - - -def _convert_to_enum(value, enum_class: type, param_name: str): - """Convert a value to an enum if needed (case-insensitive). - - Accepts: - - None (returns None) - - Enum instances of the correct type (returns as-is) - - Strings matching enum.value (case-insensitive) - - Strings matching enum.name (case-insensitive) - - Args: - value: The value to convert (None, string, or already enum). - enum_class: The enum class to convert to. - param_name: Name of the parameter for error messages. - - Returns: - The enum value, or None if input was None. - - Raises: - ValueError: If conversion fails. - - """ - if value is None or isinstance(value, enum_class): - return value - - if not isinstance(value, str): - raise ValueError(f"{param_name} '{value}' is not a valid string or {enum_class.__name__} enum.") - - lookup_key = value.lower() - - # Try matching by enum.value first (case-insensitive) - value_map = {m.value.lower(): m for m in enum_class} - if lookup_key in value_map: - return value_map[lookup_key] - - # Try matching by enum.name (case-insensitive) - name_map = {m.name.lower(): m for m in enum_class} - if lookup_key in name_map: - return name_map[lookup_key] - - # Conversion failed - valid_values = sorted(set([m.value for m in enum_class] + [m.name for m in enum_class])) - raise ValueError(f"{param_name} '{value}' is not supported. 
Supported values are: {', '.join(valid_values)}.") - - def _convert_compress_config_enums(compress_config: dict) -> dict: """Convert compress_config enum values from strings to enum instances. @@ -651,14 +439,79 @@ def _get_advanced_compression_params(config): return advanced_params + def _apply_compression( + self, + model_to_compress: Any, + config: type[BasePassConfig], + output_model_path: str, + tokenizer: Optional[Any] = None, + ) -> Any: + """Apply NNCF weight compression to a model. + + Args: + model_to_compress: The model object to compress (OpenVINO model or ONNX model). + config: The pass configuration. + output_model_path: Path where the output model will be saved. + tokenizer: Optional tokenizer for dataset transform (used in HF path). + + Returns: + The compressed model object from nncf.compress_weights(). + + Raises: + ImportError: If nncf is not installed. + + """ + try: + import nncf + from nncf.onnx.quantization.backend_parameters import BackendParameters + except ImportError: + raise ImportError("Please install olive-ai[openvino] to use OpenVINO NNCF") from None + + # get the weight compression dataset + compression_dataset = self._get_nncf_dataset(config, tokenizer) + + # get the extra params + extra_params = self._get_extra_params(config) + + # local copy of compress_config and ensure enum values are converted + # (handles case where validate_config was bypassed, e.g., in unit tests) + compress_config = deepcopy(config.compress_config) if config.compress_config else {} + compress_config = _convert_compress_config_enums(compress_config) + + # append extra params to compress config + compress_config.update(extra_params) + + # get nncf.AdvancedCompressionParameters if any + advanced_params = None + adv_par = self._get_advanced_compression_params(config) + if adv_par is not None: + # Handle external_dir for backend_params - add output path at runtime + if adv_par.get("_external_dir") is not None: + # Create or update backend_params with external 
data dir + if adv_par.get("backend_params") is None: + adv_par["backend_params"] = {BackendParameters.EXTERNAL_DATA_DIR: output_model_path} + else: + adv_par["backend_params"][BackendParameters.EXTERNAL_DATA_DIR] = output_model_path + # Remove the temporary _external_dir key + adv_par.pop("_external_dir") + + advanced_params = nncf.AdvancedCompressionParameters(**adv_par) + + # perform weight compression + return nncf.compress_weights( + model_to_compress, dataset=compression_dataset, advanced_parameters=advanced_params, **compress_config + ) + def _run_for_config( self, - model: Union[HfModelHandler, ONNXModelHandler], + model: Union[HfModelHandler, ONNXModelHandler, OpenVINOModelHandler], config: type[BasePassConfig], output_model_path: str, ) -> Union[OpenVINOModelHandler, ONNXModelHandler, CompositeModelHandler]: - if not isinstance(model, (HfModelHandler, ONNXModelHandler)): - raise TypeError("OpenVINOWeightCompression pass can only be applied to Hugging Face or ONNX models") + if not isinstance(model, (HfModelHandler, ONNXModelHandler, OpenVINOModelHandler)): + raise TypeError( + "OpenVINOWeightCompression pass can only be applied to Hugging Face, ONNX, or OpenVINO models" + ) if config.reuse_cache: model_name_path = Path(model.model_path) @@ -680,6 +533,8 @@ def _run_for_config( output_model = self._run_hf_pass(model, config, output_model_path) elif isinstance(model, ONNXModelHandler): output_model = self._run_onnx_pass(model, config, output_model_path) + elif isinstance(model, OpenVINOModelHandler): + output_model = self._run_openvino_pass(model, config, output_model_path) if config.reuse_cache: if os.path.exists(model_name_path): @@ -696,10 +551,7 @@ def _run_hf_pass( output_model_path: str, ) -> Union[OpenVINOModelHandler, CompositeModelHandler]: try: - import nncf - from nncf.onnx.quantization.backend_parameters import BackendParameters from optimum.exporters.openvino import main_export as export_optimum_intel - from optimum.intel.utils.modeling_utils 
import _infer_library_from_model_name_or_path except ImportError: raise ImportError( "Please install Intel® optimum[openvino] to use NNCF for weight compression on HF models" @@ -708,123 +560,34 @@ def _run_hf_pass( # local copy of extra_args extra_args = deepcopy(config.extra_args) if config.extra_args else {} - # local copy of compress_config and ensure enum values are converted - # (handles case where validate_config was bypassed, e.g., in unit tests) - compress_config = deepcopy(config.compress_config) if config.compress_config else {} - compress_config = _convert_compress_config_enums(compress_config) - # set the library name for the HF Model if extra_args.get("library") is None: - lib_name = _infer_library_from_model_name_or_path(model.model_name_or_path) - if lib_name == "sentence_transformers": - logger.warning( - "Library is not specified. " - "There are multiple possible variants: `sentence_transformers`, `transformers`. " - "`transformers` will be selected. " - "If you want to load your model with the `sentence-transformers` library instead, " - "Please set it as sentence_transformers in extra_args dictionary under 'library' key" - ) - lib_name = "transformers" - extra_args["library"] = lib_name + lib_name = infer_library_name(model.model_name_or_path) else: lib_name = extra_args["library"] - # infer task - task = infer_task(extra_args.get("task", "auto"), model.model_name_or_path, library_name=lib_name) - - # model - if lib_name == "diffusers": - try: - from diffusers import DiffusionPipeline - except ImportError: - raise ImportError("Please install diffusers to use OpenVINO with Diffusers models.") from None - - diffusers_config = DiffusionPipeline.load_config(model.model_name_or_path) - class_name = diffusers_config.get("_class_name", None) - - if class_name == "LatentConsistencyModelPipeline": - from optimum.intel import OVLatentConsistencyModelPipeline - - model_cls = OVLatentConsistencyModelPipeline - - elif class_name == 
"StableDiffusionXLPipeline": - from optimum.intel import OVStableDiffusionXLPipeline - - model_cls = OVStableDiffusionXLPipeline - elif class_name == "StableDiffusionPipeline": - from optimum.intel import OVStableDiffusionPipeline - - model_cls = OVStableDiffusionPipeline - elif class_name == "StableDiffusion3Pipeline": - from optimum.intel import OVStableDiffusion3Pipeline - - model_cls = OVStableDiffusion3Pipeline - elif class_name == "FluxPipeline": - from optimum.intel import OVFluxPipeline - - model_cls = OVFluxPipeline - elif class_name == "SanaPipeline": - from optimum.intel import OVSanaPipeline - - model_cls = OVSanaPipeline - else: - raise NotImplementedError(f"{class_name} isn't supported.") - - output_model = model_cls.from_pretrained( - model.model_name_or_path, export=True, load_in_8bit=False, compile=False - ) - if not extra_args.get("disable_convert_tokenizer", False): - maybe_convert_tokenizers(lib_name, output_model_path, model, task=task) - elif (task.startswith("text-generation") or "automatic-speech-recognition" in task) or ( - task == "image-text-to-text" - ): - if task.startswith("text-generation"): - from optimum.intel import OVModelForCausalLM - - model_cls = OVModelForCausalLM - elif task == "image-text-to-text": - from optimum.intel import OVModelForVisualCausalLM + # prepare extra args for export + extra_args["stateful"] = not extra_args.get("disable_stateful", False) + extra_args.pop("disable_stateful", None) + extra_args["convert_tokenizer"] = not extra_args.get("disable_convert_tokenizer", False) + extra_args.pop("disable_convert_tokenizer", None) + extra_args["library_name"] = lib_name + extra_args.pop("library", None) + + # export HF model to OpenVINO format + export_optimum_intel( + model.model_name_or_path, + output_model_path, + **extra_args, + ) - model_cls = OVModelForVisualCausalLM - else: - from optimum.intel import OVModelForSpeechSeq2Seq - - model_cls = OVModelForSpeechSeq2Seq - - output_model = model_cls.from_pretrained( - 
model.model_name_or_path, - export=True, - load_in_8bit=False, - compile=False, - stateful=not extra_args.get("disable_stateful", False), - trust_remote_code=extra_args.get("trust_remote_code", False), - variant=extra_args.get("variant", None), - cache_dir=extra_args.get("cache_dir", HUGGINGFACE_HUB_CACHE), - ) + # load the exported OpenVINO model + from optimum.intel import OVModelForCausalLM - preprocessors = maybe_load_preprocessors( - model.model_name_or_path, trust_remote_code=extra_args.get("trust_remote_code", False) - ) - if not extra_args.get("disable_convert_tokenizer", False): - maybe_convert_tokenizers(lib_name, output_model_path, preprocessors=preprocessors, task=task) - - else: - extra_args["stateful"] = not extra_args.get("disable_stateful", False) - extra_args.pop("disable_stateful", False) - extra_args["convert_tokenizer"] = not extra_args.get("disable_convert_tokenizer", False) - extra_args.pop("disable_convert_tokenizer", False) - extra_args["library_name"] = lib_name - extra_args.pop("library", None) - export_optimum_intel( - model.model_name_or_path, - output_model_path, - **extra_args, - ) + output_model = OVModelForCausalLM.from_pretrained(output_model_path, compile=False) # redirect to ONNXModelHandler if extra_args requests ONNX processing # this is also only for CausalLM models - from optimum.intel import OVModelForCausalLM - if config.extra_args and config.extra_args.get("use_onnx") and isinstance(output_model, OVModelForCausalLM): try: from optimum.onnxruntime import ORTModelForCausalLM @@ -862,38 +625,35 @@ def _run_hf_pass( ) from None tokenizer = AutoTokenizer.from_pretrained(model.model_name_or_path) - # get the weight compression dataset - compression_dataset = self._get_nncf_dataset(config, tokenizer) - - # get the extra params - extra_params = self._get_extra_params(config) + # perform weight compression using shared compression logic + output_model.model = self._apply_compression(output_model.model, config, output_model_path, 
tokenizer) - # append extra params to compress config - compress_config.update(extra_params) + # save compressed model to temp directory to avoid file locking issues, + # then copy back to output_model_path + import gc + import shutil + import tempfile - # get nncf.AdvancedCompressionParameters if any - advanced_params = None - adv_par = self._get_advanced_compression_params(config) - if adv_par is not None: - # Handle external_dir for backend_params - add output path at runtime - if adv_par.get("_external_dir") is not None: - # Create or update backend_params with external data dir - if adv_par.get("backend_params") is None: - adv_par["backend_params"] = {BackendParameters.EXTERNAL_DATA_DIR: output_model_path} - else: - adv_par["backend_params"][BackendParameters.EXTERNAL_DATA_DIR] = output_model_path - # Remove the temporary _external_dir key - adv_par.pop("_external_dir") - - advanced_params = nncf.AdvancedCompressionParameters(**adv_par) - - # perform weight compression - output_model.model = nncf.compress_weights( - output_model.model, dataset=compression_dataset, advanced_parameters=advanced_params, **compress_config - ) - - # save to output_model_path - output_model.save_pretrained(output_model_path) + temp_dir = None + try: + temp_dir = tempfile.mkdtemp(prefix="olive_ov_compress_") + output_model.save_pretrained(temp_dir) + + # release model to free file handles before copying + del output_model + gc.collect() + + # copy all files from temp_dir back to output_model_path + for item in Path(temp_dir).iterdir(): + dest = Path(output_model_path) / item.name + if item.is_file(): + shutil.copy2(item, dest) + elif item.is_dir(): + shutil.copytree(item, dest, dirs_exist_ok=True) + finally: + # clean up temp directory + if temp_dir is not None: + shutil.rmtree(temp_dir, ignore_errors=True) # check the exported components exported_models = [name.stem for name in Path(output_model_path).iterdir() if name.suffix == ".xml"] @@ -959,9 +719,7 @@ def _run_onnx_pass( 
output_model_path: str, ) -> ONNXModelHandler: try: - import nncf import onnx - from nncf.onnx.quantization.backend_parameters import BackendParameters except ImportError: raise ImportError( "Please install Intel® NNCF and ONNX to use nncf.compress_weights() on ONNX models" @@ -975,41 +733,8 @@ def _run_onnx_pass( if loaded_model.opset_import[0].version != target_opset: loaded_model = onnx.version_converter.convert_version(loaded_model, target_opset) - # local copy of compress_config and ensure enum values are converted - # (handles case where validate_config was bypassed, e.g., in unit tests) - compress_config = deepcopy(config.compress_config) if config.compress_config else {} - compress_config = _convert_compress_config_enums(compress_config) - - # get the weight compression dataset - compression_dataset = self._get_nncf_dataset(config) - - # get the extra params - extra_params = self._get_extra_params(config) - - # append extra params to compress config - compress_config.update(extra_params) - - # get nncf.AdvancedCompressionParameters if any - advanced_params = None - adv_par = self._get_advanced_compression_params(config) - if adv_par is not None: - # Handle external_dir for backend_params - add output path at runtime - if adv_par.get("_external_dir") is not None: - # Create or update backend_params with external data dir - # Note: BackendParameters is already imported from nncf.onnx.quantization.backend_parameters - if adv_par.get("backend_params") is None: - adv_par["backend_params"] = {BackendParameters.EXTERNAL_DATA_DIR: output_model_path} - else: - adv_par["backend_params"][BackendParameters.EXTERNAL_DATA_DIR] = output_model_path - # Remove the temporary _external_dir key - adv_par.pop("_external_dir") - - advanced_params = nncf.AdvancedCompressionParameters(**adv_par) - - # perform weight compression - output_model = nncf.compress_weights( - loaded_model, dataset=compression_dataset, advanced_parameters=advanced_params, **compress_config - ) + # perform 
weight compression using shared compression logic + output_model = self._apply_compression(loaded_model, config, output_model_path) # save to output_model_path model_name = Path(model.model_path).name.replace(".onnx", "_compressed.onnx") @@ -1024,147 +749,44 @@ def _run_onnx_pass( return ONNXModelHandler(model_path=output_model_path) + def _run_openvino_pass( + self, + model: OpenVINOModelHandler, + config: type[BasePassConfig], + output_model_path: str, + ) -> OpenVINOModelHandler: + """Run weight compression on an OpenVINO model. -def create_genai_config(model_name: str, output_path: str, config: type[BasePassConfig]) -> None: - """Generate the genai_config.json from the model config files. + Args: + model: The OpenVINO model handler. + config: The pass configuration. + output_model_path: Path where the output model will be saved. - This is only for Generative AI models for which the config.json and generation_config.json files exist - Arguments: - @param model_name: name of model ONNX file that is generated - @param output_path: path to the output directory where the genai_config.json file will be created - @return: None - """ - ip_conf_pth = Path(output_path) / "config.json" - - # do not create genai_config.json if config.json does not exist - if not ip_conf_pth.exists(): - return - - ip_gen_pth = Path(output_path) / "generation_config.json" - - # do not create genai_config.json if generation_config.json does not exist - if not ip_gen_pth.exists(): - return - - # Step 1: Create your data structure - genai_config = { - "model": { - "bos_token_id": -1, - "context_length": -1, - "decoder": { - "session_options": { - "log_id": "onnxruntime-genai", - "graph_optimization_level": "ORT_DISABLE_ALL", - "provider_options": [ - {"OpenVINO": {"device_type": config.target_device.upper(), "enable_causallm": "True"}} - ], - }, - "filename": "openvino_model.onnx", - "head_size": -1, - "hidden_size": -1, - "inputs": {}, - "outputs": {}, - "num_attention_heads": -1, - 
"num_hidden_layers": -1, - "num_key_value_heads": -1, - }, - "eos_token_id": -1, - "type": "", - "vocab_size": -1, - }, - "search": { - "diversity_penalty": 0.0, - "do_sample": False, - "early_stopping": True, - "length_penalty": 1.0, - "max_length": -1, - "min_length": 0, - "no_repeat_ngram_size": 0, - "num_beams": 1, - "num_return_sequences": 1, - "past_present_share_buffer": False, - "repetition_penalty": 1.0, - "temperature": 1.0, - "top_k": 1, - "top_p": 1.0, - }, - } - - import json - - with open(ip_conf_pth) as f: - src_config = json.load(f) - - with open(ip_gen_pth) as f: - src_gen_config = json.load(f) + Returns: + OpenVINOModelHandler for the compressed model. - try: - import onnx - except ImportError: - raise ImportError( - "Please install onnx to create genai_config.json for ONNX OpenVINO IR Encapsulated model" - ) from None - - model_path = Path(output_path) / model_name - model = onnx.load(model_path) - - # Get input and output tensor names - inputs = [inp.name for inp in model.graph.input] - outputs = [out.name for out in model.graph.output] - - genai_config["model"]["bos_token_id"] = src_config.get("bos_token_id", -1) - genai_config["model"]["context_length"] = src_config.get("max_position_embeddings", -1) - genai_config["model"]["decoder"]["filename"] = model_name - num_attention_heads = src_config.get("num_attention_heads", -1) - hidden_size = src_config.get("hidden_size", -1) - if ( - isinstance(num_attention_heads, int) - and isinstance(hidden_size, int) - and num_attention_heads > 0 - and hidden_size >= 0 - ): - genai_config["model"]["decoder"]["head_size"] = hidden_size // num_attention_heads - else: - if not isinstance(num_attention_heads, int): - logger.warning("num_attention_heads is not an int: %s found in src_config", num_attention_heads) - elif num_attention_heads <= 0: - logger.warning("Invalid num_attention_heads (<= 0) %s found in src_config", num_attention_heads) - if not isinstance(hidden_size, int): - logger.warning("hidden_size is 
not an int: %s found in src_config", hidden_size) - elif hidden_size < 0: - logger.warning("Invalid hidden_size (< 0) %s found in src_config", hidden_size) - logger.warning("Setting genai_config['model']['decoder']['head_size'] to -1") - genai_config["model"]["decoder"]["head_size"] = -1 - genai_config["model"]["decoder"]["hidden_size"] = src_config.get("hidden_size", -1) - - for name in inputs: - if name != "beam_idx": - genai_config["model"]["decoder"]["inputs"].update({name: name}) - - for name in outputs: - genai_config["model"]["decoder"]["outputs"].update({name: name}) - - genai_config["model"]["decoder"]["num_attention_heads"] = src_config.get("num_attention_heads", -1) - genai_config["model"]["decoder"]["num_hidden_layers"] = src_config.get("num_hidden_layers", -1) - genai_config["model"]["decoder"]["num_key_value_heads"] = src_config.get("num_key_value_heads", -1) - - eos_token_id = src_gen_config.get("eos_token_id", -1) - genai_config["model"]["eos_token_id"] = eos_token_id - pad_token_id = src_gen_config.get("pad_token_id", None) - if pad_token_id is not None: - genai_config["model"]["pad_token_id"] = pad_token_id - elif eos_token_id != -1: - genai_config["model"]["pad_token_id"] = ( - eos_token_id[0] if isinstance(eos_token_id, list) and len(eos_token_id) > 0 else eos_token_id - ) - else: - genai_config["model"]["pad_token_id"] = -1 - genai_config["model"]["type"] = src_config.get("model_type", "") - genai_config["model"]["vocab_size"] = src_config.get("vocab_size", -1) + Raises: + ImportError: If openvino is not installed. 
+ + """ + try: + import openvino as ov + except ImportError: + raise ImportError("Please install openvino to use OpenVINO weight compression") from None + + # load the OpenVINO model + core = ov.Core() + model_config = model.model_config + loaded_model = core.read_model(model_config["model"]) + + # perform weight compression using shared compression logic + compressed_model = self._apply_compression(loaded_model, config, output_model_path) - genai_config["search"]["max_length"] = src_config.get("max_position_embeddings", -1) + # save the compressed model + output_dir = Path(output_model_path) + output_dir.mkdir(parents=True, exist_ok=True) + model_name = model_config["model_name"] + output_xml_path = output_dir / f"{model_name}.xml" + ov.save_model(compressed_model, output_xml_path) - # Step 2: Write to JSON file - output_genai_config = Path(output_path) / "genai_config.json" - with open(output_genai_config, "w") as f: - json.dump(genai_config, f, indent=4) + return OpenVINOModelHandler(model_path=output_model_path) diff --git a/olive/passes/openvino/encapsulation.py b/olive/passes/openvino/encapsulation.py index f052f88b50..c8e24a2b37 100644 --- a/olive/passes/openvino/encapsulation.py +++ b/olive/passes/openvino/encapsulation.py @@ -3,11 +3,9 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- import logging -import numbers import os -from collections.abc import Mapping, MutableMapping from pathlib import Path -from typing import Any, ClassVar, Union +from typing import ClassVar, Union import onnx.helper as helper from onnx import TensorProto, save @@ -16,6 +14,7 @@ from olive.hardware.accelerator import AcceleratorSpec, Device from olive.model import ONNXModelHandler, OpenVINOModelHandler from olive.passes import Pass +from olive.passes.openvino.ov_utils import create_genai_config from olive.passes.pass_config import BasePassConfig, PassConfigParam logger = logging.getLogger(__name__) @@ -261,171 +260,3 @@ def extract_shape_list(shape, config, prefix: str = "input_0_") -> list: else: shape_list.append(-1) return shape_list - - -def _compatible_type(default_val: Any, new_val: Any) -> bool: - """Loose type check: allow ints for floats, bool as bool, etc.""" - if default_val is None: - return True - if isinstance(default_val, bool): - return isinstance(new_val, bool) - if isinstance(default_val, numbers.Real) and not isinstance(default_val, bool): - return isinstance(new_val, numbers.Real) and not isinstance(new_val, bool) - if isinstance(default_val, str): - return isinstance(new_val, str) - if isinstance(default_val, (list, tuple)): - return isinstance(new_val, (list, tuple)) - if isinstance(default_val, Mapping): - return isinstance(new_val, Mapping) - return True # fall back to permissive - - -def apply_genai_overrides( - defaults: MutableMapping[str, Any], overrides: Mapping[str, Any], *, path: str = "" -) -> MutableMapping[str, Any]: - """Recursively merge `overrides` into `defaults`.""" - for k, v in overrides.items(): - here = f"{path}.{k}" if path else k - if k not in defaults: - continue - - dv = defaults[k] - - # Recurse for dicts - if isinstance(dv, Mapping) and isinstance(v, Mapping): - apply_genai_overrides(dv, v, path=here) - continue - - # Replace lists/tuples and 
scalars - if not _compatible_type(dv, v): - logger.warning("Type mismatch at %s", here) - defaults[k] = v - return defaults - - -def create_genai_config(model_name: str, output_path: str, config: type[BasePassConfig]) -> None: - """Generate the genai_config.json from the model config files. - - This is only for Generative AI models for which the config.json and generation_config.json files exist - Arguments: - @param model_name: name of model ONNX file that is generated - @param output_path: path to the output directory where the genai_config.json file will be created - @return: None - """ - ip_conf_pth = Path(output_path) / "config.json" - - # do not create genai_config.json if config.json does not exist - if not ip_conf_pth.exists(): - return - - ip_gen_pth = Path(output_path) / "generation_config.json" - - # do not create genai_config.json if generation_config.json does not exist - if not ip_gen_pth.exists(): - return - - # Step 1: Create your data structure - genai_config = { - "model": { - "bos_token_id": -1, - "context_length": -1, - "decoder": { - "session_options": { - "log_id": "onnxruntime-genai", - "graph_optimization_level": "ORT_DISABLE_ALL", - "provider_options": [ - {"OpenVINO": {"device_type": config.target_device.upper(), "enable_causallm": "True"}} - ], - }, - "filename": "openvino_model.onnx", - "head_size": -1, - "hidden_size": -1, - "inputs": {}, - "outputs": {}, - "num_attention_heads": -1, - "num_hidden_layers": -1, - "num_key_value_heads": -1, - }, - "eos_token_id": -1, - "type": "", - "vocab_size": -1, - }, - "search": { - "diversity_penalty": 0.0, - "do_sample": False, - "early_stopping": True, - "length_penalty": 1.0, - "max_length": -1, - "min_length": 0, - "no_repeat_ngram_size": 0, - "num_beams": 1, - "num_return_sequences": 1, - "past_present_share_buffer": False, - "repetition_penalty": 1.0, - "temperature": 1.0, - "top_k": 1, - "top_p": 1.0, - }, - } - - import json - - with open(ip_conf_pth) as f: - src_config = json.load(f) - - 
with open(ip_gen_pth) as f: - src_gen_config = json.load(f) - - try: - import onnx - except ImportError: - raise ImportError( - "Please install onnx to create genai_config.json for ONNX OpenVINO IR Encapsulated model" - ) from None - - model_path = Path(output_path) / model_name - model = onnx.load(model_path) - - # Get input and output tensor names - inputs = [inp.name for inp in model.graph.input] - outputs = [out.name for out in model.graph.output] - - genai_config["model"]["bos_token_id"] = src_config.get("bos_token_id", -1) - genai_config["model"]["context_length"] = src_config.get("max_position_embeddings", -1) - genai_config["model"]["decoder"]["filename"] = model_name - genai_config["model"]["decoder"]["head_size"] = src_config.get("hidden_size", -1) // src_config.get( - "num_attention_heads", -1 - ) - genai_config["model"]["decoder"]["hidden_size"] = src_config.get("hidden_size", -1) - - for name in inputs: - if name != "beam_idx": - genai_config["model"]["decoder"]["inputs"].update({name: name}) - - for name in outputs: - genai_config["model"]["decoder"]["outputs"].update({name: name}) - - genai_config["model"]["decoder"]["num_attention_heads"] = src_config.get("num_attention_heads", -1) - genai_config["model"]["decoder"]["num_hidden_layers"] = src_config.get("num_hidden_layers", -1) - genai_config["model"]["decoder"]["num_key_value_heads"] = src_config.get("num_key_value_heads", -1) - - genai_config["model"]["eos_token_id"] = src_gen_config.get("eos_token_id", -1) - genai_config["model"]["pad_token_id"] = ( - src_gen_config["pad_token_id"] - if hasattr(src_gen_config, "pad_token_id") and src_gen_config["pad_token_id"] is not None - else src_gen_config["eos_token_id"][0] - if isinstance(src_gen_config["eos_token_id"], list) - else src_gen_config["eos_token_id"] - ) - genai_config["model"]["type"] = src_config.get("model_type", "") - genai_config["model"]["vocab_size"] = src_config.get("vocab_size", -1) - - genai_config["search"]["max_length"] = 
src_config.get("max_position_embeddings", -1) - - if isinstance(config.genai_config_override, dict): - apply_genai_overrides(genai_config, config.genai_config_override) - - # Step 2: Write to JSON file - output_genai_config = Path(output_path) / "genai_config.json" - with open(output_genai_config, "w") as f: - json.dump(genai_config, f, indent=4) diff --git a/olive/passes/openvino/optimum_intel.py b/olive/passes/openvino/optimum_intel.py index f872c14539..2105f512f2 100644 --- a/olive/passes/openvino/optimum_intel.py +++ b/olive/passes/openvino/optimum_intel.py @@ -5,7 +5,7 @@ import logging from copy import deepcopy from pathlib import Path -from typing import Optional, Union +from typing import Any, Optional, Union from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE @@ -13,55 +13,12 @@ from olive.hardware.accelerator import AcceleratorSpec, Device from olive.model import CompositeModelHandler, HfModelHandler, OpenVINOModelHandler from olive.passes import Pass +from olive.passes.openvino.ov_utils import OVOptimumLibrary, infer_library_name from olive.passes.pass_config import BasePassConfig, PassConfigParam, get_user_script_data_config logger = logging.getLogger(__name__) -def maybe_load_preprocessors( - src_name_or_path: Union[str, Path], subfolder: str = "", trust_remote_code: bool = False -) -> list: - try: - from transformers import AutoFeatureExtractor, AutoImageProcessor, AutoProcessor, AutoTokenizer - except Exception as e: - raise ImportError("Unable to import transformers packages: ", e) from None - - preprocessors = [] - try: - preprocessors.append( - AutoTokenizer.from_pretrained(src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code) - ) - except Exception: - pass - - try: - preprocessors.append( - AutoProcessor.from_pretrained(src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code) - ) - except Exception: - pass - - try: - preprocessors.append( - AutoFeatureExtractor.from_pretrained( - src_name_or_path, 
subfolder=subfolder, trust_remote_code=trust_remote_code - ) - ) - except Exception: - pass - - try: - preprocessors.append( - AutoImageProcessor.from_pretrained( - src_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code - ) - ) - except Exception: - pass - - return preprocessors - - def infer_task( task, model_name_or_path, @@ -70,9 +27,10 @@ def infer_task( cache_dir: str = HUGGINGFACE_HUB_CACHE, token: Optional[Union[bool, str]] = None, library_name: Optional[str] = None, + trust_remote_code: bool = False, ): try: - from optimum.exporters import TasksManager + from optimum.exporters.tasks import TasksManager except Exception as e: raise ImportError("Unable to import optimum packages:", e) from None @@ -81,6 +39,7 @@ def infer_task( except Exception as e: raise ImportError("Unable to import ConnectionError packages:", e) from None + original_task = task task = TasksManager.map_from_synonym(task) if task == "auto": if library_name == "open_clip": @@ -96,52 +55,149 @@ def infer_task( library_name=library_name, ) except KeyError as e: - raise KeyError( - f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) from None + try: + from transformers import AutoConfig + except ImportError as ie: + raise ImportError(f"Unable to import AutoConfig from transformers: {ie}") from None + try: + config = AutoConfig.from_pretrained(model_name_or_path) + with_past_arch_list = ["MistralForCausalLM", "Zamba2ForCausalLM"] + architectures = getattr(config, "architectures", None) or [] + if any(arch in architectures for arch in with_past_arch_list): + task = "text-generation-with-past" + except Exception: + raise KeyError( + f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. 
Detailed error: {e}" + ) from None except RequestsConnectionError as e: raise RequestsConnectionError( f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" ) from None - return task + if library_name == "transformers": + try: + from transformers import AutoConfig + except ImportError as e: + raise ImportError(f"Unable to import AutoConfig from transformers: {e}") from None + config = AutoConfig.from_pretrained( + model_name_or_path, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + trust_remote_code=trust_remote_code, + ) + if hasattr(config, "export_model_type"): + model_type = config.export_model_type + else: + model_type = config.model_type + custom_architecture = model_type not in TasksManager._SUPPORTED_MODEL_TYPE # pylint: disable=W0212 + if not custom_architecture and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type( + model_type, exporter="openvino", library_name=library_name + ): + # Make -with-past the default if --task was not explicitly specified + if original_task == "auto": + task = task + "-with-past" + else: + logger.info( + "The task `%s` was manually specified, and past key values will not be reused in the decoding." 
+ " if needed, please pass `--task %s-with-past` to export using the past key values.", + task, + task, + ) + return task -def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None): - from optimum.exporters.openvino.convert import export_tokenizer +def _main_quantize( + model_name_or_path: str, + task: str, + library_name: str, + quantization_config: Union[dict, "OVQuantizationConfigBase"], # noqa: F821 + output: Path, + cache_dir: str, + trust_remote_code: bool = False, + subfolder: str = "", + revision: str = "main", + token: Optional[Union[bool, str]] = None, + model_kwargs: Optional[dict[str, Any]] = None, +): try: - from transformers import PreTrainedTokenizerBase, ProcessorMixin - except Exception as e: - raise ImportError("Unable to import transformers packages:", e) from None + from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS + from optimum.intel.utils.import_utils import is_diffusers_available + except ImportError as e: + raise ImportError("Please install Intel® optimum[openvino] to use OpenVINO Optimum Conversion") from e + + # Step 0. 
Infer task and library name if needed + original_task = task + task = infer_task( + task, + model_name_or_path, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + library_name=library_name, + trust_remote_code=trust_remote_code, + ) + if library_name is None: + library_name = infer_library_name( + model_name_or_path, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + ) - try: - from optimum.intel.utils.import_utils import is_openvino_tokenizers_available - except Exception as e: - raise ImportError("openvino tokenizers unavailable :", e) from None - - if is_openvino_tokenizers_available(): - if library_name != "diffusers" and preprocessors: - processor_chat_template = None - tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None) - if len(preprocessors) > 1: - for processor in preprocessors: - if isinstance(processor, ProcessorMixin) and hasattr(processor, "chat_template"): - processor_chat_template = processor.chat_template - if tokenizer: - try: - export_tokenizer(tokenizer, output, task=task, processor_chat_template=processor_chat_template) - except Exception as exception: - logger.warning( - "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer models won't be generated. Exception: %s", - exception, - ) - elif model: - for tokenizer_name in ("tokenizer", "tokenizer_2", "tokenizer_3"): - tokenizer = getattr(model, tokenizer_name, None) - if tokenizer: - export_tokenizer(tokenizer, output / tokenizer_name, task=task) + # Step 1. 
Obtain the correct OpenVINO model class + if library_name == "diffusers": + if not is_diffusers_available(): + raise ValueError("Export of diffusers models requires the diffusers library to be installed.") + + try: + from diffusers import DiffusionPipeline + except ImportError as e: + raise ImportError("Unable to import diffusers packages:", e) from None + + diffusers_config = DiffusionPipeline.load_config(model_name_or_path) + class_name = diffusers_config.get("_class_name", None) + ov_class_name = f"OV{class_name}" + try: + model_cls = getattr(__import__("optimum.intel", fromlist=[ov_class_name]), ov_class_name) + except (AttributeError, ImportError) as e: + raise RuntimeError(f"Wasn't able to locate OpenVINO class for {class_name} diffusion model.") from e else: - logger.warning("Tokenizer won't be converted.") + try: + model_cls_name = _HEAD_TO_AUTOMODELS[task.replace("-with-past", "")] + if library_name == "sentence_transformers": + model_cls_name = "OVSentenceTransformer" + model_cls = getattr(__import__("optimum.intel", fromlist=[model_cls_name]), model_cls_name) + except (AttributeError, ImportError, KeyError) as e: + raise RuntimeError(f"Wasn't able to locate OpenVINO class for task {original_task} ({task}).") from e + + # Step 2. Load the exported model + # Filter out keys that are explicitly passed to from_pretrained to avoid + # "got multiple values for keyword argument" TypeError + _explicit_keys = {"trust_remote_code", "cache_dir", "use_cache", "compile"} + filtered_kwargs = {k: v for k, v in (model_kwargs or {}).items() if k not in _explicit_keys} + model = model_cls.from_pretrained( + output, + compile=False, + trust_remote_code=trust_remote_code, + cache_dir=cache_dir, + use_cache=task.endswith("with-past"), + **filtered_kwargs, + ) + + # Step 3. 
Apply quantization and save the quantized model + model._apply_quantization( # pylint: disable=W0212 + quantization_config, + compile_only=False, + compile_model=False, + model_name_or_path=model_name_or_path, + trust_remote_code=trust_remote_code, + save_directory=output, + immediate_save=True, + ) class OVQuantMode(StrEnumBase): @@ -154,14 +210,6 @@ class OVQuantMode(StrEnumBase): INT4_F8E5M2 = "int4_f8e5m2" -class OVOptimumLibrary(StrEnumBase): - TRANSFORMERS = "transformers" - DIFFUSERS = "diffusers" - TIMM = "timm" - SENTENCE_TRANSFORMERS = "sentence_transformers" - OPEN_CLIP = "open_clip" - - class OVOptimumFramework(StrEnumBase): PT = "pt" TF = "tf" @@ -302,9 +350,12 @@ def _run_for_config( ) -> Union[OpenVINOModelHandler, CompositeModelHandler]: try: from optimum.exporters.openvino import main_export as export_optimum_intel - from optimum.exporters.openvino.utils import save_preprocessors - from optimum.intel.openvino.configuration import OVConfig, get_default_int4_config - from optimum.intel.utils.modeling_utils import _infer_library_from_model_name_or_path + from optimum.intel.openvino.configuration import ( + OVConfig, + _GPTOSSQuantizationConfig, + get_default_quantization_config, + ) + from optimum.intel.utils.import_utils import is_nncf_available except ImportError as e: raise ImportError("Please install Intel® optimum[openvino] to use OpenVINO Optimum Conversion") from e @@ -323,27 +374,18 @@ def _run_for_config( } ) - if model.load_kwargs and "trust_remote_code" not in extra_args: - extra_args["trust_remote_code"] = model.load_kwargs.trust_remote_code - if extra_args.get("library") is None: - lib_name = _infer_library_from_model_name_or_path(model.model_name_or_path) - if lib_name == "sentence_transformers": - logger.warning( - "Library is not specified. " - "There are multiple possible variants: `sentence_transformers`, `transformers`. " - "`transformers` will be selected. 
" - "If you want to load your model with the `sentence-transformers` library instead, " - "Please set it as sentence_transformers in extra_args dictionary under 'library' key" - ) - lib_name = "transformers" + lib_name = infer_library_name(model.model_name_or_path) else: lib_name = extra_args["library"] if config.ov_quant_config: if config.ov_quant_config.get("weight_format") is None and config.ov_quant_config.get("quant_mode") is None: ov_config = None - if not no_compression_parameter_provided(config.ov_quant_config): + if ( + not no_compression_parameter_provided(config.ov_quant_config) + or config.ov_quant_config.get("quantization_statistics_path", None) is not None + ): raise ValueError( "Some compression parameters are provided, but the weight format is not specified. " "Please provide it with weight_format key in ov_quant_config dictionary." @@ -356,139 +398,98 @@ def _run_for_config( elif config.ov_quant_config.get("weight_format") in {"fp16", "fp32"}: ov_config = OVConfig(dtype=config.ov_quant_config["weight_format"]) else: + if not is_nncf_available(): + raise ImportError("Please install nncf to use OpenVINO Optimum Conversion with quantization.") + if ( + config.ov_quant_config.get("weight_format") is not None + and config.ov_quant_config.get("quant_mode") is not None + ): + # both are provided, so raise ValueError + raise ValueError("Both weight_format and quant_mode are provided. 
Please provide only one of them.") + + default_quantization_config = get_default_quantization_config( + model.model_name_or_path, + config.ov_quant_config.get("weight_format"), + config.ov_quant_config.get("quant_mode"), + ) + if config.ov_quant_config.get("weight_format") is not None: - # For int4 quantization if no parameter is provided, then use the default config if exists + # weight compression + quant_config = prep_wc_config(config.ov_quant_config, WRAPPER_4_BIT) + if no_compression_parameter_provided(config.ov_quant_config) and config.ov_quant_config.get( + "weight_format" + ) in ["int4", "int8"]: + if default_quantization_config is not None: + quant_config = default_quantization_config + logger.info( + "Applying the default quantization config for model %s: %s", + model.model_name_or_path, + quant_config, + ) + elif config.ov_quant_config.get("weight_format") == "int4": + quant_config = WRAPPER_4_BIT + logger.info( + "Applying a default 4-bit weight compression config for model %s: %s", + model.model_name_or_path, + quant_config, + ) + if config.ov_quant_config.get("quantization_statistics_path", None) is not None: + quant_config["statistics_path"] = config.ov_quant_config.get("quantization_statistics_path") + else: if ( - no_compression_parameter_provided(config.ov_quant_config) - and config.ov_quant_config.get("weight_format") == "int4" + no_quantization_parameter_provided(config.ov_quant_config) + and default_quantization_config is not None ): - quant_config = get_default_int4_config(model.model_name_or_path) - else: - quant_config = prep_wc_config(config.ov_quant_config, WRAPPER_4_BIT) - if quant_config.get("dataset", None) is not None: - quant_config["trust_remote_code"] = config.ov_quant_config.get("trust_remote_code", False) - ov_config = OVConfig(quantization_config=quant_config) - else: - ov_config = None - if config.ov_quant_config.get("dataset", None) is None: - raise ValueError( - "Dataset is required for full quantization. 
" - "Please provide it in ov_quant_config dictionary under 'dataset' key" + quant_config = default_quantization_config + logger.info( + "Applying the default quantization config for model %s: %s", + model.model_name_or_path, + quant_config, ) - if config.ov_quant_config.get("quant_mode") in [ - "nf4_f8e4m3", - "nf4_f8e5m2", - "int4_f8e4m3", - "int4_f8e5m2", - ]: - if lib_name == "diffusers": - raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.") - wc_config = prep_wc_config(config.ov_quant_config, WRAPPER_4_BIT) - wc_dtype, q_dtype = config.ov_quant_config["quant_mode"].split("_") - wc_config["dtype"] = wc_dtype - - q_config = prep_q_config(config.ov_quant_config) - q_config["dtype"] = q_dtype - quant_config = { - "weight_quantization_config": wc_config, - "full_quantization_config": q_config, - "num_samples": self.args.num_samples, - "dataset": self.args.dataset, - "trust_remote_code": self.args.trust_remote_code, - } else: - quant_config = prep_q_config(config.ov_quant_config) - ov_config = OVConfig(quantization_config=quant_config) + if quant_config.get("dataset", None) is None: + raise ValueError( + "Dataset is required for full quantization. 
" + "Please provide it in ov_quant_config dictionary under 'dataset' key" + ) + if config.ov_quant_config.get("quant_mode") in [ + "cb4_f8e4m3", + "int4_f8e4m3", + "int4_f8e5m2", + ]: + if lib_name == "diffusers": + raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.") + wc_config = prep_wc_config(config.ov_quant_config, WRAPPER_4_BIT) + wc_dtype, q_dtype = config.ov_quant_config["quant_mode"].split("_") + wc_config["dtype"] = wc_dtype + + q_config = prep_q_config(config.ov_quant_config) + q_config["dtype"] = q_dtype + + quant_config = { + "weight_quantization_config": wc_config, + "full_quantization_config": q_config, + "num_samples": config.ov_quant_config.get("num_samples"), + "dataset": config.ov_quant_config.get("dataset"), + } + else: + if config.ov_quant_config.get("quantization_statistics_path", None) is not None: + logger.warning( + "quantization_statistics_path is only applicable for weight-only" + " quantization. It will be ignored." + ) + quant_config = prep_q_config(config.ov_quant_config) + + ov_config = OVConfig(quantization_config=quant_config) else: ov_config = None # quantization config quant_config = ov_config.quantization_config if ov_config else None - quantize_with_dataset = quant_config and getattr(quant_config, "dataset", None) is not None - task = infer_task(extra_args.get("task", "auto"), model.model_name_or_path, library_name=lib_name) - - # model - if lib_name == "diffusers" and quantize_with_dataset: - try: - from diffusers import DiffusionPipeline - except ImportError: - raise ImportError("Please install diffusers to use OpenVINO with Diffusers models.") from None - - diffusers_config = DiffusionPipeline.load_config(model.model_name_or_path) - class_name = diffusers_config.get("_class_name", None) - if class_name == "LatentConsistencyModelPipeline": - from optimum.intel import OVLatentConsistencyModelPipeline + apply_main_quantize = quant_config and not isinstance(quant_config, 
_GPTOSSQuantizationConfig) - model_cls = OVLatentConsistencyModelPipeline - - elif class_name == "StableDiffusionXLPipeline": - from optimum.intel import OVStableDiffusionXLPipeline - - model_cls = OVStableDiffusionXLPipeline - elif class_name == "StableDiffusionPipeline": - from optimum.intel import OVStableDiffusionPipeline - - model_cls = OVStableDiffusionPipeline - elif class_name == "StableDiffusion3Pipeline": - from optimum.intel import OVStableDiffusion3Pipeline - - model_cls = OVStableDiffusion3Pipeline - elif class_name == "FluxPipeline": - from optimum.intel import OVFluxPipeline - - model_cls = OVFluxPipeline - elif class_name == "SanaPipeline": - from optimum.intel import OVSanaPipeline - - model_cls = OVSanaPipeline - else: - raise NotImplementedError(f"Quantization isn't supported for class {class_name}.") - - output_model = model_cls.from_pretrained( - model.model_name_or_path, export=True, quantization_config=quant_config - ) - output_model.save_pretrained(output_model_path) - if not extra_args.get("disable_convert_tokenizer", False): - maybe_convert_tokenizers(lib_name, output_model_path, model, task=task) - elif ( - quantize_with_dataset and (task.startswith("text-generation") or "automatic-speech-recognition" in task) - ) or (task == "image-text-to-text" and quant_config is not None): - if task.startswith("text-generation"): - from optimum.intel import OVModelForCausalLM - - model_cls = OVModelForCausalLM - elif task == "image-text-to-text": - from optimum.intel import OVModelForVisualCausalLM - - model_cls = OVModelForVisualCausalLM - else: - from optimum.intel import OVModelForSpeechSeq2Seq - - model_cls = OVModelForSpeechSeq2Seq - - # In this case, to apply quantization an instance of a model class is required - output_model = model_cls.from_pretrained( - model.model_name_or_path, - export=True, - quantization_config=quant_config, - stateful=not extra_args.get("disable_stateful", False), - trust_remote_code=extra_args.get("trust_remote_code", 
False), - variant=extra_args.get("variant", None), - cache_dir=extra_args.get("cache_dir", HUGGINGFACE_HUB_CACHE), - ) - output_model.save_pretrained(output_model_path) - - preprocessors = maybe_load_preprocessors( - model.model_name_or_path, trust_remote_code=extra_args.get("trust_remote_code", False) - ) - save_preprocessors( - preprocessors, output_model.config, output_model_path, extra_args.get("trust_remote_code", False) - ) - if not extra_args.get("disable_convert_tokenizer", False): - maybe_convert_tokenizers(lib_name, output_model_path, preprocessors=preprocessors, task=task) - - else: + try: extra_args["ov_config"] = ov_config extra_args["stateful"] = not extra_args.get("disable_stateful", False) extra_args.pop("disable_stateful", False) @@ -501,6 +502,21 @@ def _run_for_config( output_model_path, **extra_args, ) + if apply_main_quantize: + _main_quantize( + model_name_or_path=model.model_name_or_path, + task=extra_args.get("task", "auto"), + library_name=lib_name, + quantization_config=quant_config, + output=Path(output_model_path), + cache_dir=config.ov_quant_config.get("cache_dir", None) if config.ov_quant_config else None, + trust_remote_code=config.ov_quant_config.get("trust_remote_code", False) + if config.ov_quant_config + else False, + model_kwargs=model.load_kwargs.__dict__ if model.load_kwargs else None, + ) + except Exception as e: + raise RuntimeError(f"OpenVINO optimum export failed: {e}") from None # check the exported components exported_models = [name.stem for name in Path(output_model_path).iterdir() if name.suffix == ".xml"] @@ -578,6 +594,8 @@ def prep_wc_config(quant_cfg, default_cfg): "lora_correction": quant_cfg.get("lora_correction", None), "dtype": quant_cfg.get("weight_format"), "backup_precision": quant_cfg.get("backup_precision"), + "statistics_path": quant_cfg.get("statistics_path", None), + "group_size_fallback": quant_cfg.get("group_size_fallback", None), } @@ -590,7 +608,6 @@ def prep_q_config(quant_cfg): "dataset": 
# -------------------------------------------------------------------------
# Copyright (c) Intel Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import json
import logging
import numbers
from collections.abc import Mapping, MutableMapping
from pathlib import Path
from typing import Any, Optional, Union

from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE

from olive.common.utils import StrEnumBase
from olive.passes.pass_config import BasePassConfig

logger = logging.getLogger(__name__)


class IgnoreScopeTypeEnum(StrEnumBase):
    # Kinds of ignored-scope selectors accepted by NNCF compression/quantization.
    NAMES = "names"
    TYPES = "types"
    PATTERNS = "patterns"


class OVOptimumLibrary(StrEnumBase):
    # Model libraries supported by the OpenVINO Optimum conversion pass.
    TRANSFORMERS = "transformers"
    DIFFUSERS = "diffusers"
    TIMM = "timm"
    SENTENCE_TRANSFORMERS = "sentence_transformers"
    OPEN_CLIP = "open_clip"


def _validate_enum_value(value, enum_class: type, param_name: str) -> tuple[bool, str]:
    """Validate that a value can be converted to an enum (case-insensitive).

    Delegates to :func:`_convert_to_enum` so that validation and conversion
    can never disagree on which inputs are accepted or on the exact error
    message text.

    Args:
        value: The value to validate (None, string, or already enum).
        enum_class: The enum class to validate against.
        param_name: Name of the parameter for error messages.

    Returns:
        Tuple of (is_valid, error_message). error_message is empty if valid.

    """
    try:
        _convert_to_enum(value, enum_class, param_name)
    except ValueError as e:
        return False, str(e)
    return True, ""


def _convert_to_enum(value, enum_class: type, param_name: str):
    """Convert a value to an enum if needed (case-insensitive).

    Accepts:
    - None (returns None)
    - Enum instances of the correct type (returns as-is)
    - Strings matching enum.value (case-insensitive)
    - Strings matching enum.name (case-insensitive)

    Args:
        value: The value to convert (None, string, or already enum).
        enum_class: The enum class to convert to.
        param_name: Name of the parameter for error messages.

    Returns:
        The enum value, or None if input was None.

    Raises:
        ValueError: If conversion fails.

    """
    if value is None or isinstance(value, enum_class):
        return value

    if not isinstance(value, str):
        raise ValueError(f"{param_name} '{value}' is not a valid string or {enum_class.__name__} enum.")

    lookup_key = value.lower()

    # Try matching by enum.value first (case-insensitive); value matches take
    # precedence over name matches.
    value_map = {m.value.lower(): m for m in enum_class}
    if lookup_key in value_map:
        return value_map[lookup_key]

    # Then try matching by enum.name (case-insensitive).
    name_map = {m.name.lower(): m for m in enum_class}
    if lookup_key in name_map:
        return name_map[lookup_key]

    # Conversion failed: list both names and values in the error message.
    valid_values = sorted({m.value for m in enum_class} | {m.name for m in enum_class})
    raise ValueError(f"{param_name} '{value}' is not supported. Supported values are: {', '.join(valid_values)}.")


def infer_library_name(
    model_name_or_path: str,
    subfolder: str = "",
    revision: Optional[str] = None,
    cache_dir: str = HUGGINGFACE_HUB_CACHE,
    token: Optional[Union[bool, str]] = None,
) -> str:
    """Infer the Optimum-Intel library name for a given model.

    Falls back to ``"transformers"`` when ``sentence_transformers`` is detected.

    Args:
        model_name_or_path: The model identifier or path.
        subfolder: The subfolder within the model repository. Defaults to "".
        revision: The specific model version to use. Defaults to None (latest version).
        cache_dir: The directory to use for caching. Defaults to HUGGINGFACE_HUB_CACHE.
        token: The huggingface token to use. Defaults to None.

    Returns:
        The inferred library name.

    Raises:
        ImportError: If the optimum.intel library cannot be imported.

    """
    try:
        from optimum.intel.utils.modeling_utils import _infer_library_from_model_name_or_path
    except ImportError as e:
        raise ImportError("Please install Intel® optimum[openvino] to use OpenVINO Optimum Conversion") from e
    library_name = _infer_library_from_model_name_or_path(
        model_name_or_path=model_name_or_path,
        subfolder=subfolder,
        revision=revision,
        cache_dir=cache_dir,
        token=token,
    )
    if library_name == "sentence_transformers":
        logger.warning(
            "Library name is not specified. There are multiple possible variants: `sentence_transformers`,"
            " `transformers`. `transformers` will be selected. If you want to load your model with the"
            " `sentence-transformers` library instead, please set --library sentence_transformers"
        )
        library_name = "transformers"
    return library_name


def _compatible_type(default_val: Any, new_val: Any) -> bool:
    """Loose type check: allow ints for floats, bool as bool, etc."""
    if default_val is None:
        return True
    # bool must be checked before Real: bool is a subclass of int.
    if isinstance(default_val, bool):
        return isinstance(new_val, bool)
    if isinstance(default_val, numbers.Real) and not isinstance(default_val, bool):
        return isinstance(new_val, numbers.Real) and not isinstance(new_val, bool)
    if isinstance(default_val, str):
        return isinstance(new_val, str)
    if isinstance(default_val, (list, tuple)):
        return isinstance(new_val, (list, tuple))
    if isinstance(default_val, Mapping):
        return isinstance(new_val, Mapping)
    return True  # fall back to permissive


def apply_genai_overrides(
    defaults: MutableMapping[str, Any], overrides: Mapping[str, Any], *, path: str = ""
) -> MutableMapping[str, Any]:
    """Recursively merge *overrides* into *defaults*.

    Only keys that already exist in *defaults* are updated. Type mismatches
    are logged as warnings but still applied.

    Args:
        defaults: The original config to be updated (modified in-place).
        overrides: The config values to override.
        path: The current path within the config (used for recursive calls).

    Returns:
        The updated config with overrides applied.

    """
    for k, v in overrides.items():
        here = f"{path}.{k}" if path else k
        # Unknown keys are silently ignored so callers cannot inject
        # settings the default schema does not define.
        if k not in defaults:
            continue

        dv = defaults[k]

        # Recurse for nested dicts so partial overrides merge instead of replace.
        if isinstance(dv, Mapping) and isinstance(v, Mapping):
            apply_genai_overrides(dv, v, path=here)
            continue

        # Replace lists/tuples and scalars; warn (but still apply) on type mismatch.
        if not _compatible_type(dv, v):
            logger.warning("Type mismatch at %s", here)
        defaults[k] = v
    return defaults


def create_genai_config(model_name: str, output_path: str, config: BasePassConfig) -> None:
    """Generate ``genai_config.json`` from model config files.

    This is only for Generative AI models for which ``config.json`` and
    ``generation_config.json`` exist in *output_path*; otherwise this is a no-op.

    Args:
        model_name: Name of the ONNX model file that was generated.
        output_path: Directory containing the model and config files.
        config: Pass configuration instance (must expose ``target_device``; may
            optionally expose ``genai_config_override``).

    Returns:
        None

    Raises:
        ImportError: If onnx is not installed.

    """
    ip_conf_pth = Path(output_path) / "config.json"

    # do not create genai_config.json if config.json does not exist
    if not ip_conf_pth.exists():
        return

    ip_gen_pth = Path(output_path) / "generation_config.json"

    # do not create genai_config.json if generation_config.json does not exist
    if not ip_gen_pth.exists():
        return

    # Step 1: build the default genai_config structure; -1/"" placeholders are
    # filled in from config.json / generation_config.json below.
    genai_config: dict[str, Any] = {
        "model": {
            "bos_token_id": -1,
            "context_length": -1,
            "decoder": {
                "session_options": {
                    "log_id": "onnxruntime-genai",
                    "graph_optimization_level": "ORT_DISABLE_ALL",
                    "provider_options": [
                        {"OpenVINO": {"device_type": config.target_device.upper(), "enable_causallm": "True"}}
                    ],
                },
                "filename": "openvino_model.onnx",
                "head_size": -1,
                "hidden_size": -1,
                "inputs": {},
                "outputs": {},
                "num_attention_heads": -1,
                "num_hidden_layers": -1,
                "num_key_value_heads": -1,
            },
            "eos_token_id": -1,
            "type": "",
            "vocab_size": -1,
        },
        "search": {
            "diversity_penalty": 0.0,
            "do_sample": False,
            "early_stopping": True,
            "length_penalty": 1.0,
            "max_length": -1,
            "min_length": 0,
            "no_repeat_ngram_size": 0,
            "num_beams": 1,
            "num_return_sequences": 1,
            "past_present_share_buffer": False,
            "repetition_penalty": 1.0,
            "temperature": 1.0,
            "top_k": 1,
            "top_p": 1.0,
        },
    }

    with open(ip_conf_pth) as f:
        src_config = json.load(f)

    with open(ip_gen_pth) as f:
        src_gen_config = json.load(f)

    try:
        import onnx
    except ImportError:
        raise ImportError(
            "Please install onnx to create genai_config.json for ONNX OpenVINO IR Encapsulated model"
        ) from None

    model_path = Path(output_path) / model_name
    model = onnx.load(model_path)

    # Get input and output tensor names from the ONNX graph.
    inputs = [inp.name for inp in model.graph.input]
    outputs = [out.name for out in model.graph.output]

    genai_config["model"]["bos_token_id"] = src_config.get("bos_token_id", -1)
    genai_config["model"]["context_length"] = src_config.get("max_position_embeddings", -1)
    genai_config["model"]["decoder"]["filename"] = model_name

    # Safe head_size computation: only divide when both values are valid ints.
    num_attention_heads = src_config.get("num_attention_heads", -1)
    hidden_size = src_config.get("hidden_size", -1)
    if (
        isinstance(num_attention_heads, int)
        and isinstance(hidden_size, int)
        and num_attention_heads > 0
        and hidden_size >= 0
    ):
        genai_config["model"]["decoder"]["head_size"] = hidden_size // num_attention_heads
    else:
        if not isinstance(num_attention_heads, int):
            logger.warning("num_attention_heads is not an int: %s found in src_config", num_attention_heads)
        elif num_attention_heads <= 0:
            logger.warning("Invalid num_attention_heads (<= 0) %s found in src_config", num_attention_heads)
        if not isinstance(hidden_size, int):
            logger.warning("hidden_size is not an int: %s found in src_config", hidden_size)
        elif hidden_size < 0:
            logger.warning("Invalid hidden_size (< 0) %s found in src_config", hidden_size)
        logger.warning("Setting genai_config['model']['decoder']['head_size'] to -1")
        genai_config["model"]["decoder"]["head_size"] = -1

    genai_config["model"]["decoder"]["hidden_size"] = src_config.get("hidden_size", -1)

    # beam_idx is managed internally by the OpenVINO stateful model and must
    # not be exposed as a genai input.
    for name in inputs:
        if name != "beam_idx":
            genai_config["model"]["decoder"]["inputs"].update({name: name})

    for name in outputs:
        genai_config["model"]["decoder"]["outputs"].update({name: name})

    genai_config["model"]["decoder"]["num_attention_heads"] = src_config.get("num_attention_heads", -1)
    genai_config["model"]["decoder"]["num_hidden_layers"] = src_config.get("num_hidden_layers", -1)
    genai_config["model"]["decoder"]["num_key_value_heads"] = src_config.get("num_key_value_heads", -1)

    eos_token_id = src_gen_config.get("eos_token_id", -1)
    genai_config["model"]["eos_token_id"] = eos_token_id
    pad_token_id = src_gen_config.get("pad_token_id", None)
    if pad_token_id is not None:
        genai_config["model"]["pad_token_id"] = pad_token_id
    elif eos_token_id != -1:
        # Fall back to eos (first entry when eos is a list) as the pad token.
        genai_config["model"]["pad_token_id"] = (
            eos_token_id[0] if isinstance(eos_token_id, list) and len(eos_token_id) > 0 else eos_token_id
        )
    else:
        genai_config["model"]["pad_token_id"] = -1

    genai_config["model"]["type"] = src_config.get("model_type", "")
    genai_config["model"]["vocab_size"] = src_config.get("vocab_size", -1)

    genai_config["search"]["max_length"] = src_config.get("max_position_embeddings", -1)

    # Apply genai_config_override if the pass config exposes it
    genai_config_override = getattr(config, "genai_config_override", None)
    if isinstance(genai_config_override, dict):
        apply_genai_overrides(genai_config, genai_config_override)

    # Step 2: Write to JSON file
    output_genai_config = Path(output_path) / "genai_config.json"
    with open(output_genai_config, "w") as f:
        json.dump(genai_config, f, indent=4)
+13,8 @@ from olive.passes.olive_pass import create_pass_from_dict from olive.passes.onnx.optimum_conversion import OptimumConversion from olive.passes.openvino.compression import OpenVINOWeightCompression -from test.utils import get_hf_model +from olive.passes.openvino.optimum_intel import OpenVINOOptimumConversion +from test.utils import get_hf_model, package_version_at_least pytestmark = pytest.mark.openvino @@ -25,6 +26,10 @@ def wikitext_2_raw_v1_test(): return datasets.load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1", split="test") +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_weight_compression_hf_to_openvino(tmp_path): # imports here import numpy as np @@ -97,6 +102,10 @@ def custom_transform_func(data, tokenizer): shutil.rmtree(hf_to_ov_model.model_path) +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_weight_compression_hf_to_openvino_multi_ignore_scope(tmp_path): # imports here import numpy as np @@ -165,6 +174,10 @@ def custom_transform_func(data, tokenizer): shutil.rmtree(hf_to_ov_model.model_path) +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_weight_compression_hf_to_onnx(tmp_path): from nncf.parameters import CompressWeightsMode from nncf.quantization.advanced_parameters import GroupSizeFallbackMode @@ -209,6 +222,10 @@ def test_openvino_weight_compression_hf_to_onnx(tmp_path): shutil.rmtree(q_dir) +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_weight_compression_hf_to_onnx_multi_ignore_scope(tmp_path): from nncf.parameters import CompressWeightsMode @@ -220,7 +237,7 @@ def test_openvino_weight_compression_hf_to_onnx_multi_ignore_scope(tmp_path): "extra_args": {"use_onnx": True, 
def _run_ov_to_ov_weight_compression(tmp_path, weight_compression_config):
    """Convert tiny-random Llama to OpenVINO IR, run OpenVINOWeightCompression on it,
    assert the compressed IR files exist, and clean up all produced artifacts.
    """
    # setup - first convert HF model to OpenVINO
    input_hf_model = get_hf_model("hf-internal-testing/tiny-random-LlamaForCausalLM")
    p_convert = create_pass_from_dict(
        OpenVINOOptimumConversion,
        {"extra_args": {"disable_convert_tokenizer": True}},
        disable_search=True,
    )

    # create output folder for conversion
    output_folder_convert = str(Path(tmp_path) / "openvino_convert")
    input_ov_model = p_convert.run(input_hf_model, output_folder_convert)

    # setup weight compression pass
    p = create_pass_from_dict(
        OpenVINOWeightCompression,
        weight_compression_config,
        disable_search=True,
        accelerator_spec=AcceleratorSpec("cpu", "OpenVINOExecutionProvider"),
    )

    # create output folder and execute
    output_folder = str(Path(tmp_path) / "openvino_wc_ov_to_ov")
    ov_to_ov_model = p.run(input_ov_model, output_folder)

    # the compressed IR must contain both the XML topology and the bin weights
    xml_file = Path(ov_to_ov_model.model_path) / "openvino_model.xml"
    bin_file = Path(ov_to_ov_model.model_path) / "openvino_model.bin"
    assert xml_file.exists()
    assert xml_file.is_file()
    assert bin_file.exists()
    assert bin_file.is_file()

    # cleanup
    shutil.rmtree(output_folder_convert)
    if Path(ov_to_ov_model.model_path).exists():
        shutil.rmtree(ov_to_ov_model.model_path)


@pytest.mark.skipif(
    not package_version_at_least("optimum", "2.1.0"),
    reason="Requires optimum >= 2.1.0",
)
def test_openvino_weight_compression_openvino_to_openvino(tmp_path):
    """Test weight compression on an OpenVINO model."""
    from nncf.parameters import CompressWeightsMode
    from nncf.quantization.advanced_parameters import GroupSizeFallbackMode

    openvino_weight_compression_config = {
        "compress_config": {"mode": CompressWeightsMode.INT4_SYM, "ratio": 1.0},
        "extra_args": {
            "advanced_compression_parameters": {
                "group_size_fallback_mode": GroupSizeFallbackMode.IGNORE,
            },
        },
    }
    _run_ov_to_ov_weight_compression(tmp_path, openvino_weight_compression_config)


@pytest.mark.skipif(
    not package_version_at_least("optimum", "2.1.0"),
    reason="Requires optimum >= 2.1.0",
)
def test_openvino_weight_compression_openvino_to_openvino_multi_ignore_scope(tmp_path):
    """Test weight compression on an OpenVINO model with multiple ignored scopes."""
    from nncf.parameters import CompressWeightsMode

    openvino_weight_compression_config = {
        "compress_config": {"mode": CompressWeightsMode.INT4_SYM, "ratio": 1.0},
        "ignored_scope": [["Gather", "Add", "MatMul"], [".*Mul.*"]],
        "ignored_scope_type": ["types", "patterns"],
    }
    _run_ov_to_ov_weight_compression(tmp_path, openvino_weight_compression_config)
olive.passes.olive_pass import create_pass_from_dict from olive.passes.openvino.optimum_intel import OpenVINOOptimumConversion -from test.utils import get_hf_model +from test.utils import get_hf_model, package_version_at_least pytestmark = pytest.mark.openvino +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_optimum_conversion_pass_convert_with_tokenizers(tmp_path): # setup input_hf_model = get_hf_model("hf-internal-testing/tiny-random-PhiForCausalLM") @@ -37,6 +41,10 @@ def test_openvino_optimum_conversion_pass_convert_with_tokenizers(tmp_path): assert bin_file.is_file() +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_optimum_conversion_pass_convert_without_tokenizers(tmp_path): # setup input_hf_model = get_hf_model("hf-internal-testing/tiny-random-PhiForCausalLM") @@ -61,6 +69,10 @@ def test_openvino_optimum_conversion_pass_convert_without_tokenizers(tmp_path): assert bin_file.is_file() +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_optimum_conversion_pass_convert_with_weight_compression(tmp_path): # setup input_hf_model = get_hf_model("hf-internal-testing/tiny-random-PhiForCausalLM") @@ -94,12 +106,16 @@ def test_openvino_optimum_conversion_pass_convert_with_weight_compression(tmp_pa assert bin_file.is_file() +@pytest.mark.skipif( + not package_version_at_least("optimum", "2.1.0"), + reason="Requires optimum >= 2.1.0", +) def test_openvino_optimum_conversion_pass_convert_with_quantization(tmp_path): # setup input_hf_model = get_hf_model("hf-internal-testing/tiny-random-clip-zero-shot-image-classification") openvino_optimum_conversion_config = { "extra_args": {"device": "npu"}, - "ov_quant_config": {"weight_format": "int8", "dataset": "auto"}, + "ov_quant_config": {"weight_format": "int8"}, } p = 
def package_version_at_least(package_name: str, min_ver: str) -> bool:
    """Return True when *package_name* is installed at version >= *min_ver*.

    Designed for ``pytest.mark.skipif`` conditions: a missing ``packaging``
    dependency, an uninstalled package, or an unparsable version string all
    yield ``False`` instead of raising during test collection.
    """
    try:
        from importlib.metadata import PackageNotFoundError
        from importlib.metadata import version as pkg_version

        from packaging.version import InvalidVersion
        from packaging.version import parse as parse_version
    except ImportError:
        # packaging (or metadata support) unavailable: report "not satisfied".
        return False

    try:
        installed = parse_version(pkg_version(package_name))
        required = parse_version(min_ver)
    except (PackageNotFoundError, InvalidVersion):
        return False
    return installed >= required