From d494c8277385edd8d39b8ff1d1e9fcfd87c23778 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Wed, 13 Aug 2025 14:29:59 -0700 Subject: [PATCH 01/24] Initial commit --- examples/gemma3/README.md | 2 + examples/gemma3/qnn_config.json | 95 ++++++++++++++++++++++++++++++++ examples/gemma3/requirements.txt | 5 ++ 3 files changed, 102 insertions(+) create mode 100644 examples/gemma3/README.md create mode 100644 examples/gemma3/qnn_config.json create mode 100644 examples/gemma3/requirements.txt diff --git a/examples/gemma3/README.md b/examples/gemma3/README.md new file mode 100644 index 000000000..fa20478d2 --- /dev/null +++ b/examples/gemma3/README.md @@ -0,0 +1,2 @@ +# Gemma-3-4B Model Optimization + diff --git a/examples/gemma3/qnn_config.json b/examples/gemma3/qnn_config.json new file mode 100644 index 000000000..d84d5cc13 --- /dev/null +++ b/examples/gemma3/qnn_config.json @@ -0,0 +1,95 @@ +{ + "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" }, + "systems": { + "qnn_system": { + "type": "PythonEnvironment", + "python_environment_path": "/path/to/qnn/env/bin", + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "wikitext2_train_joined", + "type": "HuggingfaceContainer", + "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" }, + "pre_process_data_config": { + "strategy": "join", + "add_special_tokens": false, + "max_seq_len": 4096, + "max_samples": 128 + } + }, + { + "name": "wikitext2_train_act", + "type": "HuggingfaceContainer", + "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" }, + "pre_process_data_config": { + "strategy": "line-by-line", + "add_special_tokens": true, + "max_samples": 256, + "max_seq_len": 4096 + } + } + ], + "passes": { + "q": { "type": "QuaRot" }, + "g": { + "type": "GptqQuantizer", + "sym": true, + "group_size": -1, + "desc_act": true, + "data_config": "wikitext2_train_joined" + }, + "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, + "mb": { + "type": "ModelBuilder", + "precision": "int4", + "int4_block_size": 32, + "int4_accuracy_level": 4, + "int4_op_types_to_quantize": [ "MatMul", "Gather" ] + }, + "mq": { + "type": "MatMulNBitsToQDQ", + "use_int4": true, + "add_zero_point": true, + "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ], + "save_as_external_data": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { "surgeon": "RemoveRopeMultiCache" }, + { "surgeon": "AttentionMaskToSequenceLengths" }, + { "surgeon": "SimplifiedLayerNormToL2Norm" } + ], + "save_as_external_data": true + }, + "sq": { + "type": "OnnxStaticQuantization", + "data_config": "wikitext2_train_act", + "activation_type": "uint16", + "precision": "uint8", + "calibration_providers": [ "CUDAExecutionProvider" ], + "quant_preprocess": true, + "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ], + "save_as_external_data": true + }, + "sp": { "type": "SplitModel" }, + "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 }, + "cb": { + "type": "EPContextBinaryGenerator", + "provider_options": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + }, + "weight_sharing": true + }, + "cp": { "type": "ComposeOnnxModels" } + }, + "target": "qnn_system", + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it", + "cache_dir": "cache", + "no_artifacts": true +} 
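The config above chains QuaRot, GptqQuantizer, CaptureSplitInfo, ModelBuilder, MatMulNBitsToQDQ, GraphSurgeries, OnnxStaticQuantization, SplitModel, StaticLLM, EPContextBinaryGenerator, and ComposeOnnxModels into a single QNN workflow. Besides the `olive run` CLI, the same workflow can be driven from Python; a minimal sketch, assuming the JSON above is saved as `qnn_config.json` and the placeholder `python_environment_path` has been pointed at a real QNN environment:

```python
# Minimal sketch: run the QNN workflow defined in qnn_config.json from Python.
# Equivalent to `olive run --config qnn_config.json`; artifacts land in
# models/gemma-3-4b-it per the "output_dir" setting in the config.
from olive.workflows import run as olive_run

if __name__ == "__main__":
    olive_run("qnn_config.json")
```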
diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt new file mode 100644 index 000000000..c51bff135 --- /dev/null +++ b/examples/gemma3/requirements.txt @@ -0,0 +1,5 @@ +datasets +transformers +optimum +onnxruntime-gpu==1.21.1 +onnxruntime-genai-cuda==0.7.1 From ddf3ea861bfb62c5c473a146c95e4c604c6f3f17 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Wed, 13 Aug 2025 16:26:33 -0700 Subject: [PATCH 02/24] Add README and start config --- examples/gemma3/README.md | 2 -- examples/gemma3/qnn/README.md | 23 +++++++++++++++++++ examples/gemma3/qnn/env_setup.sh | 20 ++++++++++++++++ .../gemma3-4b-qnn-config.json} | 0 4 files changed, 43 insertions(+), 2 deletions(-) delete mode 100644 examples/gemma3/README.md create mode 100644 examples/gemma3/qnn/README.md create mode 100644 examples/gemma3/qnn/env_setup.sh rename examples/gemma3/{qnn_config.json => qnn/gemma3-4b-qnn-config.json} (100%) diff --git a/examples/gemma3/README.md b/examples/gemma3/README.md deleted file mode 100644 index fa20478d2..000000000 --- a/examples/gemma3/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# Gemma-3-4B Model Optimization - diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md new file mode 100644 index 000000000..edfc0ac0a --- /dev/null +++ b/examples/gemma3/qnn/README.md @@ -0,0 +1,23 @@ +# Gemma-3-4B Model Optimization + +This repository demonstrates the optimization of the [Google Gemma-3-4B](https://huggingface.co/google/gemma-3-4b-it) model using **post-training quantization (PTQ)** techniques. The optimization process utilizes an environment based heavily upon the [PTQ tutorial for Phi-3.5](https://github.com/CodeLinaro/Olive/blob/main/examples/phi3_5/README.md) + +## Automated Setup (Linux Only) + +Requirements: +* Python 3.10 +* uv + +This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the tutorial above: + +```bash +source env_setup.sh +``` + +## Optimization Process + +Run the following command in your Olive environment after completing the above setup steps: + +```bash +olive run --config gemma3-4b-qnn-config.json +``` diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh new file mode 100644 index 000000000..a51e84462 --- /dev/null +++ b/examples/gemma3/qnn/env_setup.sh @@ -0,0 +1,20 @@ + +# Installing setuptools to build Olive from source +uv pip install setuptools + +# Requires installation of uv +uv pip install -r ../requirements.txt + +# Disable CUDA extension build +export BUILD_CUDA_EXT=0 + +# Install AutoGPTQ from source +uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git + +# Install GptqModel from source +uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@5d2911a4b2a709afb0941d53c3882d0cd80b9649 + +# Install onnxruntime-qnn without installing onnxruntime +# Note: Installing both at the same time may cause conflicts +uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt +uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps diff --git a/examples/gemma3/qnn_config.json b/examples/gemma3/qnn/gemma3-4b-qnn-config.json similarity index 100% rename from examples/gemma3/qnn_config.json rename to examples/gemma3/qnn/gemma3-4b-qnn-config.json From 1f540743381cbae522d207ae2502022bacf255d2 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Wed, 13 Aug 
2025 18:17:39 -0700 Subject: [PATCH 03/24] QuaRot passing, working on GptqQuantizer --- examples/gemma3/qnn/env_setup.sh | 3 +++ examples/gemma3/qnn/gemma3-4b-qnn-config.json | 4 +-- olive/common/hf/utils.py | 4 +++ olive/common/hf/wrapper.py | 26 ++++++++++++++----- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh index a51e84462..03a3a9993 100644 --- a/examples/gemma3/qnn/env_setup.sh +++ b/examples/gemma3/qnn/env_setup.sh @@ -5,6 +5,9 @@ uv pip install setuptools # Requires installation of uv uv pip install -r ../requirements.txt +# Require installation of Olive dependencies +uv pip install -r ../../../requirements.txt + # Disable CUDA extension build export BUILD_CUDA_EXT=0 diff --git a/examples/gemma3/qnn/gemma3-4b-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-qnn-config.json index d84d5cc13..d1efe69d2 100644 --- a/examples/gemma3/qnn/gemma3-4b-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-qnn-config.json @@ -1,9 +1,9 @@ { - "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" }, + "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" , "model_attributes": {"head_dim": 256}}, "systems": { "qnn_system": { "type": "PythonEnvironment", - "python_environment_path": "/path/to/qnn/env/bin", + "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] } }, diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py index a070e85ac..9a98ff0e3 100644 --- a/olive/common/hf/utils.py +++ b/olive/common/hf/utils.py @@ -119,6 +119,10 @@ def save_model_config(config: Union["PretrainedConfig", "GenerationConfig"], out config.save_pretrained(output_dir, **kwargs) +def get_model_attributes_config(config: "PretrainedConfig", model_type: str): + return config.text_config if model_type == "gemma3" else config + + def save_module_files( config: "PretrainedConfig", model_name_or_path: str, output_dir: str, **kwargs ) -> tuple["PretrainedConfig", list[str]]: diff --git a/olive/common/hf/wrapper.py b/olive/common/hf/wrapper.py index 3946f0f3a..8bb832c21 100644 --- a/olive/common/hf/wrapper.py +++ b/olive/common/hf/wrapper.py @@ -10,6 +10,7 @@ from transformers import PretrainedConfig from olive.common.utils import find_first_matched_value, get_attr, replace_submodules, set_attr +from olive.common.hf.utils import get_model_attributes_config if TYPE_CHECKING: from transformers import PreTrainedModel @@ -195,6 +196,7 @@ class ModelWrapper: "default": ["model.embed_tokens"], "bloom": ["transformer.word_embeddings", "transformer.word_embeddings_layernorm"], "falcon": ["transformer.word_embeddings"], + "gemma3": ["model.language_model.embed_tokens"], "gpt2": ["transformer.wte", "transformer.wpe"], "gpt_neox": ["gpt_neox.embed_in"], "gptj": ["transformer.wte"], @@ -209,11 +211,17 @@ class ModelWrapper: "qwen": "transformer.rotary_emb", } LM_HEAD = {"default": "lm_head"} - PRE_HEAD_LAYERNORM = {"default": "model.norm", "gpt2": "transformer.ln_f", "qwen": "transformer.ln_f"} + PRE_HEAD_LAYERNORM = { + "default": "model.norm", + "gemma3": "model.language_model.norm", + "gpt2": "transformer.ln_f", + "qwen": "transformer.ln_f" + } LAYERS = { "default": "model.layers", "bloom": "transformer.h", "falcon": "transformer.h", + "gemma3": "model.language_model.layers", "gpt2": "transformer.h", "gpt_neox": "gpt_neox.layers", "gptj": "transformer.h", @@ -225,17 +233,20 @@ def __init__(self, config: 
Union[PretrainedConfig, dict]): self.config = config if isinstance(config, PretrainedConfig) else PretrainedConfig.from_dict(config) self.model_type = find_first_matched_value(self.config, "model_type") + logger.error(self.config) + # model attributes - self.hidden_size = find_first_matched_value(self.config, self.HIDDEN_SIZE_NAMES) - self.num_attention_heads = find_first_matched_value(self.config, self.NUM_ATTENTION_HEADS_NAMES) + model_attributes_config = get_model_attributes_config(self.config, self.model_type) + self.hidden_size = find_first_matched_value(model_attributes_config, self.HIDDEN_SIZE_NAMES) + self.num_attention_heads = find_first_matched_value(model_attributes_config, self.NUM_ATTENTION_HEADS_NAMES) self.num_key_value_heads = ( - find_first_matched_value(self.config, self.NUM_KEY_VALUE_HEADS_NAMES) or self.num_attention_heads + find_first_matched_value(model_attributes_config, self.NUM_KEY_VALUE_HEADS_NAMES) or self.num_attention_heads ) self.head_dim = ( - find_first_matched_value(self.config, self.HEAD_DIM_NAMES) or self.hidden_size // self.num_attention_heads + find_first_matched_value(model_attributes_config, self.HEAD_DIM_NAMES) or self.hidden_size // self.num_attention_heads ) - self.num_hidden_layers = find_first_matched_value(self.config, self.NUM_HIDDEN_LAYER_NAMES) - self.max_length = find_first_matched_value(self.config, self.MAX_LENGTH) + self.num_hidden_layers = find_first_matched_value(model_attributes_config, self.NUM_HIDDEN_LAYER_NAMES) + self.max_length = find_first_matched_value(model_attributes_config, self.MAX_LENGTH) self._model = None self._layer_wrappers = None @@ -266,6 +277,7 @@ def get_pre_head_layernorm(self, return_name: bool = True): return get_submodules(self.model, self.PRE_HEAD_LAYERNORM, self.model_type, return_name=return_name) def get_layers(self, return_name: bool = True): + logger.error(self.model) return get_submodules(self.model, self.LAYERS, self.model_type, return_name=return_name) def get_layer_wrappers(self): From 6cae95ffb03a3a4a2f17a1a08497e4a696c7e1f9 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Thu, 14 Aug 2025 18:37:24 -0700 Subject: [PATCH 04/24] Work on dataset integration --- examples/gemma3/qnn/gemma3-4b-qnn-config.json | 33 +--- examples/gemma3/qnn/user_script.py | 168 ++++++++++++++++++ 2 files changed, 177 insertions(+), 24 deletions(-) create mode 100644 examples/gemma3/qnn/user_script.py diff --git a/examples/gemma3/qnn/gemma3-4b-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-qnn-config.json index d1efe69d2..39c2b40ea 100644 --- a/examples/gemma3/qnn/gemma3-4b-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-qnn-config.json @@ -9,36 +9,21 @@ }, "data_configs": [ { - "name": "wikitext2_train_joined", - "type": "HuggingfaceContainer", - "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" }, - "pre_process_data_config": { - "strategy": "join", - "add_special_tokens": false, - "max_seq_len": 4096, - "max_samples": 128 - } - }, - { - "name": "wikitext2_train_act", - "type": "HuggingfaceContainer", - "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" }, - "pre_process_data_config": { - "strategy": "line-by-line", - "add_special_tokens": true, - "max_samples": 256, - "max_seq_len": 4096 - } + "name": "gemma_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "gemma_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { "q": { "type": "QuaRot" }, "g": { - "type": "GptqQuantizer", + 
"type": "GptqModel", + "bits": 4, "sym": true, "group_size": -1, - "desc_act": true, - "data_config": "wikitext2_train_joined" + "lm_head": false, + "device": "cuda", + "data_config": "gemma_data_config" }, "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, "mb": { @@ -66,7 +51,7 @@ }, "sq": { "type": "OnnxStaticQuantization", - "data_config": "wikitext2_train_act", + "data_config": "gemma_data_config", "activation_type": "uint16", "precision": "uint8", "calibration_providers": [ "CUDAExecutionProvider" ], diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py new file mode 100644 index 000000000..4fc8e8568 --- /dev/null +++ b/examples/gemma3/qnn/user_script.py @@ -0,0 +1,168 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +import logging +import numpy as np +import os +import torch + +from huggingface_hub import hf_hub_download +from typing import Optional + +from transformers import pipeline +import requests +from PIL import Image + +from transformers import AutoProcessor, LlavaForConditionalGeneration +from transformers import AutoProcessor, Gemma3ForConditionalGeneration +from transformers import AutoConfig, AutoTokenizer +from itertools import chain +from torch.utils.data import DataLoader, Dataset +from datasets import IterableDataset, load_dataset +from transformers import default_data_collator +from PIL import Image as PILImage +from transformers.feature_extraction_utils import BatchFeature +from transformers.image_utils import make_nested_list_of_images + +import numpy as np +from datasets import load_dataset +from torch.utils.data import Dataset +from transformers import CLIPProcessor + +from olive.data.registry import Registry +from torch.utils.data import DataLoader + +logger = logging.getLogger(__name__) + + + +def get_gemma3_dataset(tokenzier, processor, data_files, dataset_path, cache_dir): + def _map1(example): + example['text'] = [_convert_one_conversation(conversation=conversation) for conversation in + example['conversations']] + return example + + def _map2(example): + image = PILImage.open(fp=os.path.join(dataset_path, example["image"])) + example['image_mode'] = image.mode + return example + + def _load_image_and_tokenize(example): + # try: + #print(example['text']) + inputs = processor.apply_chat_template(example['text'][0], + add_generation_prompt=True, tokenize=True, + return_tensors="pt", return_dict=True) + # print("image=", example["image"][0]) + inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} + # inputs.update({"pixel_values": torch.tensor(processor(text="", images=PILImage.open(fp=os.path.join(dataset_path, example["image"][0]))).pixel_values).unsqueeze(0)}) + #print(inputs.keys()) + inputs["input_ids"] = inputs["input_ids"][0] + #print(inputs["input_ids"]) + return inputs + + # except Exception as e: + # print(f"Skipping example due to error: {e}") + # return None + + + dataset = load_dataset("json", data_files=data_files, cache_dir=cache_dir, split='train') + + dataset = dataset.map(_map1) + dataset = dataset.map(_map2) + + dataset = dataset.filter(lambda x: x["image_mode"] == 'RGB') + + return dataset.with_transform(_load_image_and_tokenize) + +class GemmaDataset: + + CACHE_DIR = os.getenv("CACHE_DIR", ".cache") + + def __init__(self, model_id: str, first_n: Optional[int] = 
None): + self.model_id = model_id + self.first_n = first_n + + self.processor = AutoProcessor.from_pretrained(self.model_id) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=None, use_fast=True, trust_remote_code=True) + + self.setup_dataset() + + def setup_dataset(self): + # Uses a LlaVA dataset and transforms it to something Gemma-compatible + + # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K + file_path = hf_hub_download(repo_id="liuhaotian/LLaVA-Instruct-150K", filename="llava_instruct_80k.json", repo_type="dataset") + + + + logger.error(file_path) + logger.error(image_file_path) + self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") + self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) + logger.error(self.raw_datasets) + + # Convert the Llava-style conversation to Gemma-style conversation + self.raw_datasets = self.raw_datasets.map(self._convert_llava_to_gemma_conversation) + for row in self.raw_datasets: + print(row) + + def get_train_dataset(self, first_n: Optional[int] = None): + self.train_dataset = self.raw_datasets if first_n is None else self.raw_datasets[:first_n] + return self.train_dataset + + @staticmethod + def _convert_llava_to_gemma_conversation(entry: dict[str, any]): + entry['text'] = [GemmaDataset._convert_single_llava_to_gemma_conversation(conversation) for conversation in entry["conversations"]] + del entry['conversations'] + return entry + + @staticmethod + def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str]]) -> dict[str, str | list[dict]]: + """Convert a single llava-style conversation entry to Gemma-style. + + Examples: + + >>> conversation = {"from": "human", "value": "What are the colors of the bus in the image?"} + >>> _convert_llava_to_gemma_conversation(conversation) + { + 'role': 'user', + 'content': [{'type': 'image'}, {'type': 'text', 'text': 'What are the colors of the bus in the image?'}] + } + >>> conversation = {"from": "gpt", "value": "The bus in the image is white and red."} + >>> _convert_llava_to_gemma_conversation(conversation) + { + 'role': 'assistant', + 'content': [{'type': 'text', 'text': 'The bus in the image is white and red.'}] + } + """ + who = conversation.get("from") + match who: + case "human": + role = "user" + case "gpt": + role = "assistant" + case _: + raise ValueError(f"Unknown role: {who}") + + text = conversation.get("value") + + if "" in text: + has_image = True + text = text.replace("", "") + else: + has_image = False + + return { + "role": role, + "content": ( + [{"type": "image"}, {"type": "text", "text": text}] if has_image else [{"type": "text", "text": text}] + ), + } + + +@Registry.register_dataset() +def gemma_dataset(model_id: str): + return GemmaDataset(model_id, first_n=5).get_train_dataset() From 2d0872e7b327ba24fb1a45ea9d95666dbe3ffacb Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Fri, 15 Aug 2025 14:54:48 -0700 Subject: [PATCH 05/24] Data processing works --- examples/gemma3/qnn/user_script.py | 110 +++++++++++++++++++++++++---- olive/passes/pytorch/gptqmodel.py | 14 +++- 2 files changed, 109 insertions(+), 15 deletions(-) diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 4fc8e8568..1c10e08c0 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -6,6 +6,8 @@ import logging import numpy as np import os +import subprocess +import zipfile 
import torch from huggingface_hub import hf_hub_download @@ -86,32 +88,88 @@ def __init__(self, model_id: str, first_n: Optional[int] = None): self.first_n = first_n self.processor = AutoProcessor.from_pretrained(self.model_id) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=None, use_fast=True, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True) self.setup_dataset() + def _download_and_extract_images(self): + """ + Downloads the coco train2017 image dataset and extracts them to the cache directory + """ + zip_filename = "train2017.zip" + zip_path = os.path.join(self.CACHE_DIR, zip_filename) + extract_path = os.path.join(self.CACHE_DIR, "train2017") + + # Create cache directory if it doesn't exist + os.makedirs(self.CACHE_DIR, exist_ok=True) + + # Check if images are already downloaded and extracted + if os.path.exists(extract_path) and os.listdir(extract_path): + logger.info(f"Images already exist at {extract_path}") + return extract_path + + # Download the dataset if zip doesn't exist + if not os.path.exists(zip_path): + logger.info(f"Downloading COCO train2017 dataset to {zip_path}") + try: + subprocess.run([ + "wget", + "https://images.cocodataset.org/zips/train2017.zip", + "--no-check-certificate", + "-O", zip_path + ], check=True, cwd=self.CACHE_DIR) + logger.info("Download completed successfully") + except subprocess.CalledProcessError as e: + logger.error(f"Failed to download dataset: {e}") + raise + except FileNotFoundError: + logger.error("wget command not found. Please install wget or use an alternative download method.") + raise + + # Extract the zip file + logger.info(f"Extracting {zip_path} to {self.CACHE_DIR}") + try: + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(self.CACHE_DIR) + logger.info("Extraction completed successfully") + except zipfile.BadZipFile as e: + logger.error(f"Failed to extract zip file: {e}") + # Remove corrupted zip file so it can be re-downloaded + if os.path.exists(zip_path): + os.remove(zip_path) + raise + + return extract_path + def setup_dataset(self): # Uses a LlaVA dataset and transforms it to something Gemma-compatible # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K - file_path = hf_hub_download(repo_id="liuhaotian/LLaVA-Instruct-150K", filename="llava_instruct_80k.json", repo_type="dataset") - - + file_path = hf_hub_download(repo_id="liuhaotian/LLaVA-Instruct-150K", filename="llava_instruct_80k.json", repo_type="dataset", cache_dir=self.CACHE_DIR) - logger.error(file_path) - logger.error(image_file_path) + self.image_data_path = self._download_and_extract_images() self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") + + # Limit data processing to the first_n rows self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) - logger.error(self.raw_datasets) # Convert the Llava-style conversation to Gemma-style conversation self.raw_datasets = self.raw_datasets.map(self._convert_llava_to_gemma_conversation) - for row in self.raw_datasets: - print(row) - def get_train_dataset(self, first_n: Optional[int] = None): - self.train_dataset = self.raw_datasets if first_n is None else self.raw_datasets[:first_n] - return self.train_dataset + # Extract image details using a lambda to pass the dataset_path + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) 
+ + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == 'RGB') + + # Loads the images and tokenizes the text + self.raw_datasets = self.raw_datasets.with_transform(self._load_image_and_tokenize) + + for entry in self.raw_datasets: + logger.error(entry) + + def get_train_dataset(self): + return self.raw_datasets @staticmethod def _convert_llava_to_gemma_conversation(entry: dict[str, any]): @@ -162,7 +220,33 @@ def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str ), } + def _extract_image_details(self, entry: dict[str, any]): + """ + Extract image details from the dataset example. + Opens the image file and adds image mode information to the example. + """ + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) + entry['image_mode'] = image.mode + return entry + + def _load_image_and_tokenize(self, entry: dict[str, any]): + """ + Load image and tokenize the conversation for model input. + + Args: + entry: Dataset entry containing text conversation and image path + + Returns: + Tokenized inputs ready for model processing + """ + inputs = self.processor.apply_chat_template(entry['text'][0], + add_generation_prompt=True, tokenize=True, + return_tensors="pt", return_dict=True) + inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} + inputs["input_ids"] = inputs["input_ids"][0] + return inputs + @Registry.register_dataset() def gemma_dataset(model_id: str): - return GemmaDataset(model_id, first_n=5).get_train_dataset() + return GemmaDataset(model_id, first_n=200).get_train_dataset() diff --git a/olive/passes/pytorch/gptqmodel.py b/olive/passes/pytorch/gptqmodel.py index cb54385f3..eeedf8f62 100644 --- a/olive/passes/pytorch/gptqmodel.py +++ b/olive/passes/pytorch/gptqmodel.py @@ -189,8 +189,18 @@ def get_dataset( raise ValueError("Data config is required for PyTorch model.") data_config = validate_config(data_config, DataConfig) dataloader = data_config.to_data_container().create_dataloader() - # each batch consists of (input_data, labels) - dataset = [data[0] for data in dataloader] + # each batch consists of (input_data, labels) or just input_data + dataset = [] + for data in dataloader: + if isinstance(data, (tuple, list)) and len(data) > 0: + # Standard format: (input_data, labels) + dataset.append(data[0]) + elif isinstance(data, dict): + # Data is already in the expected dictionary format + dataset.append(data) + else: + # Data is the input data directly + dataset.append(data) if ( not dataset or not isinstance(dataset, list) From 6a6f67dd3afda4903b4654d22c7ded23ad5ece97 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Fri, 15 Aug 2025 15:12:22 -0700 Subject: [PATCH 06/24] Fix lint issues and cleanup --- examples/gemma3/qnn/gemma3-4b-qnn-config.json | 2 +- examples/gemma3/qnn/user_script.py | 205 +++++++----------- examples/gemma3/requirements.txt | 6 +- olive/common/hf/wrapper.py | 14 +- 4 files changed, 93 insertions(+), 134 deletions(-) diff --git a/examples/gemma3/qnn/gemma3-4b-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-qnn-config.json index 39c2b40ea..71986d135 100644 --- a/examples/gemma3/qnn/gemma3-4b-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-qnn-config.json @@ -1,5 +1,5 @@ { - "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" , "model_attributes": {"head_dim": 256}}, + "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it", "model_attributes": { "head_dim": 256 } }, "systems": { "qnn_system": { "type": 
"PythonEnvironment", diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 1c10e08c0..4c62fa735 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -4,185 +4,141 @@ # -------------------------------------------------------------------------- import logging -import numpy as np import os import subprocess import zipfile -import torch - -from huggingface_hub import hf_hub_download +from pathlib import Path from typing import Optional -from transformers import pipeline -import requests -from PIL import Image - -from transformers import AutoProcessor, LlavaForConditionalGeneration -from transformers import AutoProcessor, Gemma3ForConditionalGeneration -from transformers import AutoConfig, AutoTokenizer -from itertools import chain -from torch.utils.data import DataLoader, Dataset -from datasets import IterableDataset, load_dataset -from transformers import default_data_collator -from PIL import Image as PILImage -from transformers.feature_extraction_utils import BatchFeature -from transformers.image_utils import make_nested_list_of_images - -import numpy as np from datasets import load_dataset -from torch.utils.data import Dataset -from transformers import CLIPProcessor +from huggingface_hub import hf_hub_download +from PIL import Image as PILImage +from transformers import ( + AutoProcessor, + AutoTokenizer, +) from olive.data.registry import Registry -from torch.utils.data import DataLoader logger = logging.getLogger(__name__) - -def get_gemma3_dataset(tokenzier, processor, data_files, dataset_path, cache_dir): - def _map1(example): - example['text'] = [_convert_one_conversation(conversation=conversation) for conversation in - example['conversations']] - return example - - def _map2(example): - image = PILImage.open(fp=os.path.join(dataset_path, example["image"])) - example['image_mode'] = image.mode - return example - - def _load_image_and_tokenize(example): - # try: - #print(example['text']) - inputs = processor.apply_chat_template(example['text'][0], - add_generation_prompt=True, tokenize=True, - return_tensors="pt", return_dict=True) - # print("image=", example["image"][0]) - inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} - # inputs.update({"pixel_values": torch.tensor(processor(text="", images=PILImage.open(fp=os.path.join(dataset_path, example["image"][0]))).pixel_values).unsqueeze(0)}) - #print(inputs.keys()) - inputs["input_ids"] = inputs["input_ids"][0] - #print(inputs["input_ids"]) - return inputs - - # except Exception as e: - # print(f"Skipping example due to error: {e}") - # return None - - - dataset = load_dataset("json", data_files=data_files, cache_dir=cache_dir, split='train') - - dataset = dataset.map(_map1) - dataset = dataset.map(_map2) - - dataset = dataset.filter(lambda x: x["image_mode"] == 'RGB') - - return dataset.with_transform(_load_image_and_tokenize) - class GemmaDataset: - CACHE_DIR = os.getenv("CACHE_DIR", ".cache") - + def __init__(self, model_id: str, first_n: Optional[int] = None): self.model_id = model_id self.first_n = first_n - + self.processor = AutoProcessor.from_pretrained(self.model_id) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) self.setup_dataset() def _download_and_extract_images(self): - """ - Downloads the coco train2017 image dataset and extracts 
them to the cache directory - """ + """Download the COCO train2017 image dataset and extract to the cache directory.""" zip_filename = "train2017.zip" zip_path = os.path.join(self.CACHE_DIR, zip_filename) extract_path = os.path.join(self.CACHE_DIR, "train2017") - + # Create cache directory if it doesn't exist os.makedirs(self.CACHE_DIR, exist_ok=True) - + # Check if images are already downloaded and extracted - if os.path.exists(extract_path) and os.listdir(extract_path): - logger.info(f"Images already exist at {extract_path}") + extract_path_obj = Path(extract_path) + if extract_path_obj.exists() and any(extract_path_obj.iterdir()): + logger.info("Images already exist at %s", extract_path) return extract_path - + # Download the dataset if zip doesn't exist if not os.path.exists(zip_path): - logger.info(f"Downloading COCO train2017 dataset to {zip_path}") + logger.info("Downloading COCO train2017 dataset to %s", zip_path) try: - subprocess.run([ - "wget", - "https://images.cocodataset.org/zips/train2017.zip", - "--no-check-certificate", - "-O", zip_path - ], check=True, cwd=self.CACHE_DIR) + subprocess.run( + [ + "wget", + "https://images.cocodataset.org/zips/train2017.zip", + "--no-check-certificate", + "-O", + zip_path, + ], + check=True, + cwd=self.CACHE_DIR, + ) logger.info("Download completed successfully") - except subprocess.CalledProcessError as e: - logger.error(f"Failed to download dataset: {e}") + except subprocess.CalledProcessError: + logger.exception("Failed to download dataset") raise except FileNotFoundError: - logger.error("wget command not found. Please install wget or use an alternative download method.") + logger.exception("wget command not found. Please install wget or use an alternative download method.") raise - + # Extract the zip file - logger.info(f"Extracting {zip_path} to {self.CACHE_DIR}") + logger.info("Extracting %s to %s", zip_path, self.CACHE_DIR) try: - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(self.CACHE_DIR) logger.info("Extraction completed successfully") - except zipfile.BadZipFile as e: - logger.error(f"Failed to extract zip file: {e}") + except zipfile.BadZipFile: + logger.exception("Failed to extract zip file") # Remove corrupted zip file so it can be re-downloaded if os.path.exists(zip_path): os.remove(zip_path) raise - + return extract_path def setup_dataset(self): - # Uses a LlaVA dataset and transforms it to something Gemma-compatible + # Uses a LlaVA dataset and transforms it to something Gemma-compatible - # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K - file_path = hf_hub_download(repo_id="liuhaotian/LLaVA-Instruct-150K", filename="llava_instruct_80k.json", repo_type="dataset", cache_dir=self.CACHE_DIR) + # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K + file_path = hf_hub_download( + repo_id="liuhaotian/LLaVA-Instruct-150K", + filename="llava_instruct_80k.json", + repo_type="dataset", + cache_dir=self.CACHE_DIR, + ) - self.image_data_path = self._download_and_extract_images() - self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") + self.image_data_path = self._download_and_extract_images() + self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") - # Limit data processing to the first_n rows - self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) + # 
Limit data processing to the first_n rows + self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) - # Convert the Llava-style conversation to Gemma-style conversation - self.raw_datasets = self.raw_datasets.map(self._convert_llava_to_gemma_conversation) + # Convert the Llava-style conversation to Gemma-style conversation + self.raw_datasets = self.raw_datasets.map(self._convert_llava_to_gemma_conversation) - # Extract image details using a lambda to pass the dataset_path - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + # Extract image details using a lambda to pass the dataset_path + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == 'RGB') + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - # Loads the images and tokenizes the text - self.raw_datasets = self.raw_datasets.with_transform(self._load_image_and_tokenize) + # Loads the images and tokenizes the text + self.raw_datasets = self.raw_datasets.with_transform(self._load_image_and_tokenize) - for entry in self.raw_datasets: - logger.error(entry) + for entry in self.raw_datasets: + logger.error(entry) def get_train_dataset(self): return self.raw_datasets - + @staticmethod def _convert_llava_to_gemma_conversation(entry: dict[str, any]): - entry['text'] = [GemmaDataset._convert_single_llava_to_gemma_conversation(conversation) for conversation in entry["conversations"]] - del entry['conversations'] + entry["text"] = [ + GemmaDataset._convert_single_llava_to_gemma_conversation(conversation) + for conversation in entry["conversations"] + ] + del entry["conversations"] return entry - + @staticmethod def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str]]) -> dict[str, str | list[dict]]: """Convert a single llava-style conversation entry to Gemma-style. Examples: - >>> conversation = {"from": "human", "value": "What are the colors of the bus in the image?"} >>> _convert_llava_to_gemma_conversation(conversation) { @@ -195,6 +151,7 @@ def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The bus in the image is white and red.'}] } + """ who = conversation.get("from") match who: @@ -219,29 +176,29 @@ def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str [{"type": "image"}, {"type": "text", "text": text}] if has_image else [{"type": "text", "text": text}] ), } - + def _extract_image_details(self, entry: dict[str, any]): - """ - Extract image details from the dataset example. + """Extract image details from the dataset example. + Opens the image file and adds image mode information to the example. """ image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) - entry['image_mode'] = image.mode + entry["image_mode"] = image.mode return entry def _load_image_and_tokenize(self, entry: dict[str, any]): - """ - Load image and tokenize the conversation for model input. - + """Load image and tokenize the conversation for model input. 
+ Args: entry: Dataset entry containing text conversation and image path - + Returns: Tokenized inputs ready for model processing + """ - inputs = self.processor.apply_chat_template(entry['text'][0], - add_generation_prompt=True, tokenize=True, - return_tensors="pt", return_dict=True) + inputs = self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} inputs["input_ids"] = inputs["input_ids"][0] return inputs diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt index c51bff135..0b56b7908 100644 --- a/examples/gemma3/requirements.txt +++ b/examples/gemma3/requirements.txt @@ -1,5 +1,5 @@ datasets -transformers -optimum -onnxruntime-gpu==1.21.1 onnxruntime-genai-cuda==0.7.1 +onnxruntime-gpu==1.21.1 +optimum +transformers diff --git a/olive/common/hf/wrapper.py b/olive/common/hf/wrapper.py index 8bb832c21..6877d7720 100644 --- a/olive/common/hf/wrapper.py +++ b/olive/common/hf/wrapper.py @@ -9,8 +9,8 @@ from torch import nn from transformers import PretrainedConfig -from olive.common.utils import find_first_matched_value, get_attr, replace_submodules, set_attr from olive.common.hf.utils import get_model_attributes_config +from olive.common.utils import find_first_matched_value, get_attr, replace_submodules, set_attr if TYPE_CHECKING: from transformers import PreTrainedModel @@ -213,9 +213,9 @@ class ModelWrapper: LM_HEAD = {"default": "lm_head"} PRE_HEAD_LAYERNORM = { "default": "model.norm", - "gemma3": "model.language_model.norm", - "gpt2": "transformer.ln_f", - "qwen": "transformer.ln_f" + "gemma3": "model.language_model.norm", + "gpt2": "transformer.ln_f", + "qwen": "transformer.ln_f", } LAYERS = { "default": "model.layers", @@ -240,10 +240,12 @@ def __init__(self, config: Union[PretrainedConfig, dict]): self.hidden_size = find_first_matched_value(model_attributes_config, self.HIDDEN_SIZE_NAMES) self.num_attention_heads = find_first_matched_value(model_attributes_config, self.NUM_ATTENTION_HEADS_NAMES) self.num_key_value_heads = ( - find_first_matched_value(model_attributes_config, self.NUM_KEY_VALUE_HEADS_NAMES) or self.num_attention_heads + find_first_matched_value(model_attributes_config, self.NUM_KEY_VALUE_HEADS_NAMES) + or self.num_attention_heads ) self.head_dim = ( - find_first_matched_value(model_attributes_config, self.HEAD_DIM_NAMES) or self.hidden_size // self.num_attention_heads + find_first_matched_value(model_attributes_config, self.HEAD_DIM_NAMES) + or self.hidden_size // self.num_attention_heads ) self.num_hidden_layers = find_first_matched_value(model_attributes_config, self.NUM_HIDDEN_LAYER_NAMES) self.max_length = find_first_matched_value(model_attributes_config, self.MAX_LENGTH) From cd24ddf938cde13b922e038515fb50568fbc1901 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Mon, 18 Aug 2025 13:43:56 -0700 Subject: [PATCH 07/24] Adding vision resources --- examples/gemma3/qnn/README.md | 12 +++-- ...ig.json => gemma3-4b-text-qnn-config.json} | 12 ++--- .../qnn/gemma3-4b-vision-qnn-config.json | 45 +++++++++++++++++++ examples/gemma3/qnn/user_script.py | 11 ++++- 4 files changed, 70 insertions(+), 10 deletions(-) rename examples/gemma3/qnn/{gemma3-4b-qnn-config.json => gemma3-4b-text-qnn-config.json} (88%) create mode 100644 examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md index edfc0ac0a..b7ff54fb4 100644 --- 
a/examples/gemma3/qnn/README.md +++ b/examples/gemma3/qnn/README.md @@ -6,7 +6,7 @@ This repository demonstrates the optimization of the [Google Gemma-3-4B](https:/ Requirements: * Python 3.10 -* uv +* uv - Used throughout the setup scripts, please follow the [publically available installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the tutorial above: @@ -16,8 +16,14 @@ source env_setup.sh ## Optimization Process -Run the following command in your Olive environment after completing the above setup steps: +Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work in concert at the onnxruntime-genai stage. + +Thus, the following commands should be used to separately produce context binaries for the text and vision portions of the model, respectively. + +```bash +olive run --config gemma3-4b-text-qnn-config.json +``` ```bash -olive run --config gemma3-4b-qnn-config.json +olive run --config gemma3-4b-vision-qnn-config.json ``` diff --git a/examples/gemma3/qnn/gemma3-4b-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json similarity index 88% rename from examples/gemma3/qnn/gemma3-4b-qnn-config.json rename to examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 71986d135..d2eff5678 100644 --- a/examples/gemma3/qnn/gemma3-4b-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -1,5 +1,5 @@ { - "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it", "model_attributes": { "head_dim": 256 } }, + "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" }, "systems": { "qnn_system": { "type": "PythonEnvironment", @@ -9,9 +9,9 @@ }, "data_configs": [ { - "name": "gemma_data_config", + "name": "gemma_text_data_config", "user_script": "user_script.py", - "load_dataset_config": { "type": "gemma_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { "type": "gemma_text_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { @@ -23,7 +23,7 @@ "group_size": -1, "lm_head": false, "device": "cuda", - "data_config": "gemma_data_config" + "data_config": "gemma_text_data_config" }, "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, "mb": { @@ -51,7 +51,7 @@ }, "sq": { "type": "OnnxStaticQuantization", - "data_config": "gemma_data_config", + "data_config": "gemma_text_data_config", "activation_type": "uint16", "precision": "uint8", "calibration_providers": [ "CUDAExecutionProvider" ], @@ -74,7 +74,7 @@ }, "target": "qnn_system", "log_severity_level": 1, - "output_dir": "models/gemma-3-4b-it", + "output_dir": "models/gemma-3-4b-it-text", "cache_dir": "cache", "no_artifacts": true } diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json new file mode 100644 index 000000000..803000d36 --- /dev/null +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -0,0 +1,45 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google/gemma-3-4b-it", + "io_config": { + "input_names": [ "input_ids", "pixel_values", "attention_mask" ], + "input_shapes": [ [ 10, 77 ], [ 1, 3, 224, 224 ], [ 10, 77 ] ], + "input_types": [ "int64", "float32", "int64" ], + "output_names": [ 
"logits_per_image" ], + "output_shapes": [ [ 1, 2 ] ] + } + }, + "systems": { + "qnn_system": { + "type": "PythonEnvironment", + "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "gemma_vision_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "gemma_vision_dataset", "model_id": "google/gemma-3-4b-it" } + } + ], + "passes": { + "conversion": { "type": "OnnxConversion", "target_opset": 17 }, + "quantization": { + "type": "OnnxStaticQuantization", + "quant_preprocess": true, + "data_config": "gemma_vision_data_config", + "op_types_to_quantize": [ "MatMul", "LayerNormalization", "Gemm", "Sigmoid", "Gelu" ], + "activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax" + }, + "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } + }, + "target": "qnn_system", + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it-vision", + "cache_dir": "cache", + "no_artifacts": true +} diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 4c62fa735..816db2e12 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -203,7 +203,16 @@ def _load_image_and_tokenize(self, entry: dict[str, any]): inputs["input_ids"] = inputs["input_ids"][0] return inputs +SHORTCUT_FIRST_N = 256 @Registry.register_dataset() def gemma_dataset(model_id: str): - return GemmaDataset(model_id, first_n=200).get_train_dataset() + return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N).get_train_dataset() + +@Registry.register_dataset() +def gemma_text_dataset(model_id: str): + return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter='text').get_train_dataset + +@Registry.register_dataset() +def gemma_vision_dataset(model_id: str): + return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter='images').get_train_dataset() From 636e982f77e478d4dcd592b86d19cfc634b52bec Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Mon, 18 Aug 2025 18:22:19 -0700 Subject: [PATCH 08/24] Add Gemma3 vision configurations --- .../gemma3/qnn/custom_gemma3_4b_it_vision.py | 18 ++++++++++++++++++ .../qnn/gemma3-4b-vision-qnn-config.json | 16 +++++++++------- olive/model/handler/hf.py | 2 ++ olive/passes/onnx/conversion.py | 14 +++++++------- 4 files changed, 36 insertions(+), 14 deletions(-) create mode 100644 examples/gemma3/qnn/custom_gemma3_4b_it_vision.py diff --git a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py new file mode 100644 index 000000000..686de4395 --- /dev/null +++ b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py @@ -0,0 +1,18 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + + +import torch +from transformers import AutoModel + +def load_gemma3_model(model_path): + return AutoModel.from_pretrained("google/gemma-3-4b-it") + +def get_dummy_inputs(model_handler): + return { + "input_ids": torch.full((1, 256), 262144, dtype=torch.long), # Image token ID + "pixel_values": torch.randn(1, 3, 896, 896, dtype=torch.float32), + "attention_mask": torch.ones((1, 256), dtype=torch.long) + } \ No newline at end of file diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index 803000d36..f09a76885 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -1,13 +1,15 @@ { "input_model": { - "type": "HfModel", - "model_path": "google/gemma-3-4b-it", + "type": "PyTorchModel", + "model_script": "custom_gemma3_4b_it_vision.py", + "model_loader": "load_gemma3_model", + "dummy_inputs_func": "get_dummy_inputs", "io_config": { - "input_names": [ "input_ids", "pixel_values", "attention_mask" ], - "input_shapes": [ [ 10, 77 ], [ 1, 3, 224, 224 ], [ 10, 77 ] ], - "input_types": [ "int64", "float32", "int64" ], - "output_names": [ "logits_per_image" ], - "output_shapes": [ [ 1, 2 ] ] + "input_names": ["input_ids", "pixel_values", "attention_mask"], + "input_shapes": [[1, 256], [1, 3, 896, 896], [1, 256]], + "input_types": ["int64", "float32", "int64"], + "output_names": ["last_hidden_state"], + "output_shapes": [[1, 256, 2560]] } }, "systems": { diff --git a/olive/model/handler/hf.py b/olive/model/handler/hf.py index a56a1aab6..bf46d7417 100644 --- a/olive/model/handler/hf.py +++ b/olive/model/handler/hf.py @@ -82,6 +82,8 @@ def load_model(self, rank: int = None, cache_model: bool = True) -> "torch.nn.Mo self.model = model if cache_model else None + logger.error(self.model) + return model @property diff --git a/olive/passes/onnx/conversion.py b/olive/passes/onnx/conversion.py index 9af4c0187..e9ff76799 100644 --- a/olive/passes/onnx/conversion.py +++ b/olive/passes/onnx/conversion.py @@ -49,7 +49,7 @@ def forward(self, *input_data, **input_dict): return self.model(*input_data, **input_dict) -class OnnxConversion(Pass): +class (Pass): """Convert a PyTorch model to ONNX model using torch.onnx.export on CPU.""" @classmethod @@ -212,7 +212,7 @@ def _export_pytorch_model( pytorch_model = pytorch_model.to(torch_dtype) # Apply any necessary patches - OnnxConversion._patch_model_if_necessary(pytorch_model) + ._patch_model_if_necessary(pytorch_model) # get input and output names, and dynamic axes assert io_config is not None, "Cannot get io_config for the model." 
@@ -502,7 +502,7 @@ def _convert_model_on_device( dummy_inputs = self._get_dummy_inputs(model, config) io_config = model.io_config - converted_onnx_model = OnnxConversion._export_pytorch_model( + converted_onnx_model = ._export_pytorch_model( pytorch_model, dummy_inputs, io_config, config, device, torch_dtype, tempfile.tempdir ) @@ -570,11 +570,11 @@ def _export_ranked_model(params): input_model = DistributedHfModelHandler(**model_config) olive_pytorch_model = input_model.load_model(local_rank) - dummy_inputs = OnnxConversion._get_dummy_inputs(olive_pytorch_model, pass_config) + dummy_inputs = ._get_dummy_inputs(olive_pytorch_model, pass_config) io_config = None if pass_config.use_dynamo_exporter else olive_pytorch_model.io_config pytorch_model = olive_pytorch_model.prepare_session(rank=local_rank) - ranked_onnx_modelproto = OnnxConversion._export_pytorch_model( + ranked_onnx_modelproto = ._export_pytorch_model( pytorch_model, dummy_inputs, io_config, @@ -621,11 +621,11 @@ def _convert_distributed_model_on_device( max_parallel_jobs = min(world_size, config.parallel_jobs or multiprocessing.cpu_count()) if max_parallel_jobs <= 1: - results = [OnnxConversion._export_ranked_model(_) for _ in params] + results = [._export_ranked_model(_) for _ in params] else: context = multiprocessing.get_context("spawn") with context.Pool(processes=max_parallel_jobs) as pool: - results = pool.map(OnnxConversion._export_ranked_model, params) + results = pool.map(._export_ranked_model, params) if world_size != sum(results): raise RuntimeError("Failed to convert models") From b4ea7a3509e18d7723deea87207edf5ca218b412 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Mon, 18 Aug 2025 18:27:35 -0700 Subject: [PATCH 09/24] Fix linting error --- examples/gemma3/qnn/custom_gemma3_4b_it_vision.py | 6 ++++-- .../gemma3/qnn/gemma3-4b-vision-qnn-config.json | 14 +++++++------- examples/gemma3/qnn/user_script.py | 8 ++++++-- olive/passes/onnx/conversion.py | 14 +++++++------- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py index 686de4395..a969adecb 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py @@ -7,12 +7,14 @@ import torch from transformers import AutoModel + def load_gemma3_model(model_path): return AutoModel.from_pretrained("google/gemma-3-4b-it") + def get_dummy_inputs(model_handler): return { "input_ids": torch.full((1, 256), 262144, dtype=torch.long), # Image token ID "pixel_values": torch.randn(1, 3, 896, 896, dtype=torch.float32), - "attention_mask": torch.ones((1, 256), dtype=torch.long) - } \ No newline at end of file + "attention_mask": torch.ones((1, 256), dtype=torch.long), + } diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index f09a76885..fb8dba200 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -1,16 +1,16 @@ { - "input_model": { + "input_model": { "type": "PyTorchModel", "model_script": "custom_gemma3_4b_it_vision.py", "model_loader": "load_gemma3_model", "dummy_inputs_func": "get_dummy_inputs", "io_config": { - "input_names": ["input_ids", "pixel_values", "attention_mask"], - "input_shapes": [[1, 256], [1, 3, 896, 896], [1, 256]], - "input_types": ["int64", "float32", "int64"], - "output_names": ["last_hidden_state"], - "output_shapes": [[1, 256, 2560]] - } + 
"input_names": [ "input_ids", "pixel_values", "attention_mask" ], + "input_shapes": [ [ 1, 256 ], [ 1, 3, 896, 896 ], [ 1, 256 ] ], + "input_types": [ "int64", "float32", "int64" ], + "output_names": [ "last_hidden_state" ], + "output_shapes": [ [ 1, 256, 2560 ] ] + } }, "systems": { "qnn_system": { diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 816db2e12..b5d36ead1 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -203,16 +203,20 @@ def _load_image_and_tokenize(self, entry: dict[str, any]): inputs["input_ids"] = inputs["input_ids"][0] return inputs + SHORTCUT_FIRST_N = 256 + @Registry.register_dataset() def gemma_dataset(model_id: str): return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N).get_train_dataset() + @Registry.register_dataset() def gemma_text_dataset(model_id: str): - return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter='text').get_train_dataset + return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter="text").get_train_dataset + @Registry.register_dataset() def gemma_vision_dataset(model_id: str): - return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter='images').get_train_dataset() + return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter="images").get_train_dataset() diff --git a/olive/passes/onnx/conversion.py b/olive/passes/onnx/conversion.py index e9ff76799..9af4c0187 100644 --- a/olive/passes/onnx/conversion.py +++ b/olive/passes/onnx/conversion.py @@ -49,7 +49,7 @@ def forward(self, *input_data, **input_dict): return self.model(*input_data, **input_dict) -class (Pass): +class OnnxConversion(Pass): """Convert a PyTorch model to ONNX model using torch.onnx.export on CPU.""" @classmethod @@ -212,7 +212,7 @@ def _export_pytorch_model( pytorch_model = pytorch_model.to(torch_dtype) # Apply any necessary patches - ._patch_model_if_necessary(pytorch_model) + OnnxConversion._patch_model_if_necessary(pytorch_model) # get input and output names, and dynamic axes assert io_config is not None, "Cannot get io_config for the model." 
@@ -502,7 +502,7 @@ def _convert_model_on_device( dummy_inputs = self._get_dummy_inputs(model, config) io_config = model.io_config - converted_onnx_model = ._export_pytorch_model( + converted_onnx_model = OnnxConversion._export_pytorch_model( pytorch_model, dummy_inputs, io_config, config, device, torch_dtype, tempfile.tempdir ) @@ -570,11 +570,11 @@ def _export_ranked_model(params): input_model = DistributedHfModelHandler(**model_config) olive_pytorch_model = input_model.load_model(local_rank) - dummy_inputs = ._get_dummy_inputs(olive_pytorch_model, pass_config) + dummy_inputs = OnnxConversion._get_dummy_inputs(olive_pytorch_model, pass_config) io_config = None if pass_config.use_dynamo_exporter else olive_pytorch_model.io_config pytorch_model = olive_pytorch_model.prepare_session(rank=local_rank) - ranked_onnx_modelproto = ._export_pytorch_model( + ranked_onnx_modelproto = OnnxConversion._export_pytorch_model( pytorch_model, dummy_inputs, io_config, @@ -621,11 +621,11 @@ def _convert_distributed_model_on_device( max_parallel_jobs = min(world_size, config.parallel_jobs or multiprocessing.cpu_count()) if max_parallel_jobs <= 1: - results = [._export_ranked_model(_) for _ in params] + results = [OnnxConversion._export_ranked_model(_) for _ in params] else: context = multiprocessing.get_context("spawn") with context.Pool(processes=max_parallel_jobs) as pool: - results = pool.map(._export_ranked_model, params) + results = pool.map(OnnxConversion._export_ranked_model, params) if world_size != sum(results): raise RuntimeError("Failed to convert models") From 1f69af3939e780fea976676dddf254fa21ede420 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Tue, 19 Aug 2025 16:08:52 -0700 Subject: [PATCH 10/24] Vision model onnx conversion working --- examples/gemma3/qnn/README.md | 4 +- .../gemma3/qnn/custom_gemma3_4b_it_vision.py | 32 +- examples/gemma3/qnn/env_setup.sh | 5 + .../qnn/gemma3-4b-vision-qnn-config.json | 20 +- examples/gemma3/qnn/user_script.py | 277 +++++++++++++----- 5 files changed, 247 insertions(+), 91 deletions(-) diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md index b7ff54fb4..6fb3e3cb6 100644 --- a/examples/gemma3/qnn/README.md +++ b/examples/gemma3/qnn/README.md @@ -8,7 +8,7 @@ Requirements: * Python 3.10 * uv - Used throughout the setup scripts, please follow the [publically available installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) -This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the tutorial above: +This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the Phi-3.5 tutorial above: ```bash source env_setup.sh @@ -16,7 +16,7 @@ source env_setup.sh ## Optimization Process -Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work in concert at the onnxruntime-genai stage. +Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models before configuring them to work in concert at the onnxruntime-genai stage. Thus, the following commands should be used to separately produce context binaries for the text and vision portions of the model, respectively. 
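As a quick sanity check before running the full pipeline, the vision-only wrapper introduced in this patch can be exercised directly in PyTorch to confirm it emits the `[1, 256, 2560]` image features declared in the vision config. This is a minimal sketch, not part of the committed example; it assumes it is run from `examples/gemma3/qnn/` with enough memory to load the full checkpoint, and uses the `load_gemma3_model` entry point as it is named at this point in the series:

```python
import torch

# Loader added in this patch; returns the Gemma3VisualEmbeddingGenerator wrapper
from custom_gemma3_4b_it_vision import load_gemma3_model

vision_model = load_gemma3_model(None).eval()

# Dummy input matching the io_config: pixel_values of shape [1, 3, 896, 896]
pixel_values = torch.randn(1, 3, 896, 896, dtype=torch.float32)

with torch.no_grad():
    image_features = vision_model(pixel_values)

# Expected to report torch.Size([1, 256, 2560]), matching the declared output shape
print(image_features.shape)
```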
diff --git a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py index a969adecb..c0d35ecb5 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py @@ -4,17 +4,33 @@ # -------------------------------------------------------------------------- +import logging + import torch from transformers import AutoModel +logger = logging.getLogger(__name__) -def load_gemma3_model(model_path): - return AutoModel.from_pretrained("google/gemma-3-4b-it") +class Gemma3VisualEmbeddingGenerator(torch.nn.Module): + def __init__(self, full_model): + super().__init__() + # Extract only the vision components + self.vision_tower = full_model.vision_tower + self.multi_modal_projector = full_model.multi_modal_projector + + def forward(self, pixel_values): + # Process images through vision tower + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + # Project to final embedding space + return self.multi_modal_projector(selected_image_feature) + + +def load_gemma3_model(model_path): + full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") + logger.info("Loaded full model: %s", full_model) -def get_dummy_inputs(model_handler): - return { - "input_ids": torch.full((1, 256), 262144, dtype=torch.long), # Image token ID - "pixel_values": torch.randn(1, 3, 896, 896, dtype=torch.float32), - "attention_mask": torch.ones((1, 256), dtype=torch.long), - } + vision_model = Gemma3VisualEmbeddingGenerator(full_model) + logger.info("Created vision-only model: %s", vision_model) + return vision_model diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh index 03a3a9993..bc799d110 100644 --- a/examples/gemma3/qnn/env_setup.sh +++ b/examples/gemma3/qnn/env_setup.sh @@ -1,3 +1,8 @@ +#!/bin/bash +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- # Installing setuptools to build Olive from source uv pip install setuptools diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index fb8dba200..cb2860fd7 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -3,12 +3,11 @@ "type": "PyTorchModel", "model_script": "custom_gemma3_4b_it_vision.py", "model_loader": "load_gemma3_model", - "dummy_inputs_func": "get_dummy_inputs", "io_config": { - "input_names": [ "input_ids", "pixel_values", "attention_mask" ], - "input_shapes": [ [ 1, 256 ], [ 1, 3, 896, 896 ], [ 1, 256 ] ], - "input_types": [ "int64", "float32", "int64" ], - "output_names": [ "last_hidden_state" ], + "input_names": [ "pixel_values" ], + "input_shapes": [ [ 1, 3, 896, 896 ] ], + "input_types": [ "float32" ], + "output_names": [ "image_features" ], "output_shapes": [ [ 1, 256, 2560 ] ] } }, @@ -27,16 +26,23 @@ } ], "passes": { - "conversion": { "type": "OnnxConversion", "target_opset": 17 }, + "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] }, "quantization": { "type": "OnnxStaticQuantization", "quant_preprocess": true, "data_config": "gemma_vision_data_config", - "op_types_to_quantize": [ "MatMul", "LayerNormalization", "Gemm", "Sigmoid", "Gelu" ], "activation_type": "uint16", "precision": "uint8", "calibrate_method": "MinMax" }, + "cb": { + "type": "EPContextBinaryGenerator", + "provider_options": { + "htp_graph_finalization_optimization_mode": "3", + "offload_graph_io_quantization": "0" + } + }, "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } }, "target": "qnn_system", diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index b5d36ead1..2388cbe7f 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -7,6 +7,7 @@ import os import subprocess import zipfile +from abc import ABC, abstractmethod from pathlib import Path from typing import Optional @@ -23,20 +24,101 @@ logger = logging.getLogger(__name__) -class GemmaDataset: +class BaseGemmaDataset(ABC): + """Abstract base class for Gemma dataset implementations.""" + CACHE_DIR = os.getenv("CACHE_DIR", ".cache") def __init__(self, model_id: str, first_n: Optional[int] = None): self.model_id = model_id self.first_n = first_n - self.processor = AutoProcessor.from_pretrained(self.model_id) - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True - ) + + # Initialize attributes that will be set during dataset loading + self.image_data_path = None + self.raw_datasets = None + + # Initialize processor components based on subclass requirements + self._initialize_processor_components() self.setup_dataset() + @abstractmethod + def _initialize_processor_components(self): + """Initialize processor components specific to the dataset type.""" + + @abstractmethod + def _process_dataset_entry(self, entry: dict[str, any]): + """Process a single dataset entry according to the dataset type.""" + + def _convert_single_llava_to_gemma_conversation( + self, conversation: list[dict[str, str]], strip_images: bool = False + ) -> dict[str, str | list[dict]]: + """Convert a single llava-style conversation entry to Gemma-style. 
+ + Args: + conversation: The conversation entry to convert + strip_images: If True, remove tokens and create text-only content. + If False, preserve tokens and create multimodal content. + + Examples: + >>> conversation = {"from": "human", "value": "What are the colors of the bus in the image?"} + >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=False) + { + 'role': 'user', + 'content': [{'type': 'image'}, {'type': 'text', 'text': 'What are the colors of the bus in the image?'}] + } + >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=True) + { + 'role': 'user', + 'content': [{'type': 'text', 'text': 'What are the colors of the bus in the image?'}] + } + + """ + who = conversation.get("from") + match who: + case "human": + role = "user" + case "gpt": + role = "assistant" + case _: + raise ValueError(f"Unknown role: {who}") + + text = conversation.get("value") + + if strip_images: + # Text-only: remove image references completely + text = text.replace("", "").strip() + return { + "role": role, + "content": [{"type": "text", "text": text}], + } + else: + # Multimodal: preserve image references + if "" in text: + has_image = True + text = text.replace("", "") + else: + has_image = False + + return { + "role": role, + "content": ( + [{"type": "image"}, {"type": "text", "text": text}] + if has_image + else [{"type": "text", "text": text}] + ), + } + + def _convert_llava_to_gemma_conversation(self, entry: dict[str, any], strip_images: bool = False): + """Convert LlaVA-style conversations to Gemma-style.""" + entry["text"] = [ + self._convert_single_llava_to_gemma_conversation(conversation, strip_images=strip_images) + for conversation in entry["conversations"] + ] + del entry["conversations"] + return entry + def _download_and_extract_images(self): """Download the COCO train2017 image dataset and extract to the cache directory.""" zip_filename = "train2017.zip" @@ -90,9 +172,8 @@ def _download_and_extract_images(self): return extract_path - def setup_dataset(self): - # Uses a LlaVA dataset and transforms it to something Gemma-compatible - + def _load_base_dataset(self): + """Load the base LlaVA dataset.""" # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K file_path = hf_hub_download( repo_id="liuhaotian/LLaVA-Instruct-150K", @@ -107,86 +188,67 @@ def setup_dataset(self): # Limit data processing to the first_n rows self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) - # Convert the Llava-style conversation to Gemma-style conversation - self.raw_datasets = self.raw_datasets.map(self._convert_llava_to_gemma_conversation) + def _extract_image_details(self, entry: dict[str, any]): + """Extract image details from the dataset example. + + Opens the image file and adds image mode information to the example. 
+ """ + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) + entry["image_mode"] = image.mode + return entry + + def setup_dataset(self): + """Set up the dataset with common preprocessing steps.""" + self._load_base_dataset() - # Extract image details using a lambda to pass the dataset_path + # Extract image details self.raw_datasets = self.raw_datasets.map(self._extract_image_details) # Filter out any images that are not RGB self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - # Loads the images and tokenizes the text - self.raw_datasets = self.raw_datasets.with_transform(self._load_image_and_tokenize) + # Apply dataset-specific processing + logger.error(self.raw_datasets[0]) + logger.error(self.raw_datasets[1]) - for entry in self.raw_datasets: - logger.error(entry) + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - def get_train_dataset(self): - return self.raw_datasets + logger.error(self.raw_datasets[0]) + logger.error(self.raw_datasets[1]) - @staticmethod - def _convert_llava_to_gemma_conversation(entry: dict[str, any]): - entry["text"] = [ - GemmaDataset._convert_single_llava_to_gemma_conversation(conversation) - for conversation in entry["conversations"] - ] - del entry["conversations"] - return entry + def get_dataset(self): + """Return the processed dataset.""" + return self.raw_datasets - @staticmethod - def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str]]) -> dict[str, str | list[dict]]: - """Convert a single llava-style conversation entry to Gemma-style. - Examples: - >>> conversation = {"from": "human", "value": "What are the colors of the bus in the image?"} - >>> _convert_llava_to_gemma_conversation(conversation) - { - 'role': 'user', - 'content': [{'type': 'image'}, {'type': 'text', 'text': 'What are the colors of the bus in the image?'}] - } - >>> conversation = {"from": "gpt", "value": "The bus in the image is white and red."} - >>> _convert_llava_to_gemma_conversation(conversation) - { - 'role': 'assistant', - 'content': [{'type': 'text', 'text': 'The bus in the image is white and red.'}] - } +class GemmaMultimodalDataset(BaseGemmaDataset): + """Dataset for full E2E Gemma 3 multi-modal model including both image and text.""" - """ - who = conversation.get("from") - match who: - case "human": - role = "user" - case "gpt": - role = "assistant" - case _: - raise ValueError(f"Unknown role: {who}") + def _initialize_processor_components(self): + """Initialize tokenizer for multimodal processing.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) - text = conversation.get("value") + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() - if "" in text: - has_image = True - text = text.replace("", "") - else: - has_image = False + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) - return { - "role": role, - "content": ( - [{"type": "image"}, {"type": "text", "text": text}] if has_image else [{"type": "text", "text": text}] - ), - } + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - def _extract_image_details(self, entry: dict[str, any]): - """Extract image details from the dataset 
example. + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - Opens the image file and adds image mode information to the example. - """ - image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) - entry["image_mode"] = image.mode - return entry + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - def _load_image_and_tokenize(self, entry: dict[str, any]): + def _process_dataset_entry(self, entry: dict[str, any]): """Load image and tokenize the conversation for model input. Args: @@ -204,19 +266,86 @@ def _load_image_and_tokenize(self, entry: dict[str, any]): return inputs -SHORTCUT_FIRST_N = 256 +class GemmaTextOnlyDataset(BaseGemmaDataset): + """Dataset for only the text portion of the Gemma 3 model.""" + + def _initialize_processor_components(self): + """Initialize tokenizer for text-only processing.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def setup_dataset(self): + """Set up the text-only dataset with conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (strip images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=True) + ) + + # Extract image details (still needed for filtering) + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply text-only processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Extract and tokenize only the text content. 
+ + Args: + entry: Dataset entry containing text conversation + + Returns: + Tokenized text inputs ready for model processing + + """ + # Apply chat template without images, text-only + inputs = self.tokenizer.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + return {k: v.squeeze(0) for k, v in inputs.items()} # Remove batch dimension + + +class GemmaVisionOnlyDataset(BaseGemmaDataset): + """Dataset for only the vision tower of the Gemma 3 model.""" + + def _initialize_processor_components(self): + """No additional components needed for vision-only processing.""" + + def _process_dataset_entry(self, entry: dict[str, any]): + """Load image and extract only pixel_values for vision-only processing.""" + # Load and process the image + logger.error("PROCESSING IMAGE") + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + + # Process image to get pixel_values + inputs = self.processor(text="", images=image, return_tensors="pt") + + # Return only pixel_values + return {"pixel_values": inputs["pixel_values"]} + + +# Remove this when submitting for review +SHORTCUT_FIRST_N = 2 @Registry.register_dataset() def gemma_dataset(model_id: str): - return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N).get_train_dataset() + """Full E2E Gemma 3 multi-modal dataset (image + text).""" + return GemmaMultimodalDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() @Registry.register_dataset() def gemma_text_dataset(model_id: str): - return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter="text").get_train_dataset + """Text-only Gemma 3 dataset.""" + return GemmaTextOnlyDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() @Registry.register_dataset() def gemma_vision_dataset(model_id: str): - return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter="images").get_train_dataset() + """Vision-only Gemma 3 dataset.""" + return GemmaVisionOnlyDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() From aed20eccf1d64dde6f33124d343ca25ac1faf01d Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Tue, 19 Aug 2025 17:02:26 -0700 Subject: [PATCH 11/24] Enable quant on text model --- examples/gemma3/qnn/README.md | 2 ++ examples/gemma3/qnn/env_setup.sh | 4 ++-- examples/gemma3/qnn/gemma3-4b-text-qnn-config.json | 2 +- .../gemma3/qnn/gemma3-4b-vision-qnn-config.json | 3 ++- examples/gemma3/qnn/user_script.py | 2 +- olive/passes/pytorch/rotate.py | 13 +++++++++++-- 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md index 6fb3e3cb6..275f1816f 100644 --- a/examples/gemma3/qnn/README.md +++ b/examples/gemma3/qnn/README.md @@ -14,6 +14,8 @@ This repository contains an automated setup script for Linux that can be used to source env_setup.sh ``` +> **Warning:** The above script uses a different commit hash (558449bed3ef2653c36041650d30da6bbbca440d) for building GPTQModel than the Phi-3.5 tutorial due to a [memory leak issue](https://github.com/ModelCloud/GPTQModel/commit/558449bed3ef2653c36041650d30da6bbbca440d) with Gemma3. + ## Optimization Process Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models before configuring them to work in concert at the onnxruntime-genai stage. 
diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh index bc799d110..aa117afc0 100644 --- a/examples/gemma3/qnn/env_setup.sh +++ b/examples/gemma3/qnn/env_setup.sh @@ -20,9 +20,9 @@ export BUILD_CUDA_EXT=0 uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git # Install GptqModel from source -uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@5d2911a4b2a709afb0941d53c3882d0cd80b9649 +# Note: Commit hash corresponds to commit which fixes Gemma 3 memory leak issue. See README.md for additional details. +uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d # Install onnxruntime-qnn without installing onnxruntime -# Note: Installing both at the same time may cause conflicts uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index d2eff5678..675d991bb 100644 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -15,7 +15,7 @@ } ], "passes": { - "q": { "type": "QuaRot" }, + "q": { "type": "QuaRot", "device": "cpu" }, "g": { "type": "GptqModel", "bits": 4, diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index cb2860fd7..42b775087 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -34,7 +34,8 @@ "data_config": "gemma_vision_data_config", "activation_type": "uint16", "precision": "uint8", - "calibrate_method": "MinMax" + "calibrate_method": "MinMax", + "calibration_providers": [ "CUDAExecutionProvider" ] }, "cb": { "type": "EPContextBinaryGenerator", diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 2388cbe7f..1ddbc1839 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -330,7 +330,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Remove this when submitting for review -SHORTCUT_FIRST_N = 2 +SHORTCUT_FIRST_N = 256 @Registry.register_dataset() diff --git a/olive/passes/pytorch/rotate.py b/olive/passes/pytorch/rotate.py index 470eb619a..d82fe947d 100644 --- a/olive/passes/pytorch/rotate.py +++ b/olive/passes/pytorch/rotate.py @@ -44,6 +44,11 @@ class RotateMode(StrEnumBase): @classmethod def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassConfigParam]: return { + "device": PassConfigParam( + type_=str, + default_value="cpu", + description="Whether to run rotation on cpu or gpu. 
Accepted values are 'cpu' and 'cuda'.", + ), "seed": PassConfigParam( type_=int, default_value=0, @@ -60,6 +65,7 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon def rotate_model( self, model: HfModelHandler, + device: str, rotate_mode: str, seed: int, training_args: Optional[BaseHFTrainingArguments] = None, @@ -157,10 +163,13 @@ def rotate_model( count_trainable_parameters(model_wrapper.model), ) + if device == "cuda" and not torch.cuda.is_available(): + raise ValueError("Please install CUDA to rotate with it.") + return ( model_wrapper, rotation_params, - [((RotateEmbed, RotateLinear), lambda x: x.create_merged("cuda" if torch.cuda.is_available() else "cpu"))], + [((RotateEmbed, RotateLinear), lambda x: x.create_merged(device))], ) @classmethod @@ -246,7 +255,7 @@ class QuaRot(RotateBase): def _run_for_config( self, model: HfModelHandler, config: type[BasePassConfig], output_model_path: str ) -> HfModelHandler: - model_wrapper, _, save_replacements = self.rotate_model(model, config.rotate_mode, config.seed) + model_wrapper, _, save_replacements = self.rotate_model(model, config.device, config.rotate_mode, config.seed) # save the model model_wrapper.save_model(output_model_path, replacements=save_replacements) From ba0633c8bb354c705b326cc73cd12d73421930b9 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Mon, 25 Aug 2025 20:11:32 -0700 Subject: [PATCH 12/24] Improve README --- examples/gemma3/qnn/README.md | 109 ++++++++++++++++-- .../qnn/gemma3-4b-vision-qnn-config.json | 3 +- examples/gemma3/qnn/user_script.py | 2 +- 3 files changed, 102 insertions(+), 12 deletions(-) diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md index 275f1816f..93c347fbe 100644 --- a/examples/gemma3/qnn/README.md +++ b/examples/gemma3/qnn/README.md @@ -1,26 +1,92 @@ # Gemma-3-4B Model Optimization -This repository demonstrates the optimization of the [Google Gemma-3-4B](https://huggingface.co/google/gemma-3-4b-it) model using **post-training quantization (PTQ)** techniques. The optimization process utilizes an environment based heavily upon the [PTQ tutorial for Phi-3.5](https://github.com/CodeLinaro/Olive/blob/main/examples/phi3_5/README.md) +This repository demonstrates the optimization of the [Google Gemma-3-4B](https://huggingface.co/google/gemma-3-4b-it) model using **post-training quantization (PTQ)** techniques for QNN (Qualcomm Neural Network) execution. 
The optimization process utilizes an environment based heavily upon the [PTQ tutorial for Phi-3.5](https://github.com/CodeLinaro/Olive/blob/main/examples/phi3_5/README.md) -## Automated Setup (Linux Only) +## File Overview -Requirements: -* Python 3.10 -* uv - Used throughout the setup scripts, please follow the [publically available installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) +This example contains the following key files: -This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the Phi-3.5 tutorial above: +- **`env_setup.sh`** - Automated environment setup script (Linux only) +- **`gemma3-4b-text-qnn-config.json`** - Olive configuration for optimizing the text component +- **`gemma3-4b-vision-qnn-config.json`** - Olive configuration for optimizing the vision component +- **`user_script.py`** - Dataset handling and preprocessing utilities +- **`custom_gemma3_4b_it_vision.py`** - Vision model loader for the optimization pipeline +## Prerequisites + +### System Requirements +- **Operating System**: Linux (automated setup script is Linux-only) +- **Python**: 3.10 +- **Package Manager**: [uv](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) +- **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically) + +### Dependencies Installed by Setup Script +The `env_setup.sh` script installs the following components: +- setuptools (for building Olive from source) +- Olive requirements and dependencies +- AutoGPTQ (from source) +- GPTQModel (specific commit: `558449bed3ef2653c36041650d30da6bbbca440d`) +- onnxruntime-qnn (pre-release version) + +## Setup Instructions + +### Automated Setup (Recommended) ```bash source env_setup.sh ``` -> **Warning:** The above script uses a different commit hash (558449bed3ef2653c36041650d30da6bbbca440d) for building GPTQModel than the Phi-3.5 tutorial due to a [memory leak issue](https://github.com/ModelCloud/GPTQModel/commit/558449bed3ef2653c36041650d30da6bbbca440d) with Gemma3. +### Manual Setup (Alternative) +If you prefer to set up manually or need to troubleshoot: + +1. Install setuptools: + ```bash + uv pip install setuptools + ``` + +2. Install requirements: + ```bash + uv pip install -r ../requirements.txt + uv pip install -r ../../../requirements.txt + ``` + +3. Install AutoGPTQ from source: + ```bash + export BUILD_CUDA_EXT=0 + uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git + ``` + +4. Install GPTQModel with Gemma3 fix: + ```bash + uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d + ``` + +5. Install onnxruntime-qnn: + ```bash + uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt + uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps + ``` + +> **Important:** The setup uses a specific commit hash for GPTQModel (`558449bed3ef2653c36041650d30da6bbbca440d`) to address a [memory leak issue](https://github.com/ModelCloud/GPTQModel/commit/558449bed3ef2653c36041650d30da6bbbca440d) with Gemma3 models. 
## Optimization Process -Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models before configuring them to work in concert at the onnxruntime-genai stage. +Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work together at the onnxruntime-genai stage. + +### Configuration Differences + +**Text Configuration (`gemma3-4b-text-qnn-config.json`)**: +- Uses HuggingFace model directly (`google/gemma-3-4b-it`) +- Applies comprehensive optimization pipeline: QuaRot → GptqModel → ModelBuilder → Quantization +- Outputs to: `models/gemma-3-4b-it-text/` + +**Vision Configuration (`gemma3-4b-vision-qnn-config.json`)**: +- Uses custom PyTorch model loader (`custom_gemma3_4b_it_vision.py`) +- Simpler pipeline: ONNX Conversion → Graph Surgery → Quantization +- Outputs to: `models/gemma-3-4b-it-vision/` -Thus, the following commands should be used to separately produce context binaries for the text and vision portions of the model, respectively. +### Running Optimization + +Execute the following commands to separately produce optimized binaries for each component: ```bash olive run --config gemma3-4b-text-qnn-config.json @@ -29,3 +95,28 @@ olive run --config gemma3-4b-text-qnn-config.json ```bash olive run --config gemma3-4b-vision-qnn-config.json ``` + +## Expected Outputs + +After successful optimization, you will find: + +- **Text model outputs**: `models/gemma-3-4b-it-text/` +- **Vision model outputs**: `models/gemma-3-4b-it-vision/` +- **Cache directory**: `cache/` (intermediate files and downloaded datasets) +- **Dataset**: `.cache/train2017/` (COCO train2017 images, ~13GB) + +Both configurations use `"no_artifacts": true`, meaning only the final optimized models are retained. + +## Troubleshooting + +### Common Issues + +**Insufficient Storage**: The COCO train2017 dataset requires ~13GB of storage and is downloaded automatically to `.cache/train2017/`. + +**Memory Requirements**: The optimization process, particularly for the text model with its comprehensive pipeline, requires substantial memory. + +**QNN Provider**: Ensure the QNNExecutionProvider is properly installed and configured in your environment. + +**Platform Limitation**: The current setup script is designed for Linux only. Windows/macOS users will need to adapt the manual setup steps. + +**Dataset Download**: If the COCO dataset download fails, check your internet connection and available storage. The script uses `wget` which must be available on your system. 
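Once a run completes, the generated context binary can be opened with onnxruntime-qnn to confirm it exposes the expected inputs and outputs before it is wired into a genai configuration. The sketch below is an assumption-laden example: the exact `*_ctx.onnx` filename under `models/gemma-3-4b-it-vision/` and the HTP backend path may differ on your machine.

```python
import onnxruntime as ort

# Hypothetical filename; substitute the *_ctx.onnx that Olive actually produced
session = ort.InferenceSession(
    "models/gemma-3-4b-it-vision/model_ctx.onnx",
    providers=["QNNExecutionProvider"],
    provider_options=[{"backend_path": "libQnnHtp.so"}],  # QnnHtp.dll on Windows
)

# List the graph I/O so names and shapes can be checked against the vision io_config
for tensor in session.get_inputs() + session.get_outputs():
    print(tensor.name, tensor.type, tensor.shape)
```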
diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index 42b775087..cb2860fd7 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -34,8 +34,7 @@ "data_config": "gemma_vision_data_config", "activation_type": "uint16", "precision": "uint8", - "calibrate_method": "MinMax", - "calibration_providers": [ "CUDAExecutionProvider" ] + "calibrate_method": "MinMax" }, "cb": { "type": "EPContextBinaryGenerator", diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 1ddbc1839..5df7d2ad2 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -330,7 +330,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Remove this when submitting for review -SHORTCUT_FIRST_N = 256 +SHORTCUT_FIRST_N = 20 @Registry.register_dataset() From acbdfdca0b99b157efc5e18447892302243b435c Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Thu, 28 Aug 2025 14:49:37 -0700 Subject: [PATCH 13/24] Add files from Prudvhi --- examples/gemma3/qnn/app.py | 61 +++ ...script.py => custom_gemma3_4b_datasets.py} | 0 .../gemma3/qnn/custom_gemma3_4b_embedding.py | 38 ++ ...t_vision.py => custom_gemma3_4b_vision.py} | 2 +- .../qnn/gemma3-4b-embedding-qnn-config.json | 40 ++ .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 5 +- .../qnn/gemma3-4b-vision-qnn-config.json | 4 +- examples/gemma3/qnn/genai_config.json | 422 ++++++++++++++++++ 8 files changed, 567 insertions(+), 5 deletions(-) create mode 100644 examples/gemma3/qnn/app.py rename examples/gemma3/qnn/{user_script.py => custom_gemma3_4b_datasets.py} (100%) create mode 100644 examples/gemma3/qnn/custom_gemma3_4b_embedding.py rename examples/gemma3/qnn/{custom_gemma3_4b_it_vision.py => custom_gemma3_4b_vision.py} (96%) create mode 100644 examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json create mode 100644 examples/gemma3/qnn/genai_config.json diff --git a/examples/gemma3/qnn/app.py b/examples/gemma3/qnn/app.py new file mode 100644 index 000000000..380e15220 --- /dev/null +++ b/examples/gemma3/qnn/app.py @@ -0,0 +1,61 @@ +# app.py +# ruff: noqa: T201 +from argparse import ArgumentParser +import numpy as np + +import onnxruntime_genai as og + +parser = ArgumentParser(description="Run a simple chat application with the Gemma3 model.") +parser.add_argument( + "-m", + "--model_folder", + type=str, + default="", + help="Path to the folder containing the outputs of Olive run", +) +args = parser.parse_args() + +# Load the base model and tokenizer +model = og.Model(f"{args.model_folder}/model") +tokenizer = og.Tokenizer(model) +tokenizer_stream = tokenizer.create_stream() + +# Set the max length to something sensible by default, +# since otherwise it will be set to the entire context length +search_options = {} +search_options["max_length"] = 512 + + +text = "Write a Python function to reverse a string." 
+
+# Generate prompt (prompt template + input)
+prompt = tokenizer.apply_chat_template(
+    messages=f"""[{{"role": "user", "content": "{text}"}}]""", add_generation_prompt=True
+)
+
+# Encode the prompt using the tokenizer
+input_tokens = tokenizer.encode(prompt)
+
+# Create params and generator
+params = og.GeneratorParams(model)
+params.set_search_options(**search_options)
+generator = og.Generator(model, params)
+
+# Append input tokens to the generator
+generator.append_tokens(input_tokens)
+
+print("")
+print("Output: ", end="", flush=True)
+# Stream the output
+try:
+    while not generator.is_done():
+        generator.generate_next_token()
+
+        new_token = generator.get_next_tokens()[0]
+        print(tokenizer_stream.decode(new_token), end="", flush=True)
+except KeyboardInterrupt:
+    print(" --control+c pressed, aborting generation--")
+print()
+print()
+
+del generator
diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py
similarity index 100%
rename from examples/gemma3/qnn/user_script.py
rename to examples/gemma3/qnn/custom_gemma3_4b_datasets.py
diff --git a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py
new file mode 100644
index 000000000..414756808
--- /dev/null
+++ b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py
@@ -0,0 +1,38 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+
+import logging
+
+import torch
+from transformers import AutoModel
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingLayer(torch.nn.Module):
+    def __init__(self, full_model):
+        super().__init__()
+        self.embedding_layer = full_model.language_model.embed_tokens
+
+    def forward(self, input_ids, image_features):
+        image_token_index=262144
+        inputs_embeds = self.embedding_layer(input_ids)
+
+        special_image_mask = (input_ids == image_token_index).unsqueeze(-1)
+        special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+        return inputs_embeds
+
+
+def load_gemma3_embedding_model(model_path):
+    full_model = AutoModel.from_pretrained("google/gemma-3-4b-it")
+    logger.info("Loaded full model: %s", full_model)
+
+    embedding_layer = EmbeddingLayer(full_model)
+
+    logger.info("Created embedding-only model: %s", embedding_layer)
+    return embedding_layer
diff --git a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_vision.py
similarity index 96%
rename from examples/gemma3/qnn/custom_gemma3_4b_it_vision.py
rename to examples/gemma3/qnn/custom_gemma3_4b_vision.py
index c0d35ecb5..1eb7f8f33 100644
--- a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py
+++ b/examples/gemma3/qnn/custom_gemma3_4b_vision.py
@@ -27,7 +27,7 @@ def forward(self, pixel_values):
         return self.multi_modal_projector(selected_image_feature)
 
 
-def load_gemma3_model(model_path):
+def load_gemma3_vision_model(model_path):
     full_model = AutoModel.from_pretrained("google/gemma-3-4b-it")
     logger.info("Loaded full model: %s", full_model)
 
diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json
new file mode 100644
index
000000000..dc2acc3ed --- /dev/null +++ b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json @@ -0,0 +1,40 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_script": "custom_gemma3_4b_embedding.py", + "model_loader": "load_gemma3_embedding_model", + "io_config": { + "input_names": [ "input_ids", "image_features" ], + "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], + "input_types": [ "int32", "float32" ], + "output_names": [ "inputs_embeds" ], + "output_shapes": [ [ 1, 64, 2560 ] ], + "dynamic_axes": { + "input_ids": {"0": "batch_size", "1": "seq_length"}, + "image_features": {"0": "batch_size"} + } + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [{"device": "cpu", "execution_providers": ["CPUExecutionProvider"]}], + } + }, + "data_configs": [ + { + "name": "gemma_embedding_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "gemma_embedding_layer_dataset", "model_id": "google/gemma-3-4b-it" } + } + ], + "passes": { + "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } + }, + "target": "local_system", + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it-embed", + "cache_dir": "cache-embd", + "no_artifacts": true +} \ No newline at end of file diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 675d991bb..630a86b78 100644 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -10,7 +10,7 @@ "data_configs": [ { "name": "gemma_text_data_config", - "user_script": "user_script.py", + "user_script": "custom_gemma3_4b_datasets.py", "load_dataset_config": { "type": "gemma_text_dataset", "model_id": "google/gemma-3-4b-it" } } ], @@ -25,7 +25,7 @@ "device": "cuda", "data_config": "gemma_text_data_config" }, - "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, + "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true }, "mb": { "type": "ModelBuilder", "precision": "int4", @@ -68,6 +68,7 @@ "htp_graph_finalization_optimization_mode": "3", "soc_model": "60" }, + "session_options": {"intra_op_num_threads": 2, "inter_op_num_threads": 1}, "weight_sharing": true }, "cp": { "type": "ComposeOnnxModels" } diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index cb2860fd7..1ce2126f1 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -2,7 +2,7 @@ "input_model": { "type": "PyTorchModel", "model_script": "custom_gemma3_4b_it_vision.py", - "model_loader": "load_gemma3_model", + "model_loader": "load_gemma3_vision_model", "io_config": { "input_names": [ "pixel_values" ], "input_shapes": [ [ 1, 3, 896, 896 ] ], @@ -21,7 +21,7 @@ "data_configs": [ { "name": "gemma_vision_data_config", - "user_script": "user_script.py", + "user_script": "custom_gemma3_4b_datasets.py", "load_dataset_config": { "type": "gemma_vision_dataset", "model_id": "google/gemma-3-4b-it" } } ], diff --git a/examples/gemma3/qnn/genai_config.json b/examples/gemma3/qnn/genai_config.json new file mode 100644 index 000000000..0605cff14 --- /dev/null +++ b/examples/gemma3/qnn/genai_config.json @@ -0,0 +1,422 @@ +{ + "model": { + "bos_token_id": 2, + "context_length": 131072, + "decoder": { + "session_options": { 
+ "log_id": "onnxruntime-genai", + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + }, + "head_size": 256, + "hidden_size": 2560, + "inputs": { + "input_ids": "inputs_embeds", + "attention_mask": "attention_mask", + "past_key_names": "past_key_values.%d.key", + "past_value_names": "past_key_values.%d.value", + "past_sequence_length": "past_seq_len", + "total_sequence_length": "total_seq_len" + }, + "outputs": { + "logits": "logits", + "present_key_names": "present.%d.key", + "present_value_names": "present.%d.value" + }, + "num_attention_heads": 8, + "num_hidden_layers": 34, + "num_key_value_heads": 4, + "sliding_window": { + "window_size": 64, + "slide_key_value_cache": false, + "slide_inputs": true, + "pad_value": 0, + "alignment": "left" + }, + "pipeline": [ + { + "context_ctx": { + "filename": "context_ctx.onnx", + "inputs": [ + "inputs_embeds", + "past_key_values.0.key", + "past_key_values.0.value", + "past_seq_len", + "total_seq_len", + "past_key_values.1.key", + "past_key_values.1.value", + "past_key_values.2.key", + "past_key_values.2.value", + "past_key_values.3.key", + "past_key_values.3.value", + "past_key_values.4.key", + "past_key_values.4.value", + "past_key_values.5.key", + "past_key_values.5.value", + "past_key_values.6.key", + "past_key_values.6.value", + "past_key_values.7.key", + "past_key_values.7.value", + "past_key_values.8.key", + "past_key_values.8.value", + "past_key_values.9.key", + "past_key_values.9.value", + "past_key_values.10.key", + "past_key_values.10.value", + "past_key_values.11.key", + "past_key_values.11.value", + "past_key_values.12.key", + "past_key_values.12.value", + "past_key_values.13.key", + "past_key_values.13.value", + "past_key_values.14.key", + "past_key_values.14.value", + "past_key_values.15.key", + "past_key_values.15.value", + "past_key_values.16.key", + "past_key_values.16.value", + "past_key_values.17.key", + "past_key_values.17.value", + "past_key_values.18.key", + "past_key_values.18.value", + "past_key_values.19.key", + "past_key_values.19.value", + "past_key_values.20.key", + "past_key_values.20.value", + "past_key_values.21.key", + "past_key_values.21.value", + "past_key_values.22.key", + "past_key_values.22.value", + "past_key_values.23.key", + "past_key_values.23.value", + "past_key_values.24.key", + "past_key_values.24.value", + "past_key_values.25.key", + "past_key_values.25.value", + "past_key_values.26.key", + "past_key_values.26.value", + "past_key_values.27.key", + "past_key_values.27.value", + "past_key_values.28.key", + "past_key_values.28.value", + "past_key_values.29.key", + "past_key_values.29.value", + "past_key_values.30.key", + "past_key_values.30.value", + "past_key_values.31.key", + "past_key_values.31.value", + "past_key_values.32.key", + "past_key_values.32.value", + "past_key_values.33.key", + "past_key_values.33.value" + ], + "outputs": [ + "present.0.key", + "present.0.value", + "present.1.key", + "present.1.value", + "present.2.key", + "present.2.value", + "present.3.key", + "present.3.value", + "present.4.key", + "present.4.value", + "present.5.key", + "present.5.value", + "present.6.key", + "present.6.value", + "present.7.key", + "present.7.value", + "present.8.key", + "present.8.value", + "present.9.key", + "present.9.value", + "present.10.key", + "present.10.value", + "present.11.key", + "present.11.value", + "present.12.key", + "present.12.value", + "present.13.key", + "present.13.value", + 
"present.14.key", + "present.14.value", + "present.15.key", + "present.15.value", + "present.16.key", + "present.16.value", + "present.17.key", + "present.17.value", + "present.18.key", + "present.18.value", + "present.19.key", + "present.19.value", + "present.20.key", + "present.20.value", + "present.21.key", + "present.21.value", + "present.22.key", + "present.22.value", + "present.23.key", + "present.23.value", + "present.24.key", + "present.24.value", + "present.25.key", + "present.25.value", + "present.26.key", + "present.26.value", + "present.27.key", + "present.27.value", + "present.28.key", + "present.28.value", + "present.29.key", + "present.29.value", + "present.30.key", + "present.30.value", + "present.31.key", + "present.31.value", + "present.32.key", + "present.32.value", + "present.33.key", + "present.33.value", + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" + ], + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1, + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + }, + "run_on_token_gen": false + }, + "iterator_ctx": { + "filename": "iterator_ctx.onnx", + "inputs": [ + "inputs_embeds", + "past_key_values.0.key", + "past_key_values.0.value", + "past_seq_len", + "total_seq_len", + "past_key_values.1.key", + "past_key_values.1.value", + "past_key_values.2.key", + "past_key_values.2.value", + "past_key_values.3.key", + "past_key_values.3.value", + "past_key_values.4.key", + "past_key_values.4.value", + "past_key_values.5.key", + "past_key_values.5.value", + "past_key_values.6.key", + "past_key_values.6.value", + "past_key_values.7.key", + "past_key_values.7.value", + "past_key_values.8.key", + "past_key_values.8.value", + "past_key_values.9.key", + "past_key_values.9.value", + "past_key_values.10.key", + "past_key_values.10.value", + "past_key_values.11.key", + "past_key_values.11.value", + "past_key_values.12.key", + "past_key_values.12.value", + "past_key_values.13.key", + "past_key_values.13.value", + "past_key_values.14.key", + "past_key_values.14.value", + "past_key_values.15.key", + "past_key_values.15.value", + "past_key_values.16.key", + "past_key_values.16.value", + "past_key_values.17.key", + "past_key_values.17.value", + "past_key_values.18.key", + "past_key_values.18.value", + "past_key_values.19.key", + "past_key_values.19.value", + "past_key_values.20.key", + "past_key_values.20.value", + "past_key_values.21.key", + "past_key_values.21.value", + "past_key_values.22.key", + "past_key_values.22.value", + "past_key_values.23.key", + "past_key_values.23.value", + "past_key_values.24.key", + "past_key_values.24.value", + "past_key_values.25.key", + "past_key_values.25.value", + "past_key_values.26.key", + "past_key_values.26.value", + "past_key_values.27.key", + "past_key_values.27.value", + "past_key_values.28.key", + "past_key_values.28.value", + "past_key_values.29.key", + "past_key_values.29.value", + "past_key_values.30.key", + "past_key_values.30.value", + "past_key_values.31.key", + "past_key_values.31.value", + "past_key_values.32.key", + "past_key_values.32.value", + "past_key_values.33.key", + "past_key_values.33.value" + ], + "outputs": [ + "present.0.key", + "present.0.value", + "present.1.key", + "present.1.value", + "present.2.key", + "present.2.value", + "present.3.key", + "present.3.value", + "present.4.key", + "present.4.value", + "present.5.key", + "present.5.value", + 
"present.6.key", + "present.6.value", + "present.7.key", + "present.7.value", + "present.8.key", + "present.8.value", + "present.9.key", + "present.9.value", + "present.10.key", + "present.10.value", + "present.11.key", + "present.11.value", + "present.12.key", + "present.12.value", + "present.13.key", + "present.13.value", + "present.14.key", + "present.14.value", + "present.15.key", + "present.15.value", + "present.16.key", + "present.16.value", + "present.17.key", + "present.17.value", + "present.18.key", + "present.18.value", + "present.19.key", + "present.19.value", + "present.20.key", + "present.20.value", + "present.21.key", + "present.21.value", + "present.22.key", + "present.22.value", + "present.23.key", + "present.23.value", + "present.24.key", + "present.24.value", + "present.25.key", + "present.25.value", + "present.26.key", + "present.26.value", + "present.27.key", + "present.27.value", + "present.28.key", + "present.28.value", + "present.29.key", + "present.29.value", + "present.30.key", + "present.30.value", + "present.31.key", + "present.31.value", + "present.32.key", + "present.32.value", + "present.33.key", + "present.33.value", + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" + ], + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1, + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + }, + "run_on_prompt": false + }, + "lm_head": { + "filename": "lm_head.onnx", + "inputs": [ + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" + ], + "outputs": [ + "logits" + ] + } + } + ] + }, + "embedding": { + "filename": "embeddings_combined.onnx", + "inputs": { + "input_ids": "input_ids", + "image_features": "image_features" + }, + "outputs": { + "inputs_embeds": "inputs_embeds" + } + }, + "vision": { + "filename": "model_ctx.onnx", + "inputs": { + "pixel_values": "pixel_values" + }, + "outputs": { + "image_features": "image_features" + } + }, + "eos_token_id": [ + 1, + 106 + ], + "pad_token_id": 0, + "type": "gemma3", + "vocab_size": 262208 + }, + "search": { + "diversity_penalty": 0.0, + "do_sample": true, + "early_stopping": true, + "length_penalty": 1.0, + "max_length": 131072, + "min_length": 0, + "no_repeat_ngram_size": 0, + "num_beams": 1, + "num_return_sequences": 1, + "past_present_share_buffer": true, + "repetition_penalty": 1.0, + "temperature": 1.0, + "top_k": 64, + "top_p": 0.95 + } +} \ No newline at end of file From f7178ae55ced96f6be96d84af9392dcffb229670 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Tue, 2 Sep 2025 10:42:13 -0700 Subject: [PATCH 14/24] Updates --- examples/gemma3/qnn/app.py | 1 - .../gemma3/qnn/custom_gemma3_4b_datasets.py | 160 +++++++++++++++++- .../gemma3/qnn/custom_gemma3_4b_embedding.py | 9 +- .../qnn/gemma3-4b-embedding-qnn-config.json | 30 +++- .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 10 +- .../qnn/gemma3-4b-vision-qnn-config.json | 2 +- examples/gemma3/qnn/genai_config.json | 28 +-- 7 files changed, 190 insertions(+), 50 deletions(-) diff --git a/examples/gemma3/qnn/app.py b/examples/gemma3/qnn/app.py index 380e15220..13a0fe4b4 100644 --- a/examples/gemma3/qnn/app.py +++ b/examples/gemma3/qnn/app.py @@ -1,7 +1,6 @@ # app.py # ruff: noqa: T201 from argparse import ArgumentParser -import numpy as np import onnxruntime_genai as og diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py 
b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 5df7d2ad2..88cda6d8c 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- +import copy import logging import os import subprocess @@ -11,10 +12,12 @@ from pathlib import Path from typing import Optional +import torch from datasets import load_dataset from huggingface_hub import hf_hub_download from PIL import Image as PILImage from transformers import ( + AutoModel, AutoProcessor, AutoTokenizer, ) @@ -310,16 +313,15 @@ def _process_dataset_entry(self, entry: dict[str, any]): return {k: v.squeeze(0) for k, v in inputs.items()} # Remove batch dimension -class GemmaVisionOnlyDataset(BaseGemmaDataset): - """Dataset for only the vision tower of the Gemma 3 model.""" +class GemmaImageDataset(BaseGemmaDataset): + """Dataset for only the image processing of the Gemma 3 model.""" def _initialize_processor_components(self): - """No additional components needed for vision-only processing.""" + """No additional components needed for image-only processing.""" def _process_dataset_entry(self, entry: dict[str, any]): - """Load image and extract only pixel_values for vision-only processing.""" + """Load image and extract only pixel_values for image-only processing.""" # Load and process the image - logger.error("PROCESSING IMAGE") image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) # Process image to get pixel_values @@ -329,6 +331,136 @@ def _process_dataset_entry(self, entry: dict[str, any]): return {"pixel_values": inputs["pixel_values"]} +class GemmaImageEmbeddingDataset(BaseGemmaDataset): + """Dataset that pre-computes and caches image embeddings as numpy arrays.""" + + def __init__(self, model_id, first_n=None): + # Initialize lazy-loaded model components + self._vision_tower = None + self._multi_modal_projector = None + + super().__init__(model_id, first_n) + + def _initialize_processor_components(self): + """Initialize only standard processor components.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def _get_vision_components(self): + """Lazy-load vision model components when first needed.""" + if self._vision_tower is None: + logger.info("Loading vision model components for cached embedding dataset") + full_model = AutoModel.from_pretrained(self.model_id) + + # Extract vision components (equivalent to Gemma3VisualEmbeddingGenerator) + self._vision_tower = full_model.vision_tower + self._multi_modal_projector = full_model.multi_modal_projector + + # Clean up full model to save memory + del full_model.language_model + + return self._vision_tower, self._multi_modal_projector + + def _process_dataset_entry(self, entry: dict[str, any]): + """Process entry to return input_ids and cached image features.""" + # Convert conversation and tokenize + inputs = self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + + # Load and process image + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) + + # Get vision components and extract features + vision_tower, projector = self._get_vision_components() + pixel_values = pixel_values.to(device="cuda") + + 
with torch.no_grad(): + # Process through vision tower + image_outputs = vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + # Project to final embedding space + image_features = projector(selected_image_feature) + # Convert to numpy for caching + image_features = image_features.cpu().detach().numpy() + + return {"input_ids": inputs["input_ids"].squeeze(0), "image_features": image_features} + + +class GemmaEmbeddingDataset(BaseGemmaDataset): + """Dataset that pre-merges text and image embeddings.""" + + def __init__(self, model_id, first_n=None): + # Initialize lazy-loaded model components + self._vision_tower = None + self._multi_modal_projector = None + self._embedding_layer = None + + super().__init__(model_id, first_n) + + def _initialize_processor_components(self): + """Initialize only standard processor components.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def _get_model_components(self): + """Lazy-load all required model components when first needed.""" + if self._embedding_layer is None: + logger.info("Loading model components for merged embedding dataset") + full_model = AutoModel.from_pretrained(self.model_id) + + # Extract components + self._vision_tower = full_model.vision_tower + self._multi_modal_projector = full_model.multi_modal_projector + self._embedding_layer = copy.deepcopy(full_model.language_model.embed_tokens) + + # Clean up full model + del full_model.language_model + + return self._vision_tower, self._multi_modal_projector, self._embedding_layer + + def _merge_embeddings(self, input_ids: torch.Tensor, pixel_values: torch.Tensor): + """Merge text and image embeddings at special token positions.""" + vision_tower, projector, embedding_layer = self._get_model_components() + + # Get text embeddings + inputs_embeds = embedding_layer(input_ids.to(device="cuda")) + + # Process image + pixel_values = pixel_values.to(dtype=inputs_embeds.dtype, device="cuda") + with torch.no_grad(): + image_outputs = vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + image_features = projector(selected_image_feature) + + # Merge at special token positions (image_token_index = 262144) + image_token_index = 262144 + special_image_mask = (input_ids == image_token_index).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + return inputs_embeds.masked_scatter(special_image_mask, image_features) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Process entry to return merged embeddings.""" + # Convert conversation and tokenize + inputs = self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + + # Load and process image + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) + + # Merge embeddings + inputs_embeds = self._merge_embeddings(inputs["input_ids"], pixel_values) + + return {"inputs_embeds": inputs_embeds, "attention_mask": inputs["attention_mask"].squeeze(0)} + + # Remove this when submitting for review SHORTCUT_FIRST_N = 20 @@ -346,6 +478,18 @@ def gemma_text_dataset(model_id: str): @Registry.register_dataset() -def 
gemma_vision_dataset(model_id: str): - """Vision-only Gemma 3 dataset.""" - return GemmaVisionOnlyDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() +def gemma_image_dataset(model_id: str): + """Image-only Gemma 3 dataset.""" + return GemmaImageDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_embedding_dataset(model_id: str): + """Gemma 3 dataset with pre-merged text and image embeddings.""" + return GemmaEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_image_embedding_dataset(model_id: str): + """Gemma 3 dataset with pre-computed cached image embeddings.""" + return GemmaImageEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() diff --git a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py index 414756808..1af28cd55 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py @@ -16,16 +16,15 @@ class EmbeddingLayer(torch.nn.Module): def __init__(self, full_model): super().__init__() self.embedding_layer = full_model.language_model.embed_tokens - + def forward(self, input_ids, image_features): - image_token_index=262144 + image_token_index = 262144 inputs_embeds = self.embedding_layer(input_ids) - + special_image_mask = (input_ids == image_token_index).unsqueeze(-1) special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) - return inputs_embeds + return inputs_embeds.masked_scatter(special_image_mask, image_features) def load_gemma3_embedding_model(model_path): diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json index dc2acc3ed..365141dc4 100644 --- a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json @@ -7,29 +7,41 @@ "input_names": [ "input_ids", "image_features" ], "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], "input_types": [ "int32", "float32" ], - "output_names": [ "inputs_embeds" ], + "output_names": [ "/model/embed_tokens/Mul/output_0" ], "output_shapes": [ [ 1, 64, 2560 ] ], "dynamic_axes": { - "input_ids": {"0": "batch_size", "1": "seq_length"}, - "image_features": {"0": "batch_size"} + "input_ids": { "0": "batch_size", "1": "seq_length" }, + "image_features": { "0": "batch_size" } } } }, "systems": { - "local_system": { - "type": "LocalSystem", - "accelerators": [{"device": "cpu", "execution_providers": ["CPUExecutionProvider"]}], + "qnn_system": { + "type": "PythonEnvironment", + "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] } }, "data_configs": [ { "name": "gemma_embedding_data_config", - "user_script": "user_script.py", - "load_dataset_config": { "type": "gemma_embedding_layer_dataset", "model_id": "google/gemma-3-4b-it" } + "user_script": "custom_gemma3_4b_datasets.py", + "load_dataset_config": { "type": "gemma_image_embedding_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "quantization": { + "type": "OnnxStaticQuantization", + "quant_preprocess": false, + "data_config": "gemma_embedding_data_config", + 
"activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax", + "calibration_providers": [ "CUDAExecutionProvider" ], + "per_channel": true, + "weight_symmetric": true + }, "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } }, "target": "local_system", @@ -37,4 +49,4 @@ "output_dir": "models/gemma-3-4b-it-embed", "cache_dir": "cache-embd", "no_artifacts": true -} \ No newline at end of file +} diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 630a86b78..1cad472ab 100644 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -9,9 +9,9 @@ }, "data_configs": [ { - "name": "gemma_text_data_config", + "name": "gemma_embedding_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_text_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { "type": "gemma_embedding_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { @@ -23,7 +23,7 @@ "group_size": -1, "lm_head": false, "device": "cuda", - "data_config": "gemma_text_data_config" + "data_config": "gemma_embedding_data_config" }, "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true }, "mb": { @@ -51,7 +51,7 @@ }, "sq": { "type": "OnnxStaticQuantization", - "data_config": "gemma_text_data_config", + "data_config": "gemma_embedding_data_config", "activation_type": "uint16", "precision": "uint8", "calibration_providers": [ "CUDAExecutionProvider" ], @@ -68,7 +68,7 @@ "htp_graph_finalization_optimization_mode": "3", "soc_model": "60" }, - "session_options": {"intra_op_num_threads": 2, "inter_op_num_threads": 1}, + "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 }, "weight_sharing": true }, "cp": { "type": "ComposeOnnxModels" } diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index 1ce2126f1..d0d747170 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -22,7 +22,7 @@ { "name": "gemma_vision_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_vision_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { "type": "gemma_image_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { diff --git a/examples/gemma3/qnn/genai_config.json b/examples/gemma3/qnn/genai_config.json index 0605cff14..d1185aa08 100644 --- a/examples/gemma3/qnn/genai_config.json +++ b/examples/gemma3/qnn/genai_config.json @@ -369,36 +369,22 @@ "inputs": [ "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" ], - "outputs": [ - "logits" - ] + "outputs": [ "logits" ] } } ] }, "embedding": { "filename": "embeddings_combined.onnx", - "inputs": { - "input_ids": "input_ids", - "image_features": "image_features" - }, - "outputs": { - "inputs_embeds": "inputs_embeds" - } + "inputs": { "input_ids": "input_ids", "image_features": "image_features" }, + "outputs": { "inputs_embeds": "inputs_embeds" } }, "vision": { "filename": "model_ctx.onnx", - "inputs": { - "pixel_values": "pixel_values" - }, - "outputs": { - "image_features": "image_features" - } + "inputs": { "pixel_values": "pixel_values" }, + "outputs": { "image_features": "image_features" } }, - "eos_token_id": [ - 1, - 106 - ], + "eos_token_id": [ 1, 106 ], 
"pad_token_id": 0, "type": "gemma3", "vocab_size": 262208 @@ -419,4 +405,4 @@ "top_k": 64, "top_p": 0.95 } -} \ No newline at end of file +} From bd70ff40be8ce73ceee3ad0b8fefac533b71a268 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Wed, 3 Sep 2025 07:46:32 -0700 Subject: [PATCH 15/24] Updates --- examples/gemma3/qnn/app.py | 224 +++++++++++++----- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 70 ++++-- .../gemma3/qnn/custom_gemma3_4b_embedding.py | 2 +- .../qnn/gemma3-4b-embedding-qnn-config.json | 4 +- 4 files changed, 222 insertions(+), 78 deletions(-) diff --git a/examples/gemma3/qnn/app.py b/examples/gemma3/qnn/app.py index 13a0fe4b4..e83d6420f 100644 --- a/examples/gemma3/qnn/app.py +++ b/examples/gemma3/qnn/app.py @@ -1,60 +1,170 @@ -# app.py -# ruff: noqa: T201 -from argparse import ArgumentParser +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License + +import argparse +import glob +import json +import os +import time +from pathlib import Path import onnxruntime_genai as og -parser = ArgumentParser(description="Run a simple chat application with the Gemma3 model.") -parser.add_argument( - "-m", - "--model_folder", - type=str, - default="", - help="Path to the folder containing the outputs of Olive run", -) -args = parser.parse_args() - -# Load the base model and tokenizer -model = og.Model(f"{args.model_folder}/model") -tokenizer = og.Tokenizer(model) -tokenizer_stream = tokenizer.create_stream() - -# Set the max length to something sensible by default, -# since otherwise it will be set to the entire context length -search_options = {} -search_options["max_length"] = 512 - - -text = "Write a Python function to reverse a string." - -# Generate prompt (prompt template + input) -prompt = tokenizer.apply_chat_template( - messages=f"""[{{"role": "user", "content": "{text}"}}]""", add_generation_prompt=True -) - -# Encode the prompt using the tokenizer -input_tokens = tokenizer.encode(prompt) - -# Create params and generator -params = og.GeneratorParams(model) -params.set_search_options(**search_options) -generator = og.Generator(model, params) - -# Append input tokens to the generator -generator.append_tokens(input_tokens) - -print("") -print("Output: ", end="", flush=True) -# Stream the output -try: - while not generator.is_done(): - generator.generate_next_token() - - new_token = generator.get_next_tokens()[0] - print(tokenizer_stream.decode(new_token), end="", flush=True) -except KeyboardInterrupt: - print(" --control+c pressed, aborting generation--") -print() -print() - -del generator +# og.set_log_options(enabled=True, model_input_values=True, model_output_values=True) + + +def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): + curr_path = Path(current_dir).absolute() + target_dir = glob.glob(target_dir_name, root_dir=curr_path) + if target_dir: + return Path(curr_path / target_dir[0]).absolute() + else: + if curr_path.parent == curr_path: + # Root dir + return None + return _find_dir_contains_sub_dir(curr_path / "..", target_dir_name) + + +def _complete(text, state): + return (glob.glob(text + "*") + [None])[state] + + +def run(args: argparse.Namespace): + print("Loading model...") + config = og.Config(args.model_path) + if args.execution_provider != "follow_config": + config.clear_providers() + if args.execution_provider != "cpu": + print(f"Setting model to {args.execution_provider}...") + config.append_provider(args.execution_provider) + model = og.Model(config) + print("Model loaded") + + tokenizer = 
og.Tokenizer(model) + processor = model.create_multimodal_processor() + stream = processor.create_stream() + + interactive = not args.non_interactive + + while True: + if interactive: + try: + import readline + + readline.set_completer_delims(" \t\n;") + readline.parse_and_bind("tab: complete") + readline.set_completer(_complete) + except ImportError: + # Not available on some platforms. Ignore it. + pass + image_paths = [ + image_path.strip() + for image_path in input("Image Path (comma separated; leave empty if no image): ").split(",") + ] + else: + if args.image_paths: + image_paths = args.image_paths + else: + image_paths = [str(Path(__file__).parent / "images" / "dog.jpg")] + + image_paths = [image_path for image_path in image_paths if image_path] + print(image_paths) + + images = None + if len(image_paths) == 0: + print("No image provided") + else: + for i, image_path in enumerate(image_paths): + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + print(f"Using image: {image_path}") + + images = og.Images.open(*image_paths) + + if interactive: + text = input("Prompt: ") + else: + if args.prompt: + text = args.prompt + else: + text = "What is shown in this image?" + + # Construct the "messages" argument passed to apply_chat_template + messages = [] + if model.type == "phi3v": + # Combine all image tags and text into one user message + content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text + messages.append({"role": "user", "content": content}) + else: + # Gemma3-style multimodal: structured content + content_list = [{"type": "image"} for _ in image_paths] + content_list.append({"type": "text", "text": text}) + messages.append({"role": "user", "content": content_list}) + + # Apply the chat template using the tokenizer + message_json = json.dumps(messages) + print(message_json) + prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True) + + print("Processing images and prompt...") + inputs = processor(prompt, images=images) + + print("Generating response...") + params = og.GeneratorParams(model) + params.set_search_options(max_length=1024) + + print(inputs) + + generator = og.Generator(model, params) + generator.set_inputs(inputs) + start_time = time.time() + + while not generator.is_done(): + generator.generate_next_token() + + new_token = generator.get_next_tokens()[0] + print(stream.decode(new_token), end="", flush=True) + + print() + total_run_time = time.time() - start_time + print(f"Total Time : {total_run_time:.2f}") + + for _ in range(3): + print() + + # Delete the generator to free the captured graph before creating another one + del generator + + if not interactive: + break + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-m", "--model_path", type=str, default="", required=True, help="Path to the folder containing the model" + ) + parser.add_argument( + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "dml", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", + ) + parser.add_argument( + "--image_paths", nargs="*", type=str, required=False, help="Path to the images, mainly for CI usage" + ) + parser.add_argument( + "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage" + ) + parser.add_argument( + "--non-interactive", + action=argparse.BooleanOptionalAction, + default=True, + required=False, + help="Non-interactive mode, mainly for CI usage", + ) + args = parser.parse_args() + run(args) diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 88cda6d8c..145ec2290 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -211,14 +211,8 @@ def setup_dataset(self): self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") # Apply dataset-specific processing - logger.error(self.raw_datasets[0]) - logger.error(self.raw_datasets[1]) - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - logger.error(self.raw_datasets[0]) - logger.error(self.raw_datasets[1]) - def get_dataset(self): """Return the processed dataset.""" return self.raw_datasets @@ -331,8 +325,8 @@ def _process_dataset_entry(self, entry: dict[str, any]): return {"pixel_values": inputs["pixel_values"]} -class GemmaImageEmbeddingDataset(BaseGemmaDataset): - """Dataset that pre-computes and caches image embeddings as numpy arrays.""" +class GemmaEmbeddingInputDataset(BaseGemmaDataset): + """Dataset that is the input to the embedding layer.""" def __init__(self, model_id, first_n=None): # Initialize lazy-loaded model components @@ -362,6 +356,24 @@ def _get_vision_components(self): return self._vision_tower, self._multi_modal_projector + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + def _process_dataset_entry(self, entry: dict[str, any]): """Process entry to return input_ids and cached image features.""" # Convert conversation and tokenize @@ -413,9 +425,9 @@ def _get_model_components(self): full_model = AutoModel.from_pretrained(self.model_id) # Extract components - self._vision_tower = full_model.vision_tower - self._multi_modal_projector = full_model.multi_modal_projector - self._embedding_layer = copy.deepcopy(full_model.language_model.embed_tokens) + self._vision_tower = full_model.vision_tower.cuda() + self._multi_modal_projector = full_model.multi_modal_projector.cuda() + self._embedding_layer = copy.deepcopy(full_model.language_model.embed_tokens).cuda() # Clean up full model del full_model.language_model @@ -444,6 +456,24 @@ def _merge_embeddings(self, input_ids: torch.Tensor, pixel_values: torch.Tensor) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) return 
inputs_embeds.masked_scatter(special_image_mask, image_features) + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + def _process_dataset_entry(self, entry: dict[str, any]): """Process entry to return merged embeddings.""" # Convert conversation and tokenize @@ -458,7 +488,11 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Merge embeddings inputs_embeds = self._merge_embeddings(inputs["input_ids"], pixel_values) - return {"inputs_embeds": inputs_embeds, "attention_mask": inputs["attention_mask"].squeeze(0)} + return { + "input_ids": inputs["input_ids"], + "inputs_embeds": inputs_embeds, + "attention_mask": inputs["attention_mask"].squeeze(0), + } # Remove this when submitting for review @@ -484,12 +518,12 @@ def gemma_image_dataset(model_id: str): @Registry.register_dataset() -def gemma_embedding_dataset(model_id: str): - """Gemma 3 dataset with pre-merged text and image embeddings.""" - return GemmaEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() +def gemma_embedding_input_dataset(model_id: str): + """Gemma 3 dataset with embedding layer input.""" + return GemmaEmbeddingInputDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() @Registry.register_dataset() -def gemma_image_embedding_dataset(model_id: str): - """Gemma 3 dataset with pre-computed cached image embeddings.""" - return GemmaImageEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() +def gemma_embedding_dataset(model_id: str): + """Gemma 3 dataset with pre-merged text and image embeddings.""" + return GemmaEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() diff --git a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py index 1af28cd55..97c9cf2ea 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py @@ -31,7 +31,7 @@ def load_gemma3_embedding_model(model_path): full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") logger.info("Loaded full model: %s", full_model) - embedding_layer = EmbeddingLayer(full_model.language_model.embed_tokens) + embedding_layer = EmbeddingLayer(full_model) logger.info("Created embedding-only model: %s", embedding_layer) return embedding_layer diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json index 365141dc4..8a70e359a 100644 --- a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json @@ -26,7 +26,7 @@ { "name": "gemma_embedding_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_image_embedding_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { "type": "gemma_embedding_input_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { @@ -44,7 +44,7 @@ }, 
"add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } }, - "target": "local_system", + "target": "qnn_system", "log_severity_level": 1, "output_dir": "models/gemma-3-4b-it-embed", "cache_dir": "cache-embd", From c962ceee8a1e0a5395a713d6a0f75d1d8537ca3f Mon Sep 17 00:00:00 2001 From: Alahari Prudhvi Akhil Date: Thu, 4 Sep 2025 03:36:30 -0700 Subject: [PATCH 16/24] Add olive requirements file --- examples/gemma3/qnn/olive_req.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100755 examples/gemma3/qnn/olive_req.txt diff --git a/examples/gemma3/qnn/olive_req.txt b/examples/gemma3/qnn/olive_req.txt new file mode 100755 index 000000000..8923fbfa7 --- /dev/null +++ b/examples/gemma3/qnn/olive_req.txt @@ -0,0 +1,7 @@ +transformers +datasets +optimum +onnxruntime-gpu==1.22.0 +onnxruntime-genai-cuda==0.9.0 +setuptools +tabulate \ No newline at end of file From 360d9c29995de5054dae3d177e04daff1f3511cb Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Thu, 4 Sep 2025 12:29:37 -0700 Subject: [PATCH 17/24] update --- examples/gemma3/qnn/custom_gemma3_4b_datasets.py | 2 +- examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 145ec2290..6d3c35c12 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -496,7 +496,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Remove this when submitting for review -SHORTCUT_FIRST_N = 20 +SHORTCUT_FIRST_N = 25 @Registry.register_dataset() diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index d0d747170..fe5328c6e 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -1,7 +1,7 @@ { "input_model": { "type": "PyTorchModel", - "model_script": "custom_gemma3_4b_it_vision.py", + "model_script": "custom_gemma3_4b_vision.py", "model_loader": "load_gemma3_vision_model", "io_config": { "input_names": [ "pixel_values" ], @@ -34,7 +34,9 @@ "data_config": "gemma_vision_data_config", "activation_type": "uint16", "precision": "uint8", - "calibrate_method": "MinMax" + "calibrate_method": "MinMax", + "per_channel": true, + "weight_symmetric": true }, "cb": { "type": "EPContextBinaryGenerator", From 5fcda5c05d1474d462fdacef7d2420d4b5aa3e5d Mon Sep 17 00:00:00 2001 From: Alahari Prudhvi Akhil Date: Thu, 4 Sep 2025 13:43:43 -0700 Subject: [PATCH 18/24] Update Olive scripts for gemma3 --- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 5 +- examples/gemma3/qnn/gemma-3-4b.ipynb | 344 ++++++++++++++++++ .../qnn/gemma3-4b-embedding-qnn-config.json | 81 ++++- .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 117 ++++-- .../qnn/gemma3-4b-vision-qnn-config.json | 75 +++- examples/gemma3/qnn/{ => genai}/app.py | 0 .../gemma3/qnn/{ => genai}/genai_config.json | 69 +++- .../gemma3/qnn/genai/processor_config.json | 53 +++ examples/gemma3/qnn/olive_req.txt | 7 - examples/gemma3/qnn/qnn_req.txt | 7 + examples/gemma3/requirements.txt | 8 +- olive/common/hf/utils.py | 10 +- olive/model/handler/hf.py | 8 +- 13 files changed, 693 insertions(+), 91 deletions(-) create mode 100755 examples/gemma3/qnn/gemma-3-4b.ipynb rename examples/gemma3/qnn/{ => genai}/app.py (100%) rename examples/gemma3/qnn/{ => genai}/genai_config.json (92%) mode change 100644 => 100755 create mode 
100755 examples/gemma3/qnn/genai/processor_config.json delete mode 100755 examples/gemma3/qnn/olive_req.txt create mode 100755 examples/gemma3/qnn/qnn_req.txt diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 6d3c35c12..987297cb8 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -150,7 +150,6 @@ def _download_and_extract_images(self): zip_path, ], check=True, - cwd=self.CACHE_DIR, ) logger.info("Download completed successfully") except subprocess.CalledProcessError: @@ -354,7 +353,7 @@ def _get_vision_components(self): # Clean up full model to save memory del full_model.language_model - return self._vision_tower, self._multi_modal_projector + return self._vision_tower.cuda(), self._multi_modal_projector.cuda() def setup_dataset(self): """Set up the multimodal dataset with text conversation conversion.""" @@ -398,7 +397,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Convert to numpy for caching image_features = image_features.cpu().detach().numpy() - return {"input_ids": inputs["input_ids"].squeeze(0), "image_features": image_features} + return {"input_ids": inputs["input_ids"], "image_features": image_features} class GemmaEmbeddingDataset(BaseGemmaDataset): diff --git a/examples/gemma3/qnn/gemma-3-4b.ipynb b/examples/gemma3/qnn/gemma-3-4b.ipynb new file mode 100755 index 000000000..8203288a3 --- /dev/null +++ b/examples/gemma3/qnn/gemma-3-4b.ipynb @@ -0,0 +1,344 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Gemma 3 4B QNN model conversion with Olive \n", + "### Task: Text + Vision Generation 📝\n", + "\n", + "In this notebook, you'll:\n", + "- Download the required datasets\n", + "- Convert LLM to QNN format\n", + "- Convert Vision to QNN format\n", + "- Convert Embedding layer with image to QNN format\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🐍 Python Virtual environments\n", + "Creates Olive and QNN python virtual environments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import venv\n", + "from pathlib import Path\n", + "import subprocess\n", + "import json\n", + "import shutil\n", + "import urllib.request\n", + "import onnx\n", + "from onnx import helper, TensorProto\n", + "\n", + "current_dir = os.getcwd()\n", + "MODEL=\"google/gemma-3-4b-it\"\n", + "OLIVE_PYTHON_PATH = './olive_venv'\n", + "OLIVE_PYTHON_BIN = './olive_venv/bin/python'\n", + "olive_pip_path = Path(OLIVE_PYTHON_PATH) / \"bin\" / \"pip\"\n", + "OLIVE_REPO_PATH = Path(\"../../../\")\n", + "OLIVE_REQ = \"../requirements.txt\"\n", + "QNN_REQ = \"./qnn_req.txt\"\n", + "\n", + "QNN_PYTHON_PATH = './qnn_venv'\n", + "QNN_PYTHON_BIN_PATH = './qnn_venv/bin'\n", + "qnn_pip_path = Path(QNN_PYTHON_PATH) / \"bin\" / \"pip\"\n", + "QNN_PYTHON_BIN_FULL_PATH = f\"{current_dir}/{QNN_PYTHON_BIN_PATH}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Olive Python Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if not os.path.exists(OLIVE_PYTHON_PATH):\n", + " print(\"Creating Olive Venv\")\n", + " builder = venv.EnvBuilder(with_pip=True)\n", + " builder.create(Path(OLIVE_PYTHON_PATH))\n", + "my_env = os.environ.copy()\n", + "my_env[\"BUILD_CUDA_EXT\"] = \"0\"\n", + 
"GPTQ=\"git+https://github.com/ModelCloud/GPTQModel.git\"\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"-U\", \"-r\" , OLIVE_REQ], env=my_env)\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"--no-build-isolation\", GPTQ], env=my_env)\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare QNN Python Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if not os.path.exists(QNN_PYTHON_PATH):\n", + " print(\"Creating QNN Venv\")\n", + " builder = venv.EnvBuilder(with_pip=True)\n", + " builder.create(Path(QNN_PYTHON_PATH))\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"--no-build-isolation\", \"-r\" , QNN_REQ], env=my_env)\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"-U\", \"--pre\", \"--extra-index-url\",\n", + " \"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple\",\n", + " \"onnxruntime-qnn==1.23.0.dev20250716009\", \"--no-deps\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 🤗 Login to Hugging Face\n", + "To access models, you'll need to log-in to Hugging Face with a [user access token](https://huggingface.co/docs/hub/security-tokens). The following command will run you through the steps to login:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli login --token <>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Apply few patches to Onnxruntime\n", + "\n", + "This is needed for running the Olive recipies for this model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "base_url = \"https://raw.githubusercontent.com/CodeLinaro/onnxruntime/326d9d30129bbad698e0306d24dcea0ec5a19e60\"\n", + "urls = [\n", + " base_url + \"/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py\",\n", + " base_url + \"/onnxruntime/python/tools/quantization/quant_utils.py\"\n", + "]\n", + "\n", + "destinations = [\n", + " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/execution_providers/qnn/quant_config.py\",\n", + " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/quant_utils.py\"\n", + "]\n", + "\n", + "for url, dest in zip(urls, destinations):\n", + " urllib.request.urlretrieve(url, dest)\n", + " print(f\"Downloaded and replaced: {dest}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Olive Recipes" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**GPU utilization observed during the run**\n", + "\n", + "\t\ta. Text GPTQModel quantization: 12gb\n", + "\t\tb. Text Onnx static quantization: 41gb\n", + "\t\tc. Vision Onnx static quantization: 68gb\n", + " d. 
Embedding Onnx static quantization: 3gb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1️⃣ LLM model generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config_path = Path(f\"./gemma3-4b-text-qnn-config.json\")\n", + "with open(config_path, \"r\") as file:\n", + " data = json.load(file)\n", + "\n", + "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", + "data[\"input_model\"][\"model_path\"] = MODEL\n", + "\n", + "with open(config_path, \"w\") as file:\n", + " json.dump(data, file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-text-qnn-config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2️⃣ Vision model Quantization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config_path = Path(f\"./gemma3-4b-vision-qnn-config.json\")\n", + "with open(config_path, \"r\") as file:\n", + " data = json.load(file)\n", + "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", + "\n", + "with open(config_path, \"w\") as file:\n", + " json.dump(data, file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-vision-qnn-config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3️⃣ Embedding Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-embedding-qnn-config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Keep output of the embedding model as uint16 instead of float" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = onnx.load(\"./models/gemma-3-4b-it-embed/model/model.onnx\")\n", + "graph = model.graph\n", + "\n", + "last_node = graph.node[-1]\n", + "graph.node.remove(last_node)\n", + "previous_node_output = graph.node[-1].output[0]\n", + "new_output = helper.make_tensor_value_info(\n", + " name=previous_node_output,\n", + " elem_type=TensorProto.UINT16,\n", + " shape=[\"batch_size\", \"seq_length\", 2560]\n", + ")\n", + "graph.output.remove(graph.output[0])\n", + "graph.output.extend([new_output])\n", + "onnx.save(model, \"./models/gemma-3-4b-it-embed/model/embeddings_with_image.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare final ORT GenAI folder for on-device inference " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cp ./models/gemma-3-4b-it-embed/model/embeddings_with_image.onnx ./models/gemma3_qnn/model/\n", + "!cp ./models/gemma-3-4b-it-vision/model/model_ctx.onnx ./models/gemma3_qnn/model/model_ctx_vision.onnx \n", + "!cp ./models/gemma-3-4b-it-vision/model/model_ctx_qnn.bin ./models/gemma3_qnn/model/model_ctx_qnn.bin \n", + "!cp ./genai/*.* ./models/gemma3_qnn/model/\n", + "\n", + "print(\"ORT GenAI inference setup: ./models/gemma3_qnn\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json index 8a70e359a..1c5b7f626 100644 --- a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json @@ -4,33 +4,75 @@ "model_script": "custom_gemma3_4b_embedding.py", "model_loader": "load_gemma3_embedding_model", "io_config": { - "input_names": [ "input_ids", "image_features" ], - "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], - "input_types": [ "int32", "float32" ], - "output_names": [ "/model/embed_tokens/Mul/output_0" ], - "output_shapes": [ [ 1, 64, 2560 ] ], + "input_names": [ + "input_ids", + "image_features" + ], + "input_shapes": [ + [ + 1, + 64 + ], + [ + 1, + 256, + 2560 + ] + ], + "input_types": [ + "int64", + "float32" + ], + "output_names": [ + "/model/embed_tokens/Mul/output_0" + ], + "output_shapes": [ + [ + 1, + 64, + 2560 + ] + ], "dynamic_axes": { - "input_ids": { "0": "batch_size", "1": "seq_length" }, - "image_features": { "0": "batch_size" } + "input_ids": { + "0": "batch_size", + "1": "seq_length" + }, + "image_features": { + "0": "batch_size", + "1": "image_tokens_length" + } } } }, "systems": { - "qnn_system": { - "type": "PythonEnvironment", - "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", - "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] } }, "data_configs": [ { "name": "gemma_embedding_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_embedding_input_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { + "type": "gemma_embedding_input_dataset", + "model_id": "google/gemma-3-4b-it" + } } ], "passes": { - "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "conversion": { + "type": "OnnxConversion", + "target_opset": 20 + }, "quantization": { "type": "OnnxStaticQuantization", "quant_preprocess": false, @@ -38,15 +80,20 @@ "activation_type": "uint16", "precision": "uint8", "calibrate_method": "MinMax", - "calibration_providers": [ "CUDAExecutionProvider" ], + "calibration_providers": [ + "CUDAExecutionProvider" + ], "per_channel": true, "weight_symmetric": true }, - "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } + "add_metadata": { + "type": "AddOliveMetadata", + "graph_name": "gemma-3-4b-it-embedding" + } }, - "target": "qnn_system", + "target": "local_system", "log_severity_level": 1, "output_dir": "models/gemma-3-4b-it-embed", "cache_dir": "cache-embd", "no_artifacts": true -} +} \ No newline at end of file diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 1cad472ab..672cd0263 100644 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -1,21 
+1,56 @@ { - "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" }, + "input_model": { + "type": "HfModel", + "model_path": "google/gemma-3-4b-it", + "custom_task_class_name": "Gemma3ForCausalLM", + "custom_task_class_module": "transformers" + }, "systems": { "qnn_system": { "type": "PythonEnvironment", - "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", - "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] + "python_environment_path": "", + "accelerators": [ + { + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] } }, "data_configs": [ { - "name": "gemma_embedding_data_config", - "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_embedding_dataset", "model_id": "google/gemma-3-4b-it" } + "name": "wikitext2_train_joined", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "join", + "add_special_tokens": false, + "max_seq_len": 4096, + "max_samples": 256 + } + }, + { + "name": "wikitext2_train_act", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "line-by-line", + "add_special_tokens": true, + "max_samples": 256, + "max_seq_len": 2048 + } } ], "passes": { - "q": { "type": "QuaRot", "device": "cpu" }, "g": { "type": "GptqModel", "bits": 4, @@ -23,59 +58,93 @@ "group_size": -1, "lm_head": false, "device": "cuda", - "data_config": "gemma_embedding_data_config" + "data_config": "wikitext2_train_joined" + }, + "cs": { + "type": "CaptureSplitInfo", + "num_splits": 2, + "unique_embeds_lm_head_splits": true }, - "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true }, "mb": { "type": "ModelBuilder", "precision": "int4", "int4_block_size": 32, "int4_accuracy_level": 4, - "int4_op_types_to_quantize": [ "MatMul", "Gather" ] + "int4_op_types_to_quantize": [ + "MatMul", + "Gather" + ] }, "mq": { "type": "MatMulNBitsToQDQ", "use_int4": true, "add_zero_point": true, - "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ], + "nodes_to_exclude": [ + "/lm_head/MatMul_Q4" + ], "save_as_external_data": true }, "gs": { "type": "GraphSurgeries", "surgeries": [ - { "surgeon": "RemoveRopeMultiCache" }, - { "surgeon": "AttentionMaskToSequenceLengths" }, - { "surgeon": "SimplifiedLayerNormToL2Norm" } + { + "surgeon": "RemoveRopeMultiCache" + }, + { + "surgeon": "AttentionMaskToSequenceLengths" + }, + { + "surgeon": "SimplifiedLayerNormToL2Norm" + } ], "save_as_external_data": true }, "sq": { "type": "OnnxStaticQuantization", - "data_config": "gemma_embedding_data_config", + "data_config": "wikitext2_train_act", "activation_type": "uint16", "precision": "uint8", - "calibration_providers": [ "CUDAExecutionProvider" ], + "calibration_providers": [ + "CUDAExecutionProvider" + ], "quant_preprocess": true, - "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ], + "op_types_to_exclude": [ + "GatherBlockQuantized", + "GroupQueryAttention", + "MatMulNBits" + ], "save_as_external_data": true }, - "sp": { "type": "SplitModel" }, - "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 }, + "sp": { + "type": "SplitModel" + }, + "st": { + "type": "StaticLLM", + "batch_size": 1, + "context_length": 64 + }, "cb": { "type": "EPContextBinaryGenerator", 
"provider_options": { "htp_performance_mode": "burst", "htp_graph_finalization_optimization_mode": "3", + "vtcm_mb": "8", + "htp_arch": "v73", "soc_model": "60" }, - "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 }, + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1 + }, "weight_sharing": true }, - "cp": { "type": "ComposeOnnxModels" } + "cp": { + "type": "ComposeOnnxModels" + } }, "target": "qnn_system", - "log_severity_level": 1, - "output_dir": "models/gemma-3-4b-it-text", + "log_severity_level": 0, + "output_dir": "models/gemma3_qnn", "cache_dir": "cache", "no_artifacts": true -} +} \ No newline at end of file diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index fe5328c6e..e252381ab 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -4,30 +4,68 @@ "model_script": "custom_gemma3_4b_vision.py", "model_loader": "load_gemma3_vision_model", "io_config": { - "input_names": [ "pixel_values" ], - "input_shapes": [ [ 1, 3, 896, 896 ] ], - "input_types": [ "float32" ], - "output_names": [ "image_features" ], - "output_shapes": [ [ 1, 256, 2560 ] ] + "input_names": [ + "pixel_values" + ], + "input_shapes": [ + [ + 1, + 3, + 896, + 896 + ] + ], + "input_types": [ + "float32" + ], + "output_names": [ + "image_features" + ], + "output_shapes": [ + [ + 1, + 256, + 2560 + ] + ] } }, "systems": { "qnn_system": { "type": "PythonEnvironment", - "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", - "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] + "python_environment_path": "", + "accelerators": [ + { + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] } }, "data_configs": [ { "name": "gemma_vision_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_image_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { + "type": "gemma_image_dataset", + "model_id": "google/gemma-3-4b-it" + } } ], "passes": { - "conversion": { "type": "OnnxConversion", "target_opset": 20 }, - "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] }, + "conversion": { + "type": "OnnxConversion", + "target_opset": 20 + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "MatMulAddToGemm" + } + ] + }, "quantization": { "type": "OnnxStaticQuantization", "quant_preprocess": true, @@ -35,21 +73,30 @@ "activation_type": "uint16", "precision": "uint8", "calibrate_method": "MinMax", + "calibration_providers": [ + "CUDAExecutionProvider" + ], "per_channel": true, "weight_symmetric": true }, "cb": { "type": "EPContextBinaryGenerator", "provider_options": { + "htp_performance_mode": "burst", "htp_graph_finalization_optimization_mode": "3", - "offload_graph_io_quantization": "0" + "vtcm_mb": "8", + "htp_arch": "v73", + "soc_model": "60" } }, - "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } + "add_metadata": { + "type": "AddOliveMetadata", + "graph_name": "gemma-3-4b-it-vision" + } }, "target": "qnn_system", "log_severity_level": 1, "output_dir": "models/gemma-3-4b-it-vision", - "cache_dir": "cache", + "cache_dir": "cache-vision", "no_artifacts": true -} +} \ No newline at end of file diff --git a/examples/gemma3/qnn/app.py b/examples/gemma3/qnn/genai/app.py similarity index 100% rename from 
examples/gemma3/qnn/app.py rename to examples/gemma3/qnn/genai/app.py diff --git a/examples/gemma3/qnn/genai_config.json b/examples/gemma3/qnn/genai/genai_config.json old mode 100644 new mode 100755 similarity index 92% rename from examples/gemma3/qnn/genai_config.json rename to examples/gemma3/qnn/genai/genai_config.json index d1185aa08..a835fb863 --- a/examples/gemma3/qnn/genai_config.json +++ b/examples/gemma3/qnn/genai/genai_config.json @@ -6,19 +6,13 @@ "session_options": { "log_id": "onnxruntime-genai", "provider_options": [ - { - "qnn": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "soc_model": "60" - } - } ] }, "head_size": 256, "hidden_size": 2560, "inputs": { - "input_ids": "inputs_embeds", + "input_ids":"input_ids", + "inputs_embeds": "inputs_embeds", "attention_mask": "attention_mask", "past_key_names": "past_key_values.%d.key", "past_value_names": "past_key_values.%d.value", @@ -42,10 +36,20 @@ }, "pipeline": [ { + "embeddings": { + "filename": "embeddings.onnx", + "inputs": [ + "input_ids" + ], + "outputs": [ + "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" + ], + "run_on_prompt": false + }, "context_ctx": { "filename": "context_ctx.onnx", "inputs": [ - "inputs_embeds", + "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output", "past_key_values.0.key", "past_key_values.0.value", "past_seq_len", @@ -206,7 +210,7 @@ "iterator_ctx": { "filename": "iterator_ctx.onnx", "inputs": [ - "inputs_embeds", + "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output", "past_key_values.0.key", "past_key_values.0.value", "past_seq_len", @@ -369,22 +373,49 @@ "inputs": [ "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" ], - "outputs": [ "logits" ] + "outputs": [ + "logits" + ] } } ] }, "embedding": { - "filename": "embeddings_combined.onnx", - "inputs": { "input_ids": "input_ids", "image_features": "image_features" }, - "outputs": { "inputs_embeds": "inputs_embeds" } + "filename": "embeddings_with_image.onnx", + "inputs": { + "input_ids": "input_ids", + "image_features": "image_features" + }, + "outputs": { + "inputs_embeds": "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" + } }, "vision": { - "filename": "model_ctx.onnx", - "inputs": { "pixel_values": "pixel_values" }, - "outputs": { "image_features": "image_features" } + "filename": "model_ctx_vision.onnx", + "inputs": { + "pixel_values": "pixel_values" + }, + "outputs": { + "image_features": "image_features" + }, + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1, + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + } }, - "eos_token_id": [ 1, 106 ], + "eos_token_id": [ + 1, + 106 + ], "pad_token_id": 0, "type": "gemma3", "vocab_size": 262208 @@ -405,4 +436,4 @@ "top_k": 64, "top_p": 0.95 } -} +} \ No newline at end of file diff --git a/examples/gemma3/qnn/genai/processor_config.json b/examples/gemma3/qnn/genai/processor_config.json new file mode 100755 index 000000000..d1c66b6ce --- /dev/null +++ b/examples/gemma3/qnn/genai/processor_config.json @@ -0,0 +1,53 @@ +{ + "processor": { + "name": "gemma_3_image_processing", + "transforms": [ + { + "operation": { + "name": "decode_image", + "type": "DecodeImage", + "attrs": { + "color_space": "RGB" + } + } + }, + { + "operation": { + "name": "resize", + "type": "Resize", + "attrs": { + "interpolation": "CUBIC", + "width": 896, + "height": 896, + 
"keep_aspect_ratio": 0 + } + } + }, + { + "operation": { + "name": "re-scale", + "type": "Rescale" + } + }, + { + "operation": { + "name": "normalize", + "type": "Normalize", + "attrs": { + "mean": [0.5, 0.5, 0.5], + "std": [0.5, 0.5, 0.5] + } + } + }, + { + "operation": { + "name": "to_channel_first", + "type": "Permute3D", + "attrs": { + "dims": [2, 0, 1] + } + } + } + ] + } +} \ No newline at end of file diff --git a/examples/gemma3/qnn/olive_req.txt b/examples/gemma3/qnn/olive_req.txt deleted file mode 100755 index 8923fbfa7..000000000 --- a/examples/gemma3/qnn/olive_req.txt +++ /dev/null @@ -1,7 +0,0 @@ -transformers -datasets -optimum -onnxruntime-gpu==1.22.0 -onnxruntime-genai-cuda==0.9.0 -setuptools -tabulate \ No newline at end of file diff --git a/examples/gemma3/qnn/qnn_req.txt b/examples/gemma3/qnn/qnn_req.txt new file mode 100755 index 000000000..3cabc5919 --- /dev/null +++ b/examples/gemma3/qnn/qnn_req.txt @@ -0,0 +1,7 @@ +coloredlogs +flatbuffers +numpy >= 1.21.6 +packaging +protobuf +sympy + diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt index 0b56b7908..8923fbfa7 100644 --- a/examples/gemma3/requirements.txt +++ b/examples/gemma3/requirements.txt @@ -1,5 +1,7 @@ +transformers datasets -onnxruntime-genai-cuda==0.7.1 -onnxruntime-gpu==1.21.1 optimum -transformers +onnxruntime-gpu==1.22.0 +onnxruntime-genai-cuda==0.9.0 +setuptools +tabulate \ No newline at end of file diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py index 9a98ff0e3..752bf4990 100644 --- a/olive/common/hf/utils.py +++ b/olive/common/hf/utils.py @@ -11,6 +11,7 @@ from olive.common.hf.mappings import TASK_TO_PEFT_TASK_TYPE from olive.common.hf.mlflow import get_pretrained_name_or_path from olive.common.utils import hardlink_copy_file +import importlib if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -18,7 +19,7 @@ logger = logging.getLogger(__name__) -def load_model_from_task(task: str, model_name_or_path: str, **kwargs) -> "PreTrainedModel": +def load_model_from_task(task: str, model_name_or_path: str, custom_task_class_name:str = None, custom_task_class_module:str = None, **kwargs) -> "PreTrainedModel": """Load huggingface model from task and model_name_or_path.""" from transformers.pipelines import check_task @@ -55,7 +56,12 @@ def load_model_from_task(task: str, model_name_or_path: str, **kwargs) -> "PreTr AUTO_QUANTIZATION_CONFIG_MAPPING["olive"] = OliveHfQuantizationConfig AUTO_QUANTIZER_MAPPING["olive"] = OliveHfQuantizer - class_tuple = targeted_task["pt"] or (AutoModel,) + if (custom_task_class_module is not None and custom_task_class_name is not None): + module = importlib.import_module(custom_task_class_module) + class_tuple = (getattr(module, custom_task_class_name),) + else: + class_tuple = targeted_task["pt"] or (AutoModel,) + print("class_tuple", class_tuple) model = None for i, model_class in enumerate(class_tuple): try: diff --git a/olive/model/handler/hf.py b/olive/model/handler/hf.py index bf46d7417..396a36d2a 100644 --- a/olive/model/handler/hf.py +++ b/olive/model/handler/hf.py @@ -28,7 +28,7 @@ @model_handler_registry("HFModel") class HfModelHandler(PyTorchModelHandlerBase, MLFlowTransformersMixin, HfMixin): # pylint: disable=too-many-ancestors resource_keys: tuple[str, ...] = ("model_path", "adapter_path") - json_config_keys: tuple[str, ...] = ("task", "load_kwargs") + json_config_keys: tuple[str, ...] 
= ("task", "load_kwargs", "custom_task_class_name", "custom_task_class_module") def __init__( self, @@ -38,6 +38,8 @@ def __init__( io_config: Union[dict[str, Any], IoConfig, str] = None, adapter_path: OLIVE_RESOURCE_ANNOTATIONS = None, model_attributes: Optional[dict[str, Any]] = None, + custom_task_class_name: str = None, + custom_task_class_module: str = None ): super().__init__( model_file_format=None, @@ -47,6 +49,8 @@ def __init__( ) self.add_resources(locals()) self.task = task + self.custom_task_class_name = custom_task_class_name + self.custom_task_class_module = custom_task_class_module self.load_kwargs = validate_config(load_kwargs, HfLoadKwargs, warn_unused_keys=False) if load_kwargs else None self.model_attributes = {**self.get_hf_model_config().to_dict(), **(self.model_attributes or {})} @@ -72,7 +76,7 @@ def load_model(self, rank: int = None, cache_model: bool = True) -> "torch.nn.Mo if self.model: model = self.model else: - model = load_model_from_task(self.task, self.model_path, **self.get_load_kwargs()) + model = load_model_from_task(self.task, self.model_path, self.custom_task_class_name, self.custom_task_class_module, **self.get_load_kwargs()) # we only have peft adapters for now if self.adapter_path: From 14018ee2ec48f4d3bb4c7efc892386adc3f06952 Mon Sep 17 00:00:00 2001 From: Alahari Prudhvi Akhil Date: Fri, 5 Sep 2025 02:53:17 -0700 Subject: [PATCH 19/24] Update few python packages --- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 2 +- examples/gemma3/qnn/gemma-3-4b.ipynb | 33 +++++++++++++------ .../qnn/gemma3-4b-embedding-qnn-config.json | 0 .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 0 .../qnn/gemma3-4b-vision-qnn-config.json | 0 examples/gemma3/qnn/qnn_req.txt | 2 +- examples/gemma3/requirements.txt | 5 ++- 7 files changed, 29 insertions(+), 13 deletions(-) mode change 100644 => 100755 examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json mode change 100644 => 100755 examples/gemma3/qnn/gemma3-4b-text-qnn-config.json mode change 100644 => 100755 examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 987297cb8..03f77fea0 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -495,7 +495,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Remove this when submitting for review -SHORTCUT_FIRST_N = 25 +SHORTCUT_FIRST_N = 200 @Registry.register_dataset() diff --git a/examples/gemma3/qnn/gemma-3-4b.ipynb b/examples/gemma3/qnn/gemma-3-4b.ipynb index 8203288a3..dbf127b44 100755 --- a/examples/gemma3/qnn/gemma-3-4b.ipynb +++ b/examples/gemma3/qnn/gemma-3-4b.ipynb @@ -37,6 +37,7 @@ "import urllib.request\n", "import onnx\n", "from onnx import helper, TensorProto\n", + "import glob\n", "\n", "current_dir = os.getcwd()\n", "MODEL=\"google/gemma-3-4b-it\"\n", @@ -103,7 +104,7 @@ "subprocess.check_call([str(qnn_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])\n", "subprocess.check_call([str(qnn_pip_path), \"install\", \"-U\", \"--pre\", \"--extra-index-url\",\n", " \"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple\",\n", - " \"onnxruntime-qnn==1.23.0.dev20250716009\", \"--no-deps\"])" + " \"onnxruntime-qnn==1.23.0.dev20250815002\", \"--no-deps\"])" ] }, { @@ -174,6 +175,27 @@ " d. 
Embedding Onnx static quantization: 3gb" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clean Context binary directories if they exist\n", + "def clean_directory(path):\n", + " if os.path.exists(path):\n", + " for file in glob.glob(os.path.join(path, '*')):\n", + " if os.path.isfile(file):\n", + " os.remove(file)\n", + "dirs_to_clean = [\n", + " './models/gemma3_qnn/model/',\n", + " './models/gemma-3-4b-it-vision/model/'\n", + "]\n", + "\n", + "for dir_path in dirs_to_clean:\n", + " clean_directory(dir_path)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -309,15 +331,6 @@ "\n", "print(\"ORT GenAI inference setup: ./models/gemma3_qnn\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n" - ] } ], "metadata": { diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json old mode 100644 new mode 100755 diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json old mode 100644 new mode 100755 diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json old mode 100644 new mode 100755 diff --git a/examples/gemma3/qnn/qnn_req.txt b/examples/gemma3/qnn/qnn_req.txt index 3cabc5919..05c845791 100755 --- a/examples/gemma3/qnn/qnn_req.txt +++ b/examples/gemma3/qnn/qnn_req.txt @@ -4,4 +4,4 @@ numpy >= 1.21.6 packaging protobuf sympy - +transformers==4.55.2 diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt index 8923fbfa7..3d365f2bb 100644 --- a/examples/gemma3/requirements.txt +++ b/examples/gemma3/requirements.txt @@ -4,4 +4,7 @@ optimum onnxruntime-gpu==1.22.0 onnxruntime-genai-cuda==0.9.0 setuptools -tabulate \ No newline at end of file +tabulate +onnx==1.18.0 +onnx-ir==0.1.4 +onnxscript==0.3.2 \ No newline at end of file From 1f892410860f53d5009606fb221cc22fe965e865 Mon Sep 17 00:00:00 2001 From: Alahari Prudhvi Akhil Date: Mon, 8 Sep 2025 09:22:22 -0700 Subject: [PATCH 20/24] Use the same llava dataset for text model as well This fixes the issue of text model repeating words in the output. 
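A quick way to spot-check the shared calibration data before launching the text-model run is a small sketch along these lines (a sketch only, not part of the recipe: it assumes custom_gemma3_4b_datasets.py is importable from examples/gemma3/qnn, that the Hugging Face login and the dataset/image downloads the script performs succeed, and the sample count and printed fields are purely illustrative):

    # Sketch: inspect a couple of entries of the LLaVA-style dataset that the
    # text config now points at for GPTQ and static-quantization calibration.
    from custom_gemma3_4b_datasets import GemmaMultimodalDataset

    # first_n=4 keeps the slice tiny for a quick inspection run.
    dataset = GemmaMultimodalDataset("google/gemma-3-4b-it", first_n=4).get_dataset()

    sample = dataset[0]  # triggers the on-the-fly chat-template transform
    for name, value in sample.items():
        shape = getattr(value, "shape", None)
        print(name, shape if shape is not None else type(value))

If the printed input_ids/attention_mask entries look reasonable, the same registered dataset can be reused by gemma3-4b-text-qnn-config.json as in the change below.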
--- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 6 ++-- examples/gemma3/qnn/gemma-3-4b.ipynb | 11 +++++- .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 34 ++++--------------- 3 files changed, 18 insertions(+), 33 deletions(-) mode change 100644 => 100755 examples/gemma3/qnn/custom_gemma3_4b_datasets.py diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py old mode 100644 new mode 100755 index 03f77fea0..71410af4f --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -257,8 +257,6 @@ def _process_dataset_entry(self, entry: dict[str, any]): inputs = self.processor.apply_chat_template( entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True ) - inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} - inputs["input_ids"] = inputs["input_ids"][0] return inputs @@ -495,13 +493,13 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Remove this when submitting for review +TEXT_SHORTCUT_FIRST_N = 600 SHORTCUT_FIRST_N = 200 - @Registry.register_dataset() def gemma_dataset(model_id: str): """Full E2E Gemma 3 multi-modal dataset (image + text).""" - return GemmaMultimodalDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + return GemmaMultimodalDataset(model_id, first_n=TEXT_SHORTCUT_FIRST_N).get_dataset() @Registry.register_dataset() diff --git a/examples/gemma3/qnn/gemma-3-4b.ipynb b/examples/gemma3/qnn/gemma-3-4b.ipynb index dbf127b44..7b36c9cf5 100755 --- a/examples/gemma3/qnn/gemma-3-4b.ipynb +++ b/examples/gemma3/qnn/gemma-3-4b.ipynb @@ -189,7 +189,8 @@ " os.remove(file)\n", "dirs_to_clean = [\n", " './models/gemma3_qnn/model/',\n", - " './models/gemma-3-4b-it-vision/model/'\n", + " './models/gemma-3-4b-it-vision/model/',\n", + " './models/gemma-3-4b-it-embed/model/'\n", "]\n", "\n", "for dir_path in dirs_to_clean:\n", @@ -328,9 +329,17 @@ "!cp ./models/gemma-3-4b-it-vision/model/model_ctx.onnx ./models/gemma3_qnn/model/model_ctx_vision.onnx \n", "!cp ./models/gemma-3-4b-it-vision/model/model_ctx_qnn.bin ./models/gemma3_qnn/model/model_ctx_qnn.bin \n", "!cp ./genai/*.* ./models/gemma3_qnn/model/\n", + "!ls -al ./models/gemma3_qnn/model/\n", "\n", "print(\"ORT GenAI inference setup: ./models/gemma3_qnn\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 672cd0263..06eb58078 100755 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -20,33 +20,11 @@ }, "data_configs": [ { - "name": "wikitext2_train_joined", - "type": "HuggingfaceContainer", + "name": "gemma_data_config", + "user_script": "custom_gemma3_4b_datasets.py", "load_dataset_config": { - "data_name": "wikitext", - "subset": "wikitext-2-raw-v1", - "split": "train" - }, - "pre_process_data_config": { - "strategy": "join", - "add_special_tokens": false, - "max_seq_len": 4096, - "max_samples": 256 - } - }, - { - "name": "wikitext2_train_act", - "type": "HuggingfaceContainer", - "load_dataset_config": { - "data_name": "wikitext", - "subset": "wikitext-2-raw-v1", - "split": "train" - }, - "pre_process_data_config": { - "strategy": "line-by-line", - "add_special_tokens": true, - "max_samples": 256, - "max_seq_len": 2048 + "type": "gemma_dataset", + "model_id": "google/gemma-3-4b-it" } } ], @@ -58,7 +36,7 
@@ "group_size": -1, "lm_head": false, "device": "cuda", - "data_config": "wikitext2_train_joined" + "data_config": "gemma_data_config" }, "cs": { "type": "CaptureSplitInfo", @@ -101,7 +79,7 @@ }, "sq": { "type": "OnnxStaticQuantization", - "data_config": "wikitext2_train_act", + "data_config": "gemma_data_config", "activation_type": "uint16", "precision": "uint8", "calibration_providers": [ From 7d4ced80df73118194263079c92a04c072608c94 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Mon, 8 Sep 2025 17:19:11 -0700 Subject: [PATCH 21/24] Minor cleanup --- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 4 +- .../qnn/gemma3-4b-embedding-qnn-config.json | 74 ++++-------------- .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 71 ++++-------------- .../qnn/gemma3-4b-vision-qnn-config.json | 67 +++-------------- examples/gemma3/qnn/genai/app.py | 29 +++---- examples/gemma3/qnn/genai/genai_config.json | 44 +++-------- .../gemma3/qnn/genai/processor_config.json | 75 ++++++------------- examples/gemma3/requirements.txt | 12 +-- olive/common/hf/utils.py | 12 ++- olive/model/handler/hf.py | 10 ++- 10 files changed, 110 insertions(+), 288 deletions(-) diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 71410af4f..77751530b 100755 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -254,10 +254,9 @@ def _process_dataset_entry(self, entry: dict[str, any]): Tokenized inputs ready for model processing """ - inputs = self.processor.apply_chat_template( + return self.processor.apply_chat_template( entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True ) - return inputs class GemmaTextOnlyDataset(BaseGemmaDataset): @@ -496,6 +495,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): TEXT_SHORTCUT_FIRST_N = 600 SHORTCUT_FIRST_N = 200 + @Registry.register_dataset() def gemma_dataset(model_id: str): """Full E2E Gemma 3 multi-modal dataset (image + text).""" diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json index 1c5b7f626..360f0e2bb 100755 --- a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json @@ -4,75 +4,32 @@ "model_script": "custom_gemma3_4b_embedding.py", "model_loader": "load_gemma3_embedding_model", "io_config": { - "input_names": [ - "input_ids", - "image_features" - ], - "input_shapes": [ - [ - 1, - 64 - ], - [ - 1, - 256, - 2560 - ] - ], - "input_types": [ - "int64", - "float32" - ], - "output_names": [ - "/model/embed_tokens/Mul/output_0" - ], - "output_shapes": [ - [ - 1, - 64, - 2560 - ] - ], + "input_names": [ "input_ids", "image_features" ], + "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], + "input_types": [ "int64", "float32" ], + "output_names": [ "/model/embed_tokens/Mul/output_0" ], + "output_shapes": [ [ 1, 64, 2560 ] ], "dynamic_axes": { - "input_ids": { - "0": "batch_size", - "1": "seq_length" - }, - "image_features": { - "0": "batch_size", - "1": "image_tokens_length" - } + "input_ids": { "0": "batch_size", "1": "seq_length" }, + "image_features": { "0": "batch_size", "1": "image_tokens_length" } } } }, "systems": { "local_system": { "type": "LocalSystem", - "accelerators": [ - { - "device": "cpu", - "execution_providers": [ - "CPUExecutionProvider" - ] - } - ] + "accelerators": [ { "device": "cpu", "execution_providers": [ "CPUExecutionProvider" ] } ] } }, 
"data_configs": [ { "name": "gemma_embedding_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { - "type": "gemma_embedding_input_dataset", - "model_id": "google/gemma-3-4b-it" - } + "load_dataset_config": { "type": "gemma_embedding_input_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { - "conversion": { - "type": "OnnxConversion", - "target_opset": 20 - }, + "conversion": { "type": "OnnxConversion", "target_opset": 20 }, "quantization": { "type": "OnnxStaticQuantization", "quant_preprocess": false, @@ -80,20 +37,15 @@ "activation_type": "uint16", "precision": "uint8", "calibrate_method": "MinMax", - "calibration_providers": [ - "CUDAExecutionProvider" - ], + "calibration_providers": [ "CUDAExecutionProvider" ], "per_channel": true, "weight_symmetric": true }, - "add_metadata": { - "type": "AddOliveMetadata", - "graph_name": "gemma-3-4b-it-embedding" - } + "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-embedding" } }, "target": "local_system", "log_severity_level": 1, "output_dir": "models/gemma-3-4b-it-embed", "cache_dir": "cache-embd", "no_artifacts": true -} \ No newline at end of file +} diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 06eb58078..12fc5c8dc 100755 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -9,23 +9,14 @@ "qnn_system": { "type": "PythonEnvironment", "python_environment_path": "", - "accelerators": [ - { - "execution_providers": [ - "QNNExecutionProvider" - ] - } - ] + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] } }, "data_configs": [ { "name": "gemma_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { - "type": "gemma_dataset", - "model_id": "google/gemma-3-4b-it" - } + "load_dataset_config": { "type": "gemma_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { @@ -38,42 +29,27 @@ "device": "cuda", "data_config": "gemma_data_config" }, - "cs": { - "type": "CaptureSplitInfo", - "num_splits": 2, - "unique_embeds_lm_head_splits": true - }, + "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true }, "mb": { "type": "ModelBuilder", "precision": "int4", "int4_block_size": 32, "int4_accuracy_level": 4, - "int4_op_types_to_quantize": [ - "MatMul", - "Gather" - ] + "int4_op_types_to_quantize": [ "MatMul", "Gather" ] }, "mq": { "type": "MatMulNBitsToQDQ", "use_int4": true, "add_zero_point": true, - "nodes_to_exclude": [ - "/lm_head/MatMul_Q4" - ], + "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ], "save_as_external_data": true }, "gs": { "type": "GraphSurgeries", "surgeries": [ - { - "surgeon": "RemoveRopeMultiCache" - }, - { - "surgeon": "AttentionMaskToSequenceLengths" - }, - { - "surgeon": "SimplifiedLayerNormToL2Norm" - } + { "surgeon": "RemoveRopeMultiCache" }, + { "surgeon": "AttentionMaskToSequenceLengths" }, + { "surgeon": "SimplifiedLayerNormToL2Norm" } ], "save_as_external_data": true }, @@ -82,25 +58,13 @@ "data_config": "gemma_data_config", "activation_type": "uint16", "precision": "uint8", - "calibration_providers": [ - "CUDAExecutionProvider" - ], + "calibration_providers": [ "CUDAExecutionProvider" ], "quant_preprocess": true, - "op_types_to_exclude": [ - "GatherBlockQuantized", - "GroupQueryAttention", - "MatMulNBits" - ], + "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ], 
"save_as_external_data": true }, - "sp": { - "type": "SplitModel" - }, - "st": { - "type": "StaticLLM", - "batch_size": 1, - "context_length": 64 - }, + "sp": { "type": "SplitModel" }, + "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 }, "cb": { "type": "EPContextBinaryGenerator", "provider_options": { @@ -110,19 +74,14 @@ "htp_arch": "v73", "soc_model": "60" }, - "session_options": { - "intra_op_num_threads": 2, - "inter_op_num_threads": 1 - }, + "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 }, "weight_sharing": true }, - "cp": { - "type": "ComposeOnnxModels" - } + "cp": { "type": "ComposeOnnxModels" } }, "target": "qnn_system", "log_severity_level": 0, "output_dir": "models/gemma3_qnn", "cache_dir": "cache", "no_artifacts": true -} \ No newline at end of file +} diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index e252381ab..b15d6185f 100755 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -4,68 +4,30 @@ "model_script": "custom_gemma3_4b_vision.py", "model_loader": "load_gemma3_vision_model", "io_config": { - "input_names": [ - "pixel_values" - ], - "input_shapes": [ - [ - 1, - 3, - 896, - 896 - ] - ], - "input_types": [ - "float32" - ], - "output_names": [ - "image_features" - ], - "output_shapes": [ - [ - 1, - 256, - 2560 - ] - ] + "input_names": [ "pixel_values" ], + "input_shapes": [ [ 1, 3, 896, 896 ] ], + "input_types": [ "float32" ], + "output_names": [ "image_features" ], + "output_shapes": [ [ 1, 256, 2560 ] ] } }, "systems": { "qnn_system": { "type": "PythonEnvironment", "python_environment_path": "", - "accelerators": [ - { - "execution_providers": [ - "QNNExecutionProvider" - ] - } - ] + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] } }, "data_configs": [ { "name": "gemma_vision_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { - "type": "gemma_image_dataset", - "model_id": "google/gemma-3-4b-it" - } + "load_dataset_config": { "type": "gemma_image_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { - "conversion": { - "type": "OnnxConversion", - "target_opset": 20 - }, - "surgery": { - "type": "GraphSurgeries", - "surgeries": [ - { - "surgeon": "MatMulAddToGemm" - } - ] - }, + "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] }, "quantization": { "type": "OnnxStaticQuantization", "quant_preprocess": true, @@ -73,9 +35,7 @@ "activation_type": "uint16", "precision": "uint8", "calibrate_method": "MinMax", - "calibration_providers": [ - "CUDAExecutionProvider" - ], + "calibration_providers": [ "CUDAExecutionProvider" ], "per_channel": true, "weight_symmetric": true }, @@ -89,14 +49,11 @@ "soc_model": "60" } }, - "add_metadata": { - "type": "AddOliveMetadata", - "graph_name": "gemma-3-4b-it-vision" - } + "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } }, "target": "qnn_system", "log_severity_level": 1, "output_dir": "models/gemma-3-4b-it-vision", "cache_dir": "cache-vision", "no_artifacts": true -} \ No newline at end of file +} diff --git a/examples/gemma3/qnn/genai/app.py b/examples/gemma3/qnn/genai/app.py index e83d6420f..0b5da39c3 100644 --- a/examples/gemma3/qnn/genai/app.py +++ b/examples/gemma3/qnn/genai/app.py @@ -4,13 +4,14 @@ import argparse import glob import json 
+import logging import os import time from pathlib import Path import onnxruntime_genai as og -# og.set_log_options(enabled=True, model_input_values=True, model_output_values=True) +logger = logging.getLogger(__name__) def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): @@ -30,15 +31,15 @@ def _complete(text, state): def run(args: argparse.Namespace): - print("Loading model...") + logger.info("Loading model...") config = og.Config(args.model_path) if args.execution_provider != "follow_config": config.clear_providers() if args.execution_provider != "cpu": - print(f"Setting model to {args.execution_provider}...") + logger.info(f"Setting model to {args.execution_provider}...") config.append_provider(args.execution_provider) model = og.Model(config) - print("Model loaded") + logger.info("Model loaded") tokenizer = og.Tokenizer(model) processor = model.create_multimodal_processor() @@ -68,16 +69,15 @@ def run(args: argparse.Namespace): image_paths = [str(Path(__file__).parent / "images" / "dog.jpg")] image_paths = [image_path for image_path in image_paths if image_path] - print(image_paths) images = None if len(image_paths) == 0: - print("No image provided") + logger.info("No image provided") else: for i, image_path in enumerate(image_paths): if not os.path.exists(image_path): raise FileNotFoundError(f"Image file not found: {image_path}") - print(f"Using image: {image_path}") + logger.info(f"Using image: {image_path}") images = og.Images.open(*image_paths) @@ -103,18 +103,15 @@ def run(args: argparse.Namespace): # Apply the chat template using the tokenizer message_json = json.dumps(messages) - print(message_json) prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True) - print("Processing images and prompt...") + logger.info("Processing images and prompt...") inputs = processor(prompt, images=images) - print("Generating response...") + logger.info("Generating response...") params = og.GeneratorParams(model) params.set_search_options(max_length=1024) - print(inputs) - generator = og.Generator(model, params) generator.set_inputs(inputs) start_time = time.time() @@ -123,14 +120,10 @@ def run(args: argparse.Namespace): generator.generate_next_token() new_token = generator.get_next_tokens()[0] - print(stream.decode(new_token), end="", flush=True) + logger.info(stream.decode(new_token), end="", flush=True) - print() total_run_time = time.time() - start_time - print(f"Total Time : {total_run_time:.2f}") - - for _ in range(3): - print() + logger.info(f"Total Time : {total_run_time:.2f}") # Delete the generator to free the captured graph before creating another one del generator diff --git a/examples/gemma3/qnn/genai/genai_config.json b/examples/gemma3/qnn/genai/genai_config.json index a835fb863..754b33cd0 100755 --- a/examples/gemma3/qnn/genai/genai_config.json +++ b/examples/gemma3/qnn/genai/genai_config.json @@ -3,15 +3,11 @@ "bos_token_id": 2, "context_length": 131072, "decoder": { - "session_options": { - "log_id": "onnxruntime-genai", - "provider_options": [ - ] - }, + "session_options": { "log_id": "onnxruntime-genai", "provider_options": [ ] }, "head_size": 256, "hidden_size": 2560, "inputs": { - "input_ids":"input_ids", + "input_ids": "input_ids", "inputs_embeds": "inputs_embeds", "attention_mask": "attention_mask", "past_key_names": "past_key_values.%d.key", @@ -38,12 +34,8 @@ { "embeddings": { "filename": "embeddings.onnx", - "inputs": [ - "input_ids" - ], - "outputs": [ - "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" - ], + "inputs": [ 
"input_ids" ], + "outputs": [ "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" ], "run_on_prompt": false }, "context_ctx": { @@ -373,31 +365,20 @@ "inputs": [ "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" ], - "outputs": [ - "logits" - ] + "outputs": [ "logits" ] } } ] }, "embedding": { "filename": "embeddings_with_image.onnx", - "inputs": { - "input_ids": "input_ids", - "image_features": "image_features" - }, - "outputs": { - "inputs_embeds": "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" - } + "inputs": { "input_ids": "input_ids", "image_features": "image_features" }, + "outputs": { "inputs_embeds": "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" } }, "vision": { "filename": "model_ctx_vision.onnx", - "inputs": { - "pixel_values": "pixel_values" - }, - "outputs": { - "image_features": "image_features" - }, + "inputs": { "pixel_values": "pixel_values" }, + "outputs": { "image_features": "image_features" }, "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1, @@ -412,10 +393,7 @@ ] } }, - "eos_token_id": [ - 1, - 106 - ], + "eos_token_id": [ 1, 106 ], "pad_token_id": 0, "type": "gemma3", "vocab_size": 262208 @@ -436,4 +414,4 @@ "top_k": 64, "top_p": 0.95 } -} \ No newline at end of file +} diff --git a/examples/gemma3/qnn/genai/processor_config.json b/examples/gemma3/qnn/genai/processor_config.json index d1c66b6ce..b25059aa2 100755 --- a/examples/gemma3/qnn/genai/processor_config.json +++ b/examples/gemma3/qnn/genai/processor_config.json @@ -1,53 +1,24 @@ { - "processor": { - "name": "gemma_3_image_processing", - "transforms": [ - { - "operation": { - "name": "decode_image", - "type": "DecodeImage", - "attrs": { - "color_space": "RGB" - } - } - }, - { - "operation": { - "name": "resize", - "type": "Resize", - "attrs": { - "interpolation": "CUBIC", - "width": 896, - "height": 896, - "keep_aspect_ratio": 0 - } - } - }, - { - "operation": { - "name": "re-scale", - "type": "Rescale" - } - }, - { - "operation": { - "name": "normalize", - "type": "Normalize", - "attrs": { - "mean": [0.5, 0.5, 0.5], - "std": [0.5, 0.5, 0.5] - } - } - }, - { - "operation": { - "name": "to_channel_first", - "type": "Permute3D", - "attrs": { - "dims": [2, 0, 1] - } - } - } - ] - } -} \ No newline at end of file + "processor": { + "name": "gemma_3_image_processing", + "transforms": [ + { "operation": { "name": "decode_image", "type": "DecodeImage", "attrs": { "color_space": "RGB" } } }, + { + "operation": { + "name": "resize", + "type": "Resize", + "attrs": { "interpolation": "CUBIC", "width": 896, "height": 896, "keep_aspect_ratio": 0 } + } + }, + { "operation": { "name": "re-scale", "type": "Rescale" } }, + { + "operation": { + "name": "normalize", + "type": "Normalize", + "attrs": { "mean": [ 0.5, 0.5, 0.5 ], "std": [ 0.5, 0.5, 0.5 ] } + } + }, + { "operation": { "name": "to_channel_first", "type": "Permute3D", "attrs": { "dims": [ 2, 0, 1 ] } } } + ] + } +} diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt index 3d365f2bb..337d1987d 100644 --- a/examples/gemma3/requirements.txt +++ b/examples/gemma3/requirements.txt @@ -1,10 +1,10 @@ -transformers datasets -optimum -onnxruntime-gpu==1.22.0 +onnx==1.18.0 +onnx-ir==0.1.4 onnxruntime-genai-cuda==0.9.0 +onnxruntime-gpu==1.22.0 +onnxscript==0.3.2 +optimum setuptools tabulate -onnx==1.18.0 -onnx-ir==0.1.4 -onnxscript==0.3.2 \ No newline at end of file +transformers diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py index 
752bf4990..dee79e6e8 100644 --- a/olive/common/hf/utils.py +++ b/olive/common/hf/utils.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +import importlib import logging from pathlib import Path from typing import TYPE_CHECKING, Optional, Union @@ -11,7 +12,6 @@ from olive.common.hf.mappings import TASK_TO_PEFT_TASK_TYPE from olive.common.hf.mlflow import get_pretrained_name_or_path from olive.common.utils import hardlink_copy_file -import importlib if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -19,7 +19,13 @@ logger = logging.getLogger(__name__) -def load_model_from_task(task: str, model_name_or_path: str, custom_task_class_name:str = None, custom_task_class_module:str = None, **kwargs) -> "PreTrainedModel": +def load_model_from_task( + task: str, + model_name_or_path: str, + custom_task_class_name: str = None, + custom_task_class_module: str = None, + **kwargs, +) -> "PreTrainedModel": """Load huggingface model from task and model_name_or_path.""" from transformers.pipelines import check_task @@ -56,7 +62,7 @@ def load_model_from_task(task: str, model_name_or_path: str, custom_task_class_n AUTO_QUANTIZATION_CONFIG_MAPPING["olive"] = OliveHfQuantizationConfig AUTO_QUANTIZER_MAPPING["olive"] = OliveHfQuantizer - if (custom_task_class_module is not None and custom_task_class_name is not None): + if custom_task_class_module is not None and custom_task_class_name is not None: module = importlib.import_module(custom_task_class_module) class_tuple = (getattr(module, custom_task_class_name),) else: diff --git a/olive/model/handler/hf.py b/olive/model/handler/hf.py index 396a36d2a..343c84f77 100644 --- a/olive/model/handler/hf.py +++ b/olive/model/handler/hf.py @@ -39,7 +39,7 @@ def __init__( adapter_path: OLIVE_RESOURCE_ANNOTATIONS = None, model_attributes: Optional[dict[str, Any]] = None, custom_task_class_name: str = None, - custom_task_class_module: str = None + custom_task_class_module: str = None, ): super().__init__( model_file_format=None, @@ -76,7 +76,13 @@ def load_model(self, rank: int = None, cache_model: bool = True) -> "torch.nn.Mo if self.model: model = self.model else: - model = load_model_from_task(self.task, self.model_path, self.custom_task_class_name, self.custom_task_class_module, **self.get_load_kwargs()) + model = load_model_from_task( + self.task, + self.model_path, + self.custom_task_class_name, + self.custom_task_class_module, + **self.get_load_kwargs(), + ) # we only have peft adapters for now if self.adapter_path: From a0bd7031557ae1e7cdad4cbbfb8e3db0bb036e74 Mon Sep 17 00:00:00 2001 From: Alahari Prudhvi Akhil Date: Tue, 9 Sep 2025 04:39:27 -0700 Subject: [PATCH 22/24] Add system requirements --- examples/gemma3/qnn/gemma-3-4b.ipynb | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/gemma3/qnn/gemma-3-4b.ipynb b/examples/gemma3/qnn/gemma-3-4b.ipynb index 7b36c9cf5..42890860e 100755 --- a/examples/gemma3/qnn/gemma-3-4b.ipynb +++ b/examples/gemma3/qnn/gemma-3-4b.ipynb @@ -14,6 +14,19 @@ "- Convert Embedding layer with image to QNN format\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Platform requirements\n", + "This notebook is intended to run on a machine with:\n", + " * **Operating System**: Linux Ubuntu 22.04 (automated setup script is Linux-only)\n", + " * **Python**: 3.10\n", + " * NVIDIA driver 
version equivalent to 525.60.13\n", + " * NVIDIA A100 GPU\n", + " * **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically)" + ] + }, { "cell_type": "markdown", "metadata": {}, From f685073f15bd0b19fdc269803eabe8be2c58b554 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Thu, 18 Sep 2025 10:39:36 -0700 Subject: [PATCH 23/24] Remove examples --- examples/gemma3/qnn/README.md | 122 ---- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 526 ------------------ .../gemma3/qnn/custom_gemma3_4b_embedding.py | 37 -- .../gemma3/qnn/custom_gemma3_4b_vision.py | 36 -- examples/gemma3/qnn/env_setup.sh | 28 - examples/gemma3/qnn/gemma-3-4b.ipynb | 379 ------------- .../qnn/gemma3-4b-embedding-qnn-config.json | 51 -- .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 87 --- .../qnn/gemma3-4b-vision-qnn-config.json | 59 -- examples/gemma3/qnn/genai/app.py | 163 ------ examples/gemma3/qnn/genai/genai_config.json | 417 -------------- .../gemma3/qnn/genai/processor_config.json | 24 - examples/gemma3/qnn/qnn_req.txt | 7 - examples/gemma3/requirements.txt | 10 - 14 files changed, 1946 deletions(-) delete mode 100644 examples/gemma3/qnn/README.md delete mode 100755 examples/gemma3/qnn/custom_gemma3_4b_datasets.py delete mode 100644 examples/gemma3/qnn/custom_gemma3_4b_embedding.py delete mode 100644 examples/gemma3/qnn/custom_gemma3_4b_vision.py delete mode 100644 examples/gemma3/qnn/env_setup.sh delete mode 100755 examples/gemma3/qnn/gemma-3-4b.ipynb delete mode 100755 examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json delete mode 100755 examples/gemma3/qnn/gemma3-4b-text-qnn-config.json delete mode 100755 examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json delete mode 100644 examples/gemma3/qnn/genai/app.py delete mode 100755 examples/gemma3/qnn/genai/genai_config.json delete mode 100755 examples/gemma3/qnn/genai/processor_config.json delete mode 100755 examples/gemma3/qnn/qnn_req.txt delete mode 100644 examples/gemma3/requirements.txt diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md deleted file mode 100644 index 93c347fbe..000000000 --- a/examples/gemma3/qnn/README.md +++ /dev/null @@ -1,122 +0,0 @@ -# Gemma-3-4B Model Optimization - -This repository demonstrates the optimization of the [Google Gemma-3-4B](https://huggingface.co/google/gemma-3-4b-it) model using **post-training quantization (PTQ)** techniques for QNN (Qualcomm Neural Network) execution. 
The optimization process utilizes an environment based heavily upon the [PTQ tutorial for Phi-3.5](https://github.com/CodeLinaro/Olive/blob/main/examples/phi3_5/README.md) - -## File Overview - -This example contains the following key files: - -- **`env_setup.sh`** - Automated environment setup script (Linux only) -- **`gemma3-4b-text-qnn-config.json`** - Olive configuration for optimizing the text component -- **`gemma3-4b-vision-qnn-config.json`** - Olive configuration for optimizing the vision component -- **`user_script.py`** - Dataset handling and preprocessing utilities -- **`custom_gemma3_4b_it_vision.py`** - Vision model loader for the optimization pipeline - -## Prerequisites - -### System Requirements -- **Operating System**: Linux (automated setup script is Linux-only) -- **Python**: 3.10 -- **Package Manager**: [uv](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) -- **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically) - -### Dependencies Installed by Setup Script -The `env_setup.sh` script installs the following components: -- setuptools (for building Olive from source) -- Olive requirements and dependencies -- AutoGPTQ (from source) -- GPTQModel (specific commit: `558449bed3ef2653c36041650d30da6bbbca440d`) -- onnxruntime-qnn (pre-release version) - -## Setup Instructions - -### Automated Setup (Recommended) -```bash -source env_setup.sh -``` - -### Manual Setup (Alternative) -If you prefer to set up manually or need to troubleshoot: - -1. Install setuptools: - ```bash - uv pip install setuptools - ``` - -2. Install requirements: - ```bash - uv pip install -r ../requirements.txt - uv pip install -r ../../../requirements.txt - ``` - -3. Install AutoGPTQ from source: - ```bash - export BUILD_CUDA_EXT=0 - uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git - ``` - -4. Install GPTQModel with Gemma3 fix: - ```bash - uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d - ``` - -5. Install onnxruntime-qnn: - ```bash - uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt - uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps - ``` - -> **Important:** The setup uses a specific commit hash for GPTQModel (`558449bed3ef2653c36041650d30da6bbbca440d`) to address a [memory leak issue](https://github.com/ModelCloud/GPTQModel/commit/558449bed3ef2653c36041650d30da6bbbca440d) with Gemma3 models. - -## Optimization Process - -Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work together at the onnxruntime-genai stage. 
- -### Configuration Differences - -**Text Configuration (`gemma3-4b-text-qnn-config.json`)**: -- Uses HuggingFace model directly (`google/gemma-3-4b-it`) -- Applies comprehensive optimization pipeline: QuaRot → GptqModel → ModelBuilder → Quantization -- Outputs to: `models/gemma-3-4b-it-text/` - -**Vision Configuration (`gemma3-4b-vision-qnn-config.json`)**: -- Uses custom PyTorch model loader (`custom_gemma3_4b_it_vision.py`) -- Simpler pipeline: ONNX Conversion → Graph Surgery → Quantization -- Outputs to: `models/gemma-3-4b-it-vision/` - -### Running Optimization - -Execute the following commands to separately produce optimized binaries for each component: - -```bash -olive run --config gemma3-4b-text-qnn-config.json -``` - -```bash -olive run --config gemma3-4b-vision-qnn-config.json -``` - -## Expected Outputs - -After successful optimization, you will find: - -- **Text model outputs**: `models/gemma-3-4b-it-text/` -- **Vision model outputs**: `models/gemma-3-4b-it-vision/` -- **Cache directory**: `cache/` (intermediate files and downloaded datasets) -- **Dataset**: `.cache/train2017/` (COCO train2017 images, ~13GB) - -Both configurations use `"no_artifacts": true`, meaning only the final optimized models are retained. - -## Troubleshooting - -### Common Issues - -**Insufficient Storage**: The COCO train2017 dataset requires ~13GB of storage and is downloaded automatically to `.cache/train2017/`. - -**Memory Requirements**: The optimization process, particularly for the text model with its comprehensive pipeline, requires substantial memory. - -**QNN Provider**: Ensure the QNNExecutionProvider is properly installed and configured in your environment. - -**Platform Limitation**: The current setup script is designed for Linux only. Windows/macOS users will need to adapt the manual setup steps. - -**Dataset Download**: If the COCO dataset download fails, check your internet connection and available storage. The script uses `wget` which must be available on your system. diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py deleted file mode 100755 index 77751530b..000000000 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ /dev/null @@ -1,526 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -import copy -import logging -import os -import subprocess -import zipfile -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Optional - -import torch -from datasets import load_dataset -from huggingface_hub import hf_hub_download -from PIL import Image as PILImage -from transformers import ( - AutoModel, - AutoProcessor, - AutoTokenizer, -) - -from olive.data.registry import Registry - -logger = logging.getLogger(__name__) - - -class BaseGemmaDataset(ABC): - """Abstract base class for Gemma dataset implementations.""" - - CACHE_DIR = os.getenv("CACHE_DIR", ".cache") - - def __init__(self, model_id: str, first_n: Optional[int] = None): - self.model_id = model_id - self.first_n = first_n - self.processor = AutoProcessor.from_pretrained(self.model_id) - - # Initialize attributes that will be set during dataset loading - self.image_data_path = None - self.raw_datasets = None - - # Initialize processor components based on subclass requirements - self._initialize_processor_components() - - self.setup_dataset() - - @abstractmethod - def _initialize_processor_components(self): - """Initialize processor components specific to the dataset type.""" - - @abstractmethod - def _process_dataset_entry(self, entry: dict[str, any]): - """Process a single dataset entry according to the dataset type.""" - - def _convert_single_llava_to_gemma_conversation( - self, conversation: list[dict[str, str]], strip_images: bool = False - ) -> dict[str, str | list[dict]]: - """Convert a single llava-style conversation entry to Gemma-style. - - Args: - conversation: The conversation entry to convert - strip_images: If True, remove tokens and create text-only content. - If False, preserve tokens and create multimodal content. 
- - Examples: - >>> conversation = {"from": "human", "value": "What are the colors of the bus in the image?"} - >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=False) - { - 'role': 'user', - 'content': [{'type': 'image'}, {'type': 'text', 'text': 'What are the colors of the bus in the image?'}] - } - >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=True) - { - 'role': 'user', - 'content': [{'type': 'text', 'text': 'What are the colors of the bus in the image?'}] - } - - """ - who = conversation.get("from") - match who: - case "human": - role = "user" - case "gpt": - role = "assistant" - case _: - raise ValueError(f"Unknown role: {who}") - - text = conversation.get("value") - - if strip_images: - # Text-only: remove image references completely - text = text.replace("", "").strip() - return { - "role": role, - "content": [{"type": "text", "text": text}], - } - else: - # Multimodal: preserve image references - if "" in text: - has_image = True - text = text.replace("", "") - else: - has_image = False - - return { - "role": role, - "content": ( - [{"type": "image"}, {"type": "text", "text": text}] - if has_image - else [{"type": "text", "text": text}] - ), - } - - def _convert_llava_to_gemma_conversation(self, entry: dict[str, any], strip_images: bool = False): - """Convert LlaVA-style conversations to Gemma-style.""" - entry["text"] = [ - self._convert_single_llava_to_gemma_conversation(conversation, strip_images=strip_images) - for conversation in entry["conversations"] - ] - del entry["conversations"] - return entry - - def _download_and_extract_images(self): - """Download the COCO train2017 image dataset and extract to the cache directory.""" - zip_filename = "train2017.zip" - zip_path = os.path.join(self.CACHE_DIR, zip_filename) - extract_path = os.path.join(self.CACHE_DIR, "train2017") - - # Create cache directory if it doesn't exist - os.makedirs(self.CACHE_DIR, exist_ok=True) - - # Check if images are already downloaded and extracted - extract_path_obj = Path(extract_path) - if extract_path_obj.exists() and any(extract_path_obj.iterdir()): - logger.info("Images already exist at %s", extract_path) - return extract_path - - # Download the dataset if zip doesn't exist - if not os.path.exists(zip_path): - logger.info("Downloading COCO train2017 dataset to %s", zip_path) - try: - subprocess.run( - [ - "wget", - "https://images.cocodataset.org/zips/train2017.zip", - "--no-check-certificate", - "-O", - zip_path, - ], - check=True, - ) - logger.info("Download completed successfully") - except subprocess.CalledProcessError: - logger.exception("Failed to download dataset") - raise - except FileNotFoundError: - logger.exception("wget command not found. 
Please install wget or use an alternative download method.") - raise - - # Extract the zip file - logger.info("Extracting %s to %s", zip_path, self.CACHE_DIR) - try: - with zipfile.ZipFile(zip_path, "r") as zip_ref: - zip_ref.extractall(self.CACHE_DIR) - logger.info("Extraction completed successfully") - except zipfile.BadZipFile: - logger.exception("Failed to extract zip file") - # Remove corrupted zip file so it can be re-downloaded - if os.path.exists(zip_path): - os.remove(zip_path) - raise - - return extract_path - - def _load_base_dataset(self): - """Load the base LlaVA dataset.""" - # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K - file_path = hf_hub_download( - repo_id="liuhaotian/LLaVA-Instruct-150K", - filename="llava_instruct_80k.json", - repo_type="dataset", - cache_dir=self.CACHE_DIR, - ) - - self.image_data_path = self._download_and_extract_images() - self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") - - # Limit data processing to the first_n rows - self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) - - def _extract_image_details(self, entry: dict[str, any]): - """Extract image details from the dataset example. - - Opens the image file and adds image mode information to the example. - """ - image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) - entry["image_mode"] = image.mode - return entry - - def setup_dataset(self): - """Set up the dataset with common preprocessing steps.""" - self._load_base_dataset() - - # Extract image details - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - - # Apply dataset-specific processing - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - - def get_dataset(self): - """Return the processed dataset.""" - return self.raw_datasets - - -class GemmaMultimodalDataset(BaseGemmaDataset): - """Dataset for full E2E Gemma 3 multi-modal model including both image and text.""" - - def _initialize_processor_components(self): - """Initialize tokenizer for multimodal processing.""" - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True - ) - - def setup_dataset(self): - """Set up the multimodal dataset with text conversation conversion.""" - self._load_base_dataset() - - # Convert the Llava-style conversation to Gemma-style conversation (preserve images) - self.raw_datasets = self.raw_datasets.map( - lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) - ) - - # Extract image details - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - - # Apply multimodal processing - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - - def _process_dataset_entry(self, entry: dict[str, any]): - """Load image and tokenize the conversation for model input. 
- - Args: - entry: Dataset entry containing text conversation and image path - - Returns: - Tokenized inputs ready for model processing - - """ - return self.processor.apply_chat_template( - entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True - ) - - -class GemmaTextOnlyDataset(BaseGemmaDataset): - """Dataset for only the text portion of the Gemma 3 model.""" - - def _initialize_processor_components(self): - """Initialize tokenizer for text-only processing.""" - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True - ) - - def setup_dataset(self): - """Set up the text-only dataset with conversation conversion.""" - self._load_base_dataset() - - # Convert the Llava-style conversation to Gemma-style conversation (strip images) - self.raw_datasets = self.raw_datasets.map( - lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=True) - ) - - # Extract image details (still needed for filtering) - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - - # Apply text-only processing - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - - def _process_dataset_entry(self, entry: dict[str, any]): - """Extract and tokenize only the text content. - - Args: - entry: Dataset entry containing text conversation - - Returns: - Tokenized text inputs ready for model processing - - """ - # Apply chat template without images, text-only - inputs = self.tokenizer.apply_chat_template( - entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True - ) - return {k: v.squeeze(0) for k, v in inputs.items()} # Remove batch dimension - - -class GemmaImageDataset(BaseGemmaDataset): - """Dataset for only the image processing of the Gemma 3 model.""" - - def _initialize_processor_components(self): - """No additional components needed for image-only processing.""" - - def _process_dataset_entry(self, entry: dict[str, any]): - """Load image and extract only pixel_values for image-only processing.""" - # Load and process the image - image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) - - # Process image to get pixel_values - inputs = self.processor(text="", images=image, return_tensors="pt") - - # Return only pixel_values - return {"pixel_values": inputs["pixel_values"]} - - -class GemmaEmbeddingInputDataset(BaseGemmaDataset): - """Dataset that is the input to the embedding layer.""" - - def __init__(self, model_id, first_n=None): - # Initialize lazy-loaded model components - self._vision_tower = None - self._multi_modal_projector = None - - super().__init__(model_id, first_n) - - def _initialize_processor_components(self): - """Initialize only standard processor components.""" - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True - ) - - def _get_vision_components(self): - """Lazy-load vision model components when first needed.""" - if self._vision_tower is None: - logger.info("Loading vision model components for cached embedding dataset") - full_model = AutoModel.from_pretrained(self.model_id) - - # Extract vision components (equivalent to Gemma3VisualEmbeddingGenerator) - self._vision_tower = full_model.vision_tower - self._multi_modal_projector = 
full_model.multi_modal_projector - - # Clean up full model to save memory - del full_model.language_model - - return self._vision_tower.cuda(), self._multi_modal_projector.cuda() - - def setup_dataset(self): - """Set up the multimodal dataset with text conversation conversion.""" - self._load_base_dataset() - - # Convert the Llava-style conversation to Gemma-style conversation (preserve images) - self.raw_datasets = self.raw_datasets.map( - lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) - ) - - # Extract image details - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - - # Apply multimodal processing - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - - def _process_dataset_entry(self, entry: dict[str, any]): - """Process entry to return input_ids and cached image features.""" - # Convert conversation and tokenize - inputs = self.processor.apply_chat_template( - entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True - ) - - # Load and process image - image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) - pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) - - # Get vision components and extract features - vision_tower, projector = self._get_vision_components() - pixel_values = pixel_values.to(device="cuda") - - with torch.no_grad(): - # Process through vision tower - image_outputs = vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_outputs.last_hidden_state - # Project to final embedding space - image_features = projector(selected_image_feature) - # Convert to numpy for caching - image_features = image_features.cpu().detach().numpy() - - return {"input_ids": inputs["input_ids"], "image_features": image_features} - - -class GemmaEmbeddingDataset(BaseGemmaDataset): - """Dataset that pre-merges text and image embeddings.""" - - def __init__(self, model_id, first_n=None): - # Initialize lazy-loaded model components - self._vision_tower = None - self._multi_modal_projector = None - self._embedding_layer = None - - super().__init__(model_id, first_n) - - def _initialize_processor_components(self): - """Initialize only standard processor components.""" - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True - ) - - def _get_model_components(self): - """Lazy-load all required model components when first needed.""" - if self._embedding_layer is None: - logger.info("Loading model components for merged embedding dataset") - full_model = AutoModel.from_pretrained(self.model_id) - - # Extract components - self._vision_tower = full_model.vision_tower.cuda() - self._multi_modal_projector = full_model.multi_modal_projector.cuda() - self._embedding_layer = copy.deepcopy(full_model.language_model.embed_tokens).cuda() - - # Clean up full model - del full_model.language_model - - return self._vision_tower, self._multi_modal_projector, self._embedding_layer - - def _merge_embeddings(self, input_ids: torch.Tensor, pixel_values: torch.Tensor): - """Merge text and image embeddings at special token positions.""" - vision_tower, projector, embedding_layer = self._get_model_components() - - # Get text embeddings - inputs_embeds = embedding_layer(input_ids.to(device="cuda")) - - # Process 
image - pixel_values = pixel_values.to(dtype=inputs_embeds.dtype, device="cuda") - with torch.no_grad(): - image_outputs = vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_outputs.last_hidden_state - image_features = projector(selected_image_feature) - - # Merge at special token positions (image_token_index = 262144) - image_token_index = 262144 - special_image_mask = (input_ids == image_token_index).unsqueeze(-1) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) - - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - return inputs_embeds.masked_scatter(special_image_mask, image_features) - - def setup_dataset(self): - """Set up the multimodal dataset with text conversation conversion.""" - self._load_base_dataset() - - # Convert the Llava-style conversation to Gemma-style conversation (preserve images) - self.raw_datasets = self.raw_datasets.map( - lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) - ) - - # Extract image details - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - - # Apply multimodal processing - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - - def _process_dataset_entry(self, entry: dict[str, any]): - """Process entry to return merged embeddings.""" - # Convert conversation and tokenize - inputs = self.processor.apply_chat_template( - entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True - ) - - # Load and process image - image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) - pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) - - # Merge embeddings - inputs_embeds = self._merge_embeddings(inputs["input_ids"], pixel_values) - - return { - "input_ids": inputs["input_ids"], - "inputs_embeds": inputs_embeds, - "attention_mask": inputs["attention_mask"].squeeze(0), - } - - -# Remove this when submitting for review -TEXT_SHORTCUT_FIRST_N = 600 -SHORTCUT_FIRST_N = 200 - - -@Registry.register_dataset() -def gemma_dataset(model_id: str): - """Full E2E Gemma 3 multi-modal dataset (image + text).""" - return GemmaMultimodalDataset(model_id, first_n=TEXT_SHORTCUT_FIRST_N).get_dataset() - - -@Registry.register_dataset() -def gemma_text_dataset(model_id: str): - """Text-only Gemma 3 dataset.""" - return GemmaTextOnlyDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() - - -@Registry.register_dataset() -def gemma_image_dataset(model_id: str): - """Image-only Gemma 3 dataset.""" - return GemmaImageDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() - - -@Registry.register_dataset() -def gemma_embedding_input_dataset(model_id: str): - """Gemma 3 dataset with embedding layer input.""" - return GemmaEmbeddingInputDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() - - -@Registry.register_dataset() -def gemma_embedding_dataset(model_id: str): - """Gemma 3 dataset with pre-merged text and image embeddings.""" - return GemmaEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() diff --git a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py deleted file mode 100644 index 97c9cf2ea..000000000 --- a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py +++ /dev/null @@ -1,37 +0,0 @@ -# 
------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - - -import logging - -import torch -from transformers import AutoModel - -logger = logging.getLogger(__name__) - - -class EmbeddingLayer(torch.nn.Module): - def __init__(self, full_model): - super().__init__() - self.embedding_layer = full_model.language_model.embed_tokens - - def forward(self, input_ids, image_features): - image_token_index = 262144 - inputs_embeds = self.embedding_layer(input_ids) - - special_image_mask = (input_ids == image_token_index).unsqueeze(-1) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - return inputs_embeds.masked_scatter(special_image_mask, image_features) - - -def load_gemma3_embedding_model(model_path): - full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") - logger.info("Loaded full model: %s", full_model) - - embedding_layer = EmbeddingLayer(full_model) - - logger.info("Created embedding-only model: %s", embedding_layer) - return embedding_layer diff --git a/examples/gemma3/qnn/custom_gemma3_4b_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_vision.py deleted file mode 100644 index 1eb7f8f33..000000000 --- a/examples/gemma3/qnn/custom_gemma3_4b_vision.py +++ /dev/null @@ -1,36 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - - -import logging - -import torch -from transformers import AutoModel - -logger = logging.getLogger(__name__) - - -class Gemma3VisualEmbeddingGenerator(torch.nn.Module): - def __init__(self, full_model): - super().__init__() - # Extract only the vision components - self.vision_tower = full_model.vision_tower - self.multi_modal_projector = full_model.multi_modal_projector - - def forward(self, pixel_values): - # Process images through vision tower - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_outputs.last_hidden_state - # Project to final embedding space - return self.multi_modal_projector(selected_image_feature) - - -def load_gemma3_vision_model(model_path): - full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") - logger.info("Loaded full model: %s", full_model) - - vision_model = Gemma3VisualEmbeddingGenerator(full_model) - logger.info("Created vision-only model: %s", vision_model) - return vision_model diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh deleted file mode 100644 index aa117afc0..000000000 --- a/examples/gemma3/qnn/env_setup.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -# Installing setuptools to build Olive from source -uv pip install setuptools - -# Requires installation of uv -uv pip install -r ../requirements.txt - -# Require installation of Olive dependencies -uv pip install -r ../../../requirements.txt - -# Disable CUDA extension build -export BUILD_CUDA_EXT=0 - -# Install AutoGPTQ from source -uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git - -# Install GptqModel from source -# Note: Commit hash corresponds to commit which fixes Gemma 3 memory leak issue. See README.md for additional details. -uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d - -# Install onnxruntime-qnn without installing onnxruntime -uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt -uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps diff --git a/examples/gemma3/qnn/gemma-3-4b.ipynb b/examples/gemma3/qnn/gemma-3-4b.ipynb deleted file mode 100755 index 42890860e..000000000 --- a/examples/gemma3/qnn/gemma-3-4b.ipynb +++ /dev/null @@ -1,379 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Gemma 3 4B QNN model conversion with Olive \n", - "### Task: Text + Vision Generation 📝\n", - "\n", - "In this notebook, you'll:\n", - "- Download the required datasets\n", - "- Convert LLM to QNN format\n", - "- Convert Vision to QNN format\n", - "- Convert Embedding layer with image to QNN format\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Platform requirements\n", - "This notebook is intended to run on a machine with:\n", - " * **Operating System**: Linux Ubuntu 22.04 (automated setup script is Linux-only)\n", - " * **Python**: 3.10\n", - " * NVIDIA driver version equivalent to 525.60.13\n", - " * NVIDIA A100 GPU\n", - " * **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🐍 Python Virtual environments\n", - "Creates Olive and QNN python virtual environments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import venv\n", - "from pathlib import Path\n", - "import subprocess\n", - "import json\n", - "import shutil\n", - "import urllib.request\n", - "import onnx\n", - "from onnx import helper, TensorProto\n", - "import glob\n", - "\n", - "current_dir = os.getcwd()\n", - "MODEL=\"google/gemma-3-4b-it\"\n", - "OLIVE_PYTHON_PATH = './olive_venv'\n", - "OLIVE_PYTHON_BIN = './olive_venv/bin/python'\n", - "olive_pip_path = Path(OLIVE_PYTHON_PATH) / \"bin\" / \"pip\"\n", - "OLIVE_REPO_PATH = Path(\"../../../\")\n", - "OLIVE_REQ = \"../requirements.txt\"\n", - "QNN_REQ = \"./qnn_req.txt\"\n", - "\n", - "QNN_PYTHON_PATH = './qnn_venv'\n", - "QNN_PYTHON_BIN_PATH = './qnn_venv/bin'\n", - "qnn_pip_path = Path(QNN_PYTHON_PATH) / \"bin\" / \"pip\"\n", - "QNN_PYTHON_BIN_FULL_PATH = f\"{current_dir}/{QNN_PYTHON_BIN_PATH}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare Olive Python Environment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "if not 
os.path.exists(OLIVE_PYTHON_PATH):\n", - " print(\"Creating Olive Venv\")\n", - " builder = venv.EnvBuilder(with_pip=True)\n", - " builder.create(Path(OLIVE_PYTHON_PATH))\n", - "my_env = os.environ.copy()\n", - "my_env[\"BUILD_CUDA_EXT\"] = \"0\"\n", - "GPTQ=\"git+https://github.com/ModelCloud/GPTQModel.git\"\n", - "subprocess.check_call([str(olive_pip_path), \"install\", \"-U\", \"-r\" , OLIVE_REQ], env=my_env)\n", - "subprocess.check_call([str(olive_pip_path), \"install\", \"--no-build-isolation\", GPTQ], env=my_env)\n", - "subprocess.check_call([str(olive_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare QNN Python Environment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "if not os.path.exists(QNN_PYTHON_PATH):\n", - " print(\"Creating QNN Venv\")\n", - " builder = venv.EnvBuilder(with_pip=True)\n", - " builder.create(Path(QNN_PYTHON_PATH))\n", - "subprocess.check_call([str(qnn_pip_path), \"install\", \"--no-build-isolation\", \"-r\" , QNN_REQ], env=my_env)\n", - "subprocess.check_call([str(qnn_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])\n", - "subprocess.check_call([str(qnn_pip_path), \"install\", \"-U\", \"--pre\", \"--extra-index-url\",\n", - " \"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple\",\n", - " \"onnxruntime-qnn==1.23.0.dev20250815002\", \"--no-deps\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 🤗 Login to Hugging Face\n", - "To access models, you'll need to log in to Hugging Face with a [user access token](https://huggingface.co/docs/hub/security-tokens). The following command will walk you through the steps to log in:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!huggingface-cli login --token <>" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Apply a few patches to ONNX Runtime\n", - "\n", - "These patches are needed to run the Olive recipes for this model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "base_url = \"https://raw.githubusercontent.com/CodeLinaro/onnxruntime/326d9d30129bbad698e0306d24dcea0ec5a19e60\"\n", - "urls = [\n", - " base_url + \"/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py\",\n", - " base_url + \"/onnxruntime/python/tools/quantization/quant_utils.py\"\n", - "]\n", - "\n", - "destinations = [\n", - " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/execution_providers/qnn/quant_config.py\",\n", - " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/quant_utils.py\"\n", - "]\n", - "\n", - "for url, dest in zip(urls, destinations):\n", - " urllib.request.urlretrieve(url, dest)\n", - " print(f\"Downloaded and replaced: {dest}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run Olive Recipes" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**GPU utilization observed during the run**\n", - "\n", - "\t\ta. Text GPTQModel quantization: 12gb\n", - "\t\tb. Text Onnx static quantization: 41gb\n", - "\t\tc. Vision Onnx static quantization: 68gb\n", - " d. 
Embedding Onnx static quantization: 3gb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Clean Context binary directories if they exist\n", - "def clean_directory(path):\n", - " if os.path.exists(path):\n", - " for file in glob.glob(os.path.join(path, '*')):\n", - " if os.path.isfile(file):\n", - " os.remove(file)\n", - "dirs_to_clean = [\n", - " './models/gemma3_qnn/model/',\n", - " './models/gemma-3-4b-it-vision/model/',\n", - " './models/gemma-3-4b-it-embed/model/'\n", - "]\n", - "\n", - "for dir_path in dirs_to_clean:\n", - " clean_directory(dir_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1️⃣ LLM model generation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config_path = Path(f\"./gemma3-4b-text-qnn-config.json\")\n", - "with open(config_path, \"r\") as file:\n", - " data = json.load(file)\n", - "\n", - "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", - "data[\"input_model\"][\"model_path\"] = MODEL\n", - "\n", - "with open(config_path, \"w\") as file:\n", - " json.dump(data, file, indent=4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!./olive_venv/bin/olive run --config ./gemma3-4b-text-qnn-config.json" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2️⃣ Vision model Quantization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config_path = Path(f\"./gemma3-4b-vision-qnn-config.json\")\n", - "with open(config_path, \"r\") as file:\n", - " data = json.load(file)\n", - "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", - "\n", - "with open(config_path, \"w\") as file:\n", - " json.dump(data, file, indent=4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!./olive_venv/bin/olive run --config ./gemma3-4b-vision-qnn-config.json" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3️⃣ Embedding Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!./olive_venv/bin/olive run --config ./gemma3-4b-embedding-qnn-config.json" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Keep output of the embedding model as uint16 instead of float" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = onnx.load(\"./models/gemma-3-4b-it-embed/model/model.onnx\")\n", - "graph = model.graph\n", - "\n", - "last_node = graph.node[-1]\n", - "graph.node.remove(last_node)\n", - "previous_node_output = graph.node[-1].output[0]\n", - "new_output = helper.make_tensor_value_info(\n", - " name=previous_node_output,\n", - " elem_type=TensorProto.UINT16,\n", - " shape=[\"batch_size\", \"seq_length\", 2560]\n", - ")\n", - "graph.output.remove(graph.output[0])\n", - "graph.output.extend([new_output])\n", - "onnx.save(model, \"./models/gemma-3-4b-it-embed/model/embeddings_with_image.onnx\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare final ORT GenAI folder for on-device inference " - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!cp ./models/gemma-3-4b-it-embed/model/embeddings_with_image.onnx ./models/gemma3_qnn/model/\n", - "!cp ./models/gemma-3-4b-it-vision/model/model_ctx.onnx ./models/gemma3_qnn/model/model_ctx_vision.onnx \n", - "!cp ./models/gemma-3-4b-it-vision/model/model_ctx_qnn.bin ./models/gemma3_qnn/model/model_ctx_qnn.bin \n", - "!cp ./genai/*.* ./models/gemma3_qnn/model/\n", - "!ls -al ./models/gemma3_qnn/model/\n", - "\n", - "print(\"ORT GenAI inference setup: ./models/gemma3_qnn\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json deleted file mode 100755 index 360f0e2bb..000000000 --- a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "input_model": { - "type": "PyTorchModel", - "model_script": "custom_gemma3_4b_embedding.py", - "model_loader": "load_gemma3_embedding_model", - "io_config": { - "input_names": [ "input_ids", "image_features" ], - "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], - "input_types": [ "int64", "float32" ], - "output_names": [ "/model/embed_tokens/Mul/output_0" ], - "output_shapes": [ [ 1, 64, 2560 ] ], - "dynamic_axes": { - "input_ids": { "0": "batch_size", "1": "seq_length" }, - "image_features": { "0": "batch_size", "1": "image_tokens_length" } - } - } - }, - "systems": { - "local_system": { - "type": "LocalSystem", - "accelerators": [ { "device": "cpu", "execution_providers": [ "CPUExecutionProvider" ] } ] - } - }, - "data_configs": [ - { - "name": "gemma_embedding_data_config", - "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_embedding_input_dataset", "model_id": "google/gemma-3-4b-it" } - } - ], - "passes": { - "conversion": { "type": "OnnxConversion", "target_opset": 20 }, - "quantization": { - "type": "OnnxStaticQuantization", - "quant_preprocess": false, - "data_config": "gemma_embedding_data_config", - "activation_type": "uint16", - "precision": "uint8", - "calibrate_method": "MinMax", - "calibration_providers": [ "CUDAExecutionProvider" ], - "per_channel": true, - "weight_symmetric": true - }, - "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-embedding" } - }, - "target": "local_system", - "log_severity_level": 1, - "output_dir": "models/gemma-3-4b-it-embed", - "cache_dir": "cache-embd", - "no_artifacts": true -} diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json deleted file mode 100755 index 12fc5c8dc..000000000 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "input_model": { - "type": "HfModel", - "model_path": "google/gemma-3-4b-it", - "custom_task_class_name": "Gemma3ForCausalLM", - "custom_task_class_module": "transformers" - }, - "systems": { - "qnn_system": { - "type": "PythonEnvironment", - "python_environment_path": "", - "accelerators": [ 
{ "execution_providers": [ "QNNExecutionProvider" ] } ] - } - }, - "data_configs": [ - { - "name": "gemma_data_config", - "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_dataset", "model_id": "google/gemma-3-4b-it" } - } - ], - "passes": { - "g": { - "type": "GptqModel", - "bits": 4, - "sym": true, - "group_size": -1, - "lm_head": false, - "device": "cuda", - "data_config": "gemma_data_config" - }, - "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true }, - "mb": { - "type": "ModelBuilder", - "precision": "int4", - "int4_block_size": 32, - "int4_accuracy_level": 4, - "int4_op_types_to_quantize": [ "MatMul", "Gather" ] - }, - "mq": { - "type": "MatMulNBitsToQDQ", - "use_int4": true, - "add_zero_point": true, - "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ], - "save_as_external_data": true - }, - "gs": { - "type": "GraphSurgeries", - "surgeries": [ - { "surgeon": "RemoveRopeMultiCache" }, - { "surgeon": "AttentionMaskToSequenceLengths" }, - { "surgeon": "SimplifiedLayerNormToL2Norm" } - ], - "save_as_external_data": true - }, - "sq": { - "type": "OnnxStaticQuantization", - "data_config": "gemma_data_config", - "activation_type": "uint16", - "precision": "uint8", - "calibration_providers": [ "CUDAExecutionProvider" ], - "quant_preprocess": true, - "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ], - "save_as_external_data": true - }, - "sp": { "type": "SplitModel" }, - "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 }, - "cb": { - "type": "EPContextBinaryGenerator", - "provider_options": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "vtcm_mb": "8", - "htp_arch": "v73", - "soc_model": "60" - }, - "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 }, - "weight_sharing": true - }, - "cp": { "type": "ComposeOnnxModels" } - }, - "target": "qnn_system", - "log_severity_level": 0, - "output_dir": "models/gemma3_qnn", - "cache_dir": "cache", - "no_artifacts": true -} diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json deleted file mode 100755 index b15d6185f..000000000 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "input_model": { - "type": "PyTorchModel", - "model_script": "custom_gemma3_4b_vision.py", - "model_loader": "load_gemma3_vision_model", - "io_config": { - "input_names": [ "pixel_values" ], - "input_shapes": [ [ 1, 3, 896, 896 ] ], - "input_types": [ "float32" ], - "output_names": [ "image_features" ], - "output_shapes": [ [ 1, 256, 2560 ] ] - } - }, - "systems": { - "qnn_system": { - "type": "PythonEnvironment", - "python_environment_path": "", - "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] - } - }, - "data_configs": [ - { - "name": "gemma_vision_data_config", - "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_image_dataset", "model_id": "google/gemma-3-4b-it" } - } - ], - "passes": { - "conversion": { "type": "OnnxConversion", "target_opset": 20 }, - "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] }, - "quantization": { - "type": "OnnxStaticQuantization", - "quant_preprocess": true, - "data_config": "gemma_vision_data_config", - "activation_type": "uint16", - "precision": "uint8", - "calibrate_method": "MinMax", - "calibration_providers": [ "CUDAExecutionProvider" ], 
- "per_channel": true, - "weight_symmetric": true - }, - "cb": { - "type": "EPContextBinaryGenerator", - "provider_options": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "vtcm_mb": "8", - "htp_arch": "v73", - "soc_model": "60" - } - }, - "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } - }, - "target": "qnn_system", - "log_severity_level": 1, - "output_dir": "models/gemma-3-4b-it-vision", - "cache_dir": "cache-vision", - "no_artifacts": true -} diff --git a/examples/gemma3/qnn/genai/app.py b/examples/gemma3/qnn/genai/app.py deleted file mode 100644 index 0b5da39c3..000000000 --- a/examples/gemma3/qnn/genai/app.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License - -import argparse -import glob -import json -import logging -import os -import time -from pathlib import Path - -import onnxruntime_genai as og - -logger = logging.getLogger(__name__) - - -def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): - curr_path = Path(current_dir).absolute() - target_dir = glob.glob(target_dir_name, root_dir=curr_path) - if target_dir: - return Path(curr_path / target_dir[0]).absolute() - else: - if curr_path.parent == curr_path: - # Root dir - return None - return _find_dir_contains_sub_dir(curr_path / "..", target_dir_name) - - -def _complete(text, state): - return (glob.glob(text + "*") + [None])[state] - - -def run(args: argparse.Namespace): - logger.info("Loading model...") - config = og.Config(args.model_path) - if args.execution_provider != "follow_config": - config.clear_providers() - if args.execution_provider != "cpu": - logger.info(f"Setting model to {args.execution_provider}...") - config.append_provider(args.execution_provider) - model = og.Model(config) - logger.info("Model loaded") - - tokenizer = og.Tokenizer(model) - processor = model.create_multimodal_processor() - stream = processor.create_stream() - - interactive = not args.non_interactive - - while True: - if interactive: - try: - import readline - - readline.set_completer_delims(" \t\n;") - readline.parse_and_bind("tab: complete") - readline.set_completer(_complete) - except ImportError: - # Not available on some platforms. Ignore it. - pass - image_paths = [ - image_path.strip() - for image_path in input("Image Path (comma separated; leave empty if no image): ").split(",") - ] - else: - if args.image_paths: - image_paths = args.image_paths - else: - image_paths = [str(Path(__file__).parent / "images" / "dog.jpg")] - - image_paths = [image_path for image_path in image_paths if image_path] - - images = None - if len(image_paths) == 0: - logger.info("No image provided") - else: - for i, image_path in enumerate(image_paths): - if not os.path.exists(image_path): - raise FileNotFoundError(f"Image file not found: {image_path}") - logger.info(f"Using image: {image_path}") - - images = og.Images.open(*image_paths) - - if interactive: - text = input("Prompt: ") - else: - if args.prompt: - text = args.prompt - else: - text = "What is shown in this image?" 
- - # Construct the "messages" argument passed to apply_chat_template - messages = [] - if model.type == "phi3v": - # Combine all image tags and text into one user message - content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text - messages.append({"role": "user", "content": content}) - else: - # Gemma3-style multimodal: structured content - content_list = [{"type": "image"} for _ in image_paths] - content_list.append({"type": "text", "text": text}) - messages.append({"role": "user", "content": content_list}) - - # Apply the chat template using the tokenizer - message_json = json.dumps(messages) - prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True) - - logger.info("Processing images and prompt...") - inputs = processor(prompt, images=images) - - logger.info("Generating response...") - params = og.GeneratorParams(model) - params.set_search_options(max_length=1024) - - generator = og.Generator(model, params) - generator.set_inputs(inputs) - start_time = time.time() - - while not generator.is_done(): - generator.generate_next_token() - - new_token = generator.get_next_tokens()[0] - print(stream.decode(new_token), end="", flush=True) - - total_run_time = time.time() - start_time - logger.info(f"Total time: {total_run_time:.2f} seconds") - - # Delete the generator to free the captured graph before creating another one - del generator - - if not interactive: - break - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-m", "--model_path", type=str, default="", required=True, help="Path to the folder containing the model" - ) - parser.add_argument( - "-e", - "--execution_provider", - type=str, - required=False, - default="follow_config", - choices=["cpu", "cuda", "dml", "follow_config"], - help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", - ) - parser.add_argument( - "--image_paths", nargs="*", type=str, required=False, help="Path to the images, mainly for CI usage" - ) - parser.add_argument( - "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage" - ) - parser.add_argument( - "--non-interactive", - action=argparse.BooleanOptionalAction, - default=True, - required=False, - help="Non-interactive mode, mainly for CI usage", - ) - args = parser.parse_args() - run(args) diff --git a/examples/gemma3/qnn/genai/genai_config.json b/examples/gemma3/qnn/genai/genai_config.json deleted file mode 100755 index 754b33cd0..000000000 --- a/examples/gemma3/qnn/genai/genai_config.json +++ /dev/null @@ -1,417 +0,0 @@ -{ - "model": { - "bos_token_id": 2, - "context_length": 131072, - "decoder": { - "session_options": { "log_id": "onnxruntime-genai", "provider_options": [ ] }, - "head_size": 256, - "hidden_size": 2560, - "inputs": { - "input_ids": "input_ids", - "inputs_embeds": "inputs_embeds", - "attention_mask": "attention_mask", - "past_key_names": "past_key_values.%d.key", - "past_value_names": "past_key_values.%d.value", - "past_sequence_length": "past_seq_len", - "total_sequence_length": "total_seq_len" - }, - "outputs": { - "logits": "logits", - "present_key_names": "present.%d.key", - "present_value_names": "present.%d.value" - }, - "num_attention_heads": 8, - "num_hidden_layers": 34, - "num_key_value_heads": 4, - "sliding_window": { - "window_size": 64, - "slide_key_value_cache": false, - "slide_inputs": true, - "pad_value": 0, - "alignment": "left" - }, - "pipeline": [ - { - "embeddings": { - "filename": "embeddings.onnx", - "inputs": [ "input_ids" ], - "outputs": [ "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" ], - "run_on_prompt": false - }, - "context_ctx": { - "filename": "context_ctx.onnx", - "inputs": [ - "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output", - "past_key_values.0.key", - "past_key_values.0.value", - "past_seq_len", - "total_seq_len", - "past_key_values.1.key", - "past_key_values.1.value", - "past_key_values.2.key", - "past_key_values.2.value", - "past_key_values.3.key", - "past_key_values.3.value", - "past_key_values.4.key", - "past_key_values.4.value", - "past_key_values.5.key", - "past_key_values.5.value", - "past_key_values.6.key", - "past_key_values.6.value", - "past_key_values.7.key", - "past_key_values.7.value", - "past_key_values.8.key", - "past_key_values.8.value", - "past_key_values.9.key", - "past_key_values.9.value", - "past_key_values.10.key", - "past_key_values.10.value", - "past_key_values.11.key", - "past_key_values.11.value", - "past_key_values.12.key", - "past_key_values.12.value", - "past_key_values.13.key", - "past_key_values.13.value", - "past_key_values.14.key", - "past_key_values.14.value", - "past_key_values.15.key", - "past_key_values.15.value", - "past_key_values.16.key", - "past_key_values.16.value", - "past_key_values.17.key", - "past_key_values.17.value", - "past_key_values.18.key", - "past_key_values.18.value", - "past_key_values.19.key", - "past_key_values.19.value", - "past_key_values.20.key", - "past_key_values.20.value", - "past_key_values.21.key", - "past_key_values.21.value", - "past_key_values.22.key", - "past_key_values.22.value", - "past_key_values.23.key", - "past_key_values.23.value", - "past_key_values.24.key", - "past_key_values.24.value", - "past_key_values.25.key", - "past_key_values.25.value", - 
"past_key_values.26.key", - "past_key_values.26.value", - "past_key_values.27.key", - "past_key_values.27.value", - "past_key_values.28.key", - "past_key_values.28.value", - "past_key_values.29.key", - "past_key_values.29.value", - "past_key_values.30.key", - "past_key_values.30.value", - "past_key_values.31.key", - "past_key_values.31.value", - "past_key_values.32.key", - "past_key_values.32.value", - "past_key_values.33.key", - "past_key_values.33.value" - ], - "outputs": [ - "present.0.key", - "present.0.value", - "present.1.key", - "present.1.value", - "present.2.key", - "present.2.value", - "present.3.key", - "present.3.value", - "present.4.key", - "present.4.value", - "present.5.key", - "present.5.value", - "present.6.key", - "present.6.value", - "present.7.key", - "present.7.value", - "present.8.key", - "present.8.value", - "present.9.key", - "present.9.value", - "present.10.key", - "present.10.value", - "present.11.key", - "present.11.value", - "present.12.key", - "present.12.value", - "present.13.key", - "present.13.value", - "present.14.key", - "present.14.value", - "present.15.key", - "present.15.value", - "present.16.key", - "present.16.value", - "present.17.key", - "present.17.value", - "present.18.key", - "present.18.value", - "present.19.key", - "present.19.value", - "present.20.key", - "present.20.value", - "present.21.key", - "present.21.value", - "present.22.key", - "present.22.value", - "present.23.key", - "present.23.value", - "present.24.key", - "present.24.value", - "present.25.key", - "present.25.value", - "present.26.key", - "present.26.value", - "present.27.key", - "present.27.value", - "present.28.key", - "present.28.value", - "present.29.key", - "present.29.value", - "present.30.key", - "present.30.value", - "present.31.key", - "present.31.value", - "present.32.key", - "present.32.value", - "present.33.key", - "present.33.value", - "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" - ], - "session_options": { - "intra_op_num_threads": 2, - "inter_op_num_threads": 1, - "provider_options": [ - { - "qnn": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "soc_model": "60" - } - } - ] - }, - "run_on_token_gen": false - }, - "iterator_ctx": { - "filename": "iterator_ctx.onnx", - "inputs": [ - "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output", - "past_key_values.0.key", - "past_key_values.0.value", - "past_seq_len", - "total_seq_len", - "past_key_values.1.key", - "past_key_values.1.value", - "past_key_values.2.key", - "past_key_values.2.value", - "past_key_values.3.key", - "past_key_values.3.value", - "past_key_values.4.key", - "past_key_values.4.value", - "past_key_values.5.key", - "past_key_values.5.value", - "past_key_values.6.key", - "past_key_values.6.value", - "past_key_values.7.key", - "past_key_values.7.value", - "past_key_values.8.key", - "past_key_values.8.value", - "past_key_values.9.key", - "past_key_values.9.value", - "past_key_values.10.key", - "past_key_values.10.value", - "past_key_values.11.key", - "past_key_values.11.value", - "past_key_values.12.key", - "past_key_values.12.value", - "past_key_values.13.key", - "past_key_values.13.value", - "past_key_values.14.key", - "past_key_values.14.value", - "past_key_values.15.key", - "past_key_values.15.value", - "past_key_values.16.key", - "past_key_values.16.value", - "past_key_values.17.key", - "past_key_values.17.value", - "past_key_values.18.key", - "past_key_values.18.value", - "past_key_values.19.key", - 
"past_key_values.19.value", - "past_key_values.20.key", - "past_key_values.20.value", - "past_key_values.21.key", - "past_key_values.21.value", - "past_key_values.22.key", - "past_key_values.22.value", - "past_key_values.23.key", - "past_key_values.23.value", - "past_key_values.24.key", - "past_key_values.24.value", - "past_key_values.25.key", - "past_key_values.25.value", - "past_key_values.26.key", - "past_key_values.26.value", - "past_key_values.27.key", - "past_key_values.27.value", - "past_key_values.28.key", - "past_key_values.28.value", - "past_key_values.29.key", - "past_key_values.29.value", - "past_key_values.30.key", - "past_key_values.30.value", - "past_key_values.31.key", - "past_key_values.31.value", - "past_key_values.32.key", - "past_key_values.32.value", - "past_key_values.33.key", - "past_key_values.33.value" - ], - "outputs": [ - "present.0.key", - "present.0.value", - "present.1.key", - "present.1.value", - "present.2.key", - "present.2.value", - "present.3.key", - "present.3.value", - "present.4.key", - "present.4.value", - "present.5.key", - "present.5.value", - "present.6.key", - "present.6.value", - "present.7.key", - "present.7.value", - "present.8.key", - "present.8.value", - "present.9.key", - "present.9.value", - "present.10.key", - "present.10.value", - "present.11.key", - "present.11.value", - "present.12.key", - "present.12.value", - "present.13.key", - "present.13.value", - "present.14.key", - "present.14.value", - "present.15.key", - "present.15.value", - "present.16.key", - "present.16.value", - "present.17.key", - "present.17.value", - "present.18.key", - "present.18.value", - "present.19.key", - "present.19.value", - "present.20.key", - "present.20.value", - "present.21.key", - "present.21.value", - "present.22.key", - "present.22.value", - "present.23.key", - "present.23.value", - "present.24.key", - "present.24.value", - "present.25.key", - "present.25.value", - "present.26.key", - "present.26.value", - "present.27.key", - "present.27.value", - "present.28.key", - "present.28.value", - "present.29.key", - "present.29.value", - "present.30.key", - "present.30.value", - "present.31.key", - "present.31.value", - "present.32.key", - "present.32.value", - "present.33.key", - "present.33.value", - "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" - ], - "session_options": { - "intra_op_num_threads": 2, - "inter_op_num_threads": 1, - "provider_options": [ - { - "qnn": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "soc_model": "60" - } - } - ] - }, - "run_on_prompt": false - }, - "lm_head": { - "filename": "lm_head.onnx", - "inputs": [ - "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" - ], - "outputs": [ "logits" ] - } - } - ] - }, - "embedding": { - "filename": "embeddings_with_image.onnx", - "inputs": { "input_ids": "input_ids", "image_features": "image_features" }, - "outputs": { "inputs_embeds": "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" } - }, - "vision": { - "filename": "model_ctx_vision.onnx", - "inputs": { "pixel_values": "pixel_values" }, - "outputs": { "image_features": "image_features" }, - "session_options": { - "intra_op_num_threads": 2, - "inter_op_num_threads": 1, - "provider_options": [ - { - "qnn": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "soc_model": "60" - } - } - ] - } - }, - "eos_token_id": [ 1, 106 ], - "pad_token_id": 0, - "type": "gemma3", - 
"vocab_size": 262208 - }, - "search": { - "diversity_penalty": 0.0, - "do_sample": true, - "early_stopping": true, - "length_penalty": 1.0, - "max_length": 131072, - "min_length": 0, - "no_repeat_ngram_size": 0, - "num_beams": 1, - "num_return_sequences": 1, - "past_present_share_buffer": true, - "repetition_penalty": 1.0, - "temperature": 1.0, - "top_k": 64, - "top_p": 0.95 - } -} diff --git a/examples/gemma3/qnn/genai/processor_config.json b/examples/gemma3/qnn/genai/processor_config.json deleted file mode 100755 index b25059aa2..000000000 --- a/examples/gemma3/qnn/genai/processor_config.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "processor": { - "name": "gemma_3_image_processing", - "transforms": [ - { "operation": { "name": "decode_image", "type": "DecodeImage", "attrs": { "color_space": "RGB" } } }, - { - "operation": { - "name": "resize", - "type": "Resize", - "attrs": { "interpolation": "CUBIC", "width": 896, "height": 896, "keep_aspect_ratio": 0 } - } - }, - { "operation": { "name": "re-scale", "type": "Rescale" } }, - { - "operation": { - "name": "normalize", - "type": "Normalize", - "attrs": { "mean": [ 0.5, 0.5, 0.5 ], "std": [ 0.5, 0.5, 0.5 ] } - } - }, - { "operation": { "name": "to_channel_first", "type": "Permute3D", "attrs": { "dims": [ 2, 0, 1 ] } } } - ] - } -} diff --git a/examples/gemma3/qnn/qnn_req.txt b/examples/gemma3/qnn/qnn_req.txt deleted file mode 100755 index 05c845791..000000000 --- a/examples/gemma3/qnn/qnn_req.txt +++ /dev/null @@ -1,7 +0,0 @@ -coloredlogs -flatbuffers -numpy >= 1.21.6 -packaging -protobuf -sympy -transformers==4.55.2 diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt deleted file mode 100644 index 337d1987d..000000000 --- a/examples/gemma3/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -datasets -onnx==1.18.0 -onnx-ir==0.1.4 -onnxruntime-genai-cuda==0.9.0 -onnxruntime-gpu==1.22.0 -onnxscript==0.3.2 -optimum -setuptools -tabulate -transformers From 5dff1552e310329862ae450c35886984ac1ab2f4 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Thu, 18 Sep 2025 10:44:34 -0700 Subject: [PATCH 24/24] Fix review comments --- olive/common/hf/utils.py | 2 +- olive/model/handler/hf.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py index dee79e6e8..8359396b5 100644 --- a/olive/common/hf/utils.py +++ b/olive/common/hf/utils.py @@ -67,7 +67,7 @@ def load_model_from_task( class_tuple = (getattr(module, custom_task_class_name),) else: class_tuple = targeted_task["pt"] or (AutoModel,) - print("class_tuple", class_tuple) + model = None for i, model_class in enumerate(class_tuple): try: diff --git a/olive/model/handler/hf.py b/olive/model/handler/hf.py index 343c84f77..4e4bb917d 100644 --- a/olive/model/handler/hf.py +++ b/olive/model/handler/hf.py @@ -91,9 +91,6 @@ def load_model(self, rank: int = None, cache_model: bool = True) -> "torch.nn.Mo model = PeftModel.from_pretrained(model, self.adapter_path) self.model = model if cache_model else None - - logger.error(self.model) - return model @property