From d494c8277385edd8d39b8ff1d1e9fcfd87c23778 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Wed, 13 Aug 2025 14:29:59 -0700 Subject: [PATCH 01/24] Initial commit --- examples/gemma3/README.md | 2 + examples/gemma3/qnn_config.json | 95 ++++++++++++++++++++++++++++++++ examples/gemma3/requirements.txt | 5 ++ 3 files changed, 102 insertions(+) create mode 100644 examples/gemma3/README.md create mode 100644 examples/gemma3/qnn_config.json create mode 100644 examples/gemma3/requirements.txt diff --git a/examples/gemma3/README.md b/examples/gemma3/README.md new file mode 100644 index 000000000..fa20478d2 --- /dev/null +++ b/examples/gemma3/README.md @@ -0,0 +1,2 @@ +# Gemma-3-4B Model Optimization + diff --git a/examples/gemma3/qnn_config.json b/examples/gemma3/qnn_config.json new file mode 100644 index 000000000..d84d5cc13 --- /dev/null +++ b/examples/gemma3/qnn_config.json @@ -0,0 +1,95 @@ +{ + "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" }, + "systems": { + "qnn_system": { + "type": "PythonEnvironment", + "python_environment_path": "/path/to/qnn/env/bin", + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "wikitext2_train_joined", + "type": "HuggingfaceContainer", + "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" }, + "pre_process_data_config": { + "strategy": "join", + "add_special_tokens": false, + "max_seq_len": 4096, + "max_samples": 128 + } + }, + { + "name": "wikitext2_train_act", + "type": "HuggingfaceContainer", + "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" }, + "pre_process_data_config": { + "strategy": "line-by-line", + "add_special_tokens": true, + "max_samples": 256, + "max_seq_len": 4096 + } + } + ], + "passes": { + "q": { "type": "QuaRot" }, + "g": { + "type": "GptqQuantizer", + "sym": true, + "group_size": -1, + "desc_act": true, + "data_config": "wikitext2_train_joined" + }, + "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, + "mb": { + "type": "ModelBuilder", + "precision": "int4", + "int4_block_size": 32, + "int4_accuracy_level": 4, + "int4_op_types_to_quantize": [ "MatMul", "Gather" ] + }, + "mq": { + "type": "MatMulNBitsToQDQ", + "use_int4": true, + "add_zero_point": true, + "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ], + "save_as_external_data": true + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { "surgeon": "RemoveRopeMultiCache" }, + { "surgeon": "AttentionMaskToSequenceLengths" }, + { "surgeon": "SimplifiedLayerNormToL2Norm" } + ], + "save_as_external_data": true + }, + "sq": { + "type": "OnnxStaticQuantization", + "data_config": "wikitext2_train_act", + "activation_type": "uint16", + "precision": "uint8", + "calibration_providers": [ "CUDAExecutionProvider" ], + "quant_preprocess": true, + "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ], + "save_as_external_data": true + }, + "sp": { "type": "SplitModel" }, + "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 }, + "cb": { + "type": "EPContextBinaryGenerator", + "provider_options": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + }, + "weight_sharing": true + }, + "cp": { "type": "ComposeOnnxModels" } + }, + "target": "qnn_system", + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it", + "cache_dir": "cache", + "no_artifacts": true +} 
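The config above chains QuaRot, GptqQuantizer, CaptureSplitInfo, ModelBuilder, MatMulNBitsToQDQ, GraphSurgeries, OnnxStaticQuantization, SplitModel, StaticLLM, EPContextBinaryGenerator, and ComposeOnnxModels into a single QNN workflow. Besides the `olive run` CLI, the same workflow can be driven from Python; a minimal sketch, assuming the JSON above is saved as `qnn_config.json` and the placeholder `python_environment_path` has been pointed at a real QNN environment:

```python
# Minimal sketch: run the QNN workflow defined in qnn_config.json from Python.
# Equivalent to `olive run --config qnn_config.json`; artifacts land in
# models/gemma-3-4b-it per the "output_dir" setting in the config.
from olive.workflows import run as olive_run

if __name__ == "__main__":
    olive_run("qnn_config.json")
```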
diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt new file mode 100644 index 000000000..c51bff135 --- /dev/null +++ b/examples/gemma3/requirements.txt @@ -0,0 +1,5 @@ +datasets +transformers +optimum +onnxruntime-gpu==1.21.1 +onnxruntime-genai-cuda==0.7.1 From ddf3ea861bfb62c5c473a146c95e4c604c6f3f17 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Wed, 13 Aug 2025 16:26:33 -0700 Subject: [PATCH 02/24] Add README and start config --- examples/gemma3/README.md | 2 -- examples/gemma3/qnn/README.md | 23 +++++++++++++++++++ examples/gemma3/qnn/env_setup.sh | 20 ++++++++++++++++ .../gemma3-4b-qnn-config.json} | 0 4 files changed, 43 insertions(+), 2 deletions(-) delete mode 100644 examples/gemma3/README.md create mode 100644 examples/gemma3/qnn/README.md create mode 100644 examples/gemma3/qnn/env_setup.sh rename examples/gemma3/{qnn_config.json => qnn/gemma3-4b-qnn-config.json} (100%) diff --git a/examples/gemma3/README.md b/examples/gemma3/README.md deleted file mode 100644 index fa20478d2..000000000 --- a/examples/gemma3/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# Gemma-3-4B Model Optimization - diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md new file mode 100644 index 000000000..edfc0ac0a --- /dev/null +++ b/examples/gemma3/qnn/README.md @@ -0,0 +1,23 @@ +# Gemma-3-4B Model Optimization + +This repository demonstrates the optimization of the [Google Gemma-3-4B](https://huggingface.co/google/gemma-3-4b-it) model using **post-training quantization (PTQ)** techniques. The optimization process utilizes an environment based heavily upon the [PTQ tutorial for Phi-3.5](https://github.com/CodeLinaro/Olive/blob/main/examples/phi3_5/README.md) + +## Automated Setup (Linux Only) + +Requirements: +* Python 3.10 +* uv + +This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the tutorial above: + +```bash +source env_setup.sh +``` + +## Optimization Process + +Run the following command in your Olive environment after completing the above setup steps: + +```bash +olive run --config gemma3-4b-qnn-config.json +``` diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh new file mode 100644 index 000000000..a51e84462 --- /dev/null +++ b/examples/gemma3/qnn/env_setup.sh @@ -0,0 +1,20 @@ + +# Installing setuptools to build Olive from source +uv pip install setuptools + +# Requires installation of uv +uv pip install -r ../requirements.txt + +# Disable CUDA extension build +export BUILD_CUDA_EXT=0 + +# Install AutoGPTQ from source +uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git + +# Install GptqModel from source +uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@5d2911a4b2a709afb0941d53c3882d0cd80b9649 + +# Install onnxruntime-qnn without installing onnxruntime +# Note: Installing both at the same time may cause conflicts +uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt +uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps diff --git a/examples/gemma3/qnn_config.json b/examples/gemma3/qnn/gemma3-4b-qnn-config.json similarity index 100% rename from examples/gemma3/qnn_config.json rename to examples/gemma3/qnn/gemma3-4b-qnn-config.json From 1f540743381cbae522d207ae2502022bacf255d2 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Wed, 13 Aug 
2025 18:17:39 -0700 Subject: [PATCH 03/24] QuaRot passing, working on GptqQuantizer --- examples/gemma3/qnn/env_setup.sh | 3 +++ examples/gemma3/qnn/gemma3-4b-qnn-config.json | 4 +-- olive/common/hf/utils.py | 4 +++ olive/common/hf/wrapper.py | 26 ++++++++++++++----- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh index a51e84462..03a3a9993 100644 --- a/examples/gemma3/qnn/env_setup.sh +++ b/examples/gemma3/qnn/env_setup.sh @@ -5,6 +5,9 @@ uv pip install setuptools # Requires installation of uv uv pip install -r ../requirements.txt +# Require installation of Olive dependencies +uv pip install -r ../../../requirements.txt + # Disable CUDA extension build export BUILD_CUDA_EXT=0 diff --git a/examples/gemma3/qnn/gemma3-4b-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-qnn-config.json index d84d5cc13..d1efe69d2 100644 --- a/examples/gemma3/qnn/gemma3-4b-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-qnn-config.json @@ -1,9 +1,9 @@ { - "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" }, + "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" , "model_attributes": {"head_dim": 256}}, "systems": { "qnn_system": { "type": "PythonEnvironment", - "python_environment_path": "/path/to/qnn/env/bin", + "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] } }, diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py index a070e85ac..9a98ff0e3 100644 --- a/olive/common/hf/utils.py +++ b/olive/common/hf/utils.py @@ -119,6 +119,10 @@ def save_model_config(config: Union["PretrainedConfig", "GenerationConfig"], out config.save_pretrained(output_dir, **kwargs) +def get_model_attributes_config(config: "PretrainedConfig", model_type: str): + return config.text_config if model_type == "gemma3" else config + + def save_module_files( config: "PretrainedConfig", model_name_or_path: str, output_dir: str, **kwargs ) -> tuple["PretrainedConfig", list[str]]: diff --git a/olive/common/hf/wrapper.py b/olive/common/hf/wrapper.py index 3946f0f3a..8bb832c21 100644 --- a/olive/common/hf/wrapper.py +++ b/olive/common/hf/wrapper.py @@ -10,6 +10,7 @@ from transformers import PretrainedConfig from olive.common.utils import find_first_matched_value, get_attr, replace_submodules, set_attr +from olive.common.hf.utils import get_model_attributes_config if TYPE_CHECKING: from transformers import PreTrainedModel @@ -195,6 +196,7 @@ class ModelWrapper: "default": ["model.embed_tokens"], "bloom": ["transformer.word_embeddings", "transformer.word_embeddings_layernorm"], "falcon": ["transformer.word_embeddings"], + "gemma3": ["model.language_model.embed_tokens"], "gpt2": ["transformer.wte", "transformer.wpe"], "gpt_neox": ["gpt_neox.embed_in"], "gptj": ["transformer.wte"], @@ -209,11 +211,17 @@ class ModelWrapper: "qwen": "transformer.rotary_emb", } LM_HEAD = {"default": "lm_head"} - PRE_HEAD_LAYERNORM = {"default": "model.norm", "gpt2": "transformer.ln_f", "qwen": "transformer.ln_f"} + PRE_HEAD_LAYERNORM = { + "default": "model.norm", + "gemma3": "model.language_model.norm", + "gpt2": "transformer.ln_f", + "qwen": "transformer.ln_f" + } LAYERS = { "default": "model.layers", "bloom": "transformer.h", "falcon": "transformer.h", + "gemma3": "model.language_model.layers", "gpt2": "transformer.h", "gpt_neox": "gpt_neox.layers", "gptj": "transformer.h", @@ -225,17 +233,20 @@ def __init__(self, config: 
Union[PretrainedConfig, dict]): self.config = config if isinstance(config, PretrainedConfig) else PretrainedConfig.from_dict(config) self.model_type = find_first_matched_value(self.config, "model_type") + logger.error(self.config) + # model attributes - self.hidden_size = find_first_matched_value(self.config, self.HIDDEN_SIZE_NAMES) - self.num_attention_heads = find_first_matched_value(self.config, self.NUM_ATTENTION_HEADS_NAMES) + model_attributes_config = get_model_attributes_config(self.config, self.model_type) + self.hidden_size = find_first_matched_value(model_attributes_config, self.HIDDEN_SIZE_NAMES) + self.num_attention_heads = find_first_matched_value(model_attributes_config, self.NUM_ATTENTION_HEADS_NAMES) self.num_key_value_heads = ( - find_first_matched_value(self.config, self.NUM_KEY_VALUE_HEADS_NAMES) or self.num_attention_heads + find_first_matched_value(model_attributes_config, self.NUM_KEY_VALUE_HEADS_NAMES) or self.num_attention_heads ) self.head_dim = ( - find_first_matched_value(self.config, self.HEAD_DIM_NAMES) or self.hidden_size // self.num_attention_heads + find_first_matched_value(model_attributes_config, self.HEAD_DIM_NAMES) or self.hidden_size // self.num_attention_heads ) - self.num_hidden_layers = find_first_matched_value(self.config, self.NUM_HIDDEN_LAYER_NAMES) - self.max_length = find_first_matched_value(self.config, self.MAX_LENGTH) + self.num_hidden_layers = find_first_matched_value(model_attributes_config, self.NUM_HIDDEN_LAYER_NAMES) + self.max_length = find_first_matched_value(model_attributes_config, self.MAX_LENGTH) self._model = None self._layer_wrappers = None @@ -266,6 +277,7 @@ def get_pre_head_layernorm(self, return_name: bool = True): return get_submodules(self.model, self.PRE_HEAD_LAYERNORM, self.model_type, return_name=return_name) def get_layers(self, return_name: bool = True): + logger.error(self.model) return get_submodules(self.model, self.LAYERS, self.model_type, return_name=return_name) def get_layer_wrappers(self): From 6cae95ffb03a3a4a2f17a1a08497e4a696c7e1f9 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Thu, 14 Aug 2025 18:37:24 -0700 Subject: [PATCH 04/24] Work on dataset integration --- examples/gemma3/qnn/gemma3-4b-qnn-config.json | 33 +--- examples/gemma3/qnn/user_script.py | 168 ++++++++++++++++++ 2 files changed, 177 insertions(+), 24 deletions(-) create mode 100644 examples/gemma3/qnn/user_script.py diff --git a/examples/gemma3/qnn/gemma3-4b-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-qnn-config.json index d1efe69d2..39c2b40ea 100644 --- a/examples/gemma3/qnn/gemma3-4b-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-qnn-config.json @@ -9,36 +9,21 @@ }, "data_configs": [ { - "name": "wikitext2_train_joined", - "type": "HuggingfaceContainer", - "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" }, - "pre_process_data_config": { - "strategy": "join", - "add_special_tokens": false, - "max_seq_len": 4096, - "max_samples": 128 - } - }, - { - "name": "wikitext2_train_act", - "type": "HuggingfaceContainer", - "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" }, - "pre_process_data_config": { - "strategy": "line-by-line", - "add_special_tokens": true, - "max_samples": 256, - "max_seq_len": 4096 - } + "name": "gemma_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "gemma_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { "q": { "type": "QuaRot" }, "g": { - "type": "GptqQuantizer", + 
"type": "GptqModel", + "bits": 4, "sym": true, "group_size": -1, - "desc_act": true, - "data_config": "wikitext2_train_joined" + "lm_head": false, + "device": "cuda", + "data_config": "gemma_data_config" }, "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, "mb": { @@ -66,7 +51,7 @@ }, "sq": { "type": "OnnxStaticQuantization", - "data_config": "wikitext2_train_act", + "data_config": "gemma_data_config", "activation_type": "uint16", "precision": "uint8", "calibration_providers": [ "CUDAExecutionProvider" ], diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py new file mode 100644 index 000000000..4fc8e8568 --- /dev/null +++ b/examples/gemma3/qnn/user_script.py @@ -0,0 +1,168 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +import logging +import numpy as np +import os +import torch + +from huggingface_hub import hf_hub_download +from typing import Optional + +from transformers import pipeline +import requests +from PIL import Image + +from transformers import AutoProcessor, LlavaForConditionalGeneration +from transformers import AutoProcessor, Gemma3ForConditionalGeneration +from transformers import AutoConfig, AutoTokenizer +from itertools import chain +from torch.utils.data import DataLoader, Dataset +from datasets import IterableDataset, load_dataset +from transformers import default_data_collator +from PIL import Image as PILImage +from transformers.feature_extraction_utils import BatchFeature +from transformers.image_utils import make_nested_list_of_images + +import numpy as np +from datasets import load_dataset +from torch.utils.data import Dataset +from transformers import CLIPProcessor + +from olive.data.registry import Registry +from torch.utils.data import DataLoader + +logger = logging.getLogger(__name__) + + + +def get_gemma3_dataset(tokenzier, processor, data_files, dataset_path, cache_dir): + def _map1(example): + example['text'] = [_convert_one_conversation(conversation=conversation) for conversation in + example['conversations']] + return example + + def _map2(example): + image = PILImage.open(fp=os.path.join(dataset_path, example["image"])) + example['image_mode'] = image.mode + return example + + def _load_image_and_tokenize(example): + # try: + #print(example['text']) + inputs = processor.apply_chat_template(example['text'][0], + add_generation_prompt=True, tokenize=True, + return_tensors="pt", return_dict=True) + # print("image=", example["image"][0]) + inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} + # inputs.update({"pixel_values": torch.tensor(processor(text="", images=PILImage.open(fp=os.path.join(dataset_path, example["image"][0]))).pixel_values).unsqueeze(0)}) + #print(inputs.keys()) + inputs["input_ids"] = inputs["input_ids"][0] + #print(inputs["input_ids"]) + return inputs + + # except Exception as e: + # print(f"Skipping example due to error: {e}") + # return None + + + dataset = load_dataset("json", data_files=data_files, cache_dir=cache_dir, split='train') + + dataset = dataset.map(_map1) + dataset = dataset.map(_map2) + + dataset = dataset.filter(lambda x: x["image_mode"] == 'RGB') + + return dataset.with_transform(_load_image_and_tokenize) + +class GemmaDataset: + + CACHE_DIR = os.getenv("CACHE_DIR", ".cache") + + def __init__(self, model_id: str, first_n: Optional[int] = 
None): + self.model_id = model_id + self.first_n = first_n + + self.processor = AutoProcessor.from_pretrained(self.model_id) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=None, use_fast=True, trust_remote_code=True) + + self.setup_dataset() + + def setup_dataset(self): + # Uses a LlaVA dataset and transforms it to something Gemma-compatible + + # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K + file_path = hf_hub_download(repo_id="liuhaotian/LLaVA-Instruct-150K", filename="llava_instruct_80k.json", repo_type="dataset") + + + + logger.error(file_path) + logger.error(image_file_path) + self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") + self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) + logger.error(self.raw_datasets) + + # Convert the Llava-style conversation to Gemma-style conversation + self.raw_datasets = self.raw_datasets.map(self._convert_llava_to_gemma_conversation) + for row in self.raw_datasets: + print(row) + + def get_train_dataset(self, first_n: Optional[int] = None): + self.train_dataset = self.raw_datasets if first_n is None else self.raw_datasets[:first_n] + return self.train_dataset + + @staticmethod + def _convert_llava_to_gemma_conversation(entry: dict[str, any]): + entry['text'] = [GemmaDataset._convert_single_llava_to_gemma_conversation(conversation) for conversation in entry["conversations"]] + del entry['conversations'] + return entry + + @staticmethod + def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str]]) -> dict[str, str | list[dict]]: + """Convert a single llava-style conversation entry to Gemma-style. + + Examples: + + >>> conversation = {"from": "human", "value": "What are the colors of the bus in the image?"} + >>> _convert_llava_to_gemma_conversation(conversation) + { + 'role': 'user', + 'content': [{'type': 'image'}, {'type': 'text', 'text': 'What are the colors of the bus in the image?'}] + } + >>> conversation = {"from": "gpt", "value": "The bus in the image is white and red."} + >>> _convert_llava_to_gemma_conversation(conversation) + { + 'role': 'assistant', + 'content': [{'type': 'text', 'text': 'The bus in the image is white and red.'}] + } + """ + who = conversation.get("from") + match who: + case "human": + role = "user" + case "gpt": + role = "assistant" + case _: + raise ValueError(f"Unknown role: {who}") + + text = conversation.get("value") + + if "" in text: + has_image = True + text = text.replace("", "") + else: + has_image = False + + return { + "role": role, + "content": ( + [{"type": "image"}, {"type": "text", "text": text}] if has_image else [{"type": "text", "text": text}] + ), + } + + +@Registry.register_dataset() +def gemma_dataset(model_id: str): + return GemmaDataset(model_id, first_n=5).get_train_dataset() From 2d0872e7b327ba24fb1a45ea9d95666dbe3ffacb Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Fri, 15 Aug 2025 14:54:48 -0700 Subject: [PATCH 05/24] Data processing works --- examples/gemma3/qnn/user_script.py | 110 +++++++++++++++++++++++++---- olive/passes/pytorch/gptqmodel.py | 14 +++- 2 files changed, 109 insertions(+), 15 deletions(-) diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 4fc8e8568..1c10e08c0 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -6,6 +6,8 @@ import logging import numpy as np import os +import subprocess +import zipfile 
import torch from huggingface_hub import hf_hub_download @@ -86,32 +88,88 @@ def __init__(self, model_id: str, first_n: Optional[int] = None): self.first_n = first_n self.processor = AutoProcessor.from_pretrained(self.model_id) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=None, use_fast=True, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True) self.setup_dataset() + def _download_and_extract_images(self): + """ + Downloads the coco train2017 image dataset and extracts them to the cache directory + """ + zip_filename = "train2017.zip" + zip_path = os.path.join(self.CACHE_DIR, zip_filename) + extract_path = os.path.join(self.CACHE_DIR, "train2017") + + # Create cache directory if it doesn't exist + os.makedirs(self.CACHE_DIR, exist_ok=True) + + # Check if images are already downloaded and extracted + if os.path.exists(extract_path) and os.listdir(extract_path): + logger.info(f"Images already exist at {extract_path}") + return extract_path + + # Download the dataset if zip doesn't exist + if not os.path.exists(zip_path): + logger.info(f"Downloading COCO train2017 dataset to {zip_path}") + try: + subprocess.run([ + "wget", + "https://images.cocodataset.org/zips/train2017.zip", + "--no-check-certificate", + "-O", zip_path + ], check=True, cwd=self.CACHE_DIR) + logger.info("Download completed successfully") + except subprocess.CalledProcessError as e: + logger.error(f"Failed to download dataset: {e}") + raise + except FileNotFoundError: + logger.error("wget command not found. Please install wget or use an alternative download method.") + raise + + # Extract the zip file + logger.info(f"Extracting {zip_path} to {self.CACHE_DIR}") + try: + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(self.CACHE_DIR) + logger.info("Extraction completed successfully") + except zipfile.BadZipFile as e: + logger.error(f"Failed to extract zip file: {e}") + # Remove corrupted zip file so it can be re-downloaded + if os.path.exists(zip_path): + os.remove(zip_path) + raise + + return extract_path + def setup_dataset(self): # Uses a LlaVA dataset and transforms it to something Gemma-compatible # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K - file_path = hf_hub_download(repo_id="liuhaotian/LLaVA-Instruct-150K", filename="llava_instruct_80k.json", repo_type="dataset") - - + file_path = hf_hub_download(repo_id="liuhaotian/LLaVA-Instruct-150K", filename="llava_instruct_80k.json", repo_type="dataset", cache_dir=self.CACHE_DIR) - logger.error(file_path) - logger.error(image_file_path) + self.image_data_path = self._download_and_extract_images() self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") + + # Limit data processing to the first_n rows self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) - logger.error(self.raw_datasets) # Convert the Llava-style conversation to Gemma-style conversation self.raw_datasets = self.raw_datasets.map(self._convert_llava_to_gemma_conversation) - for row in self.raw_datasets: - print(row) - def get_train_dataset(self, first_n: Optional[int] = None): - self.train_dataset = self.raw_datasets if first_n is None else self.raw_datasets[:first_n] - return self.train_dataset + # Extract image details using a lambda to pass the dataset_path + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) 
+ + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == 'RGB') + + # Loads the images and tokenizes the text + self.raw_datasets = self.raw_datasets.with_transform(self._load_image_and_tokenize) + + for entry in self.raw_datasets: + logger.error(entry) + + def get_train_dataset(self): + return self.raw_datasets @staticmethod def _convert_llava_to_gemma_conversation(entry: dict[str, any]): @@ -162,7 +220,33 @@ def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str ), } + def _extract_image_details(self, entry: dict[str, any]): + """ + Extract image details from the dataset example. + Opens the image file and adds image mode information to the example. + """ + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) + entry['image_mode'] = image.mode + return entry + + def _load_image_and_tokenize(self, entry: dict[str, any]): + """ + Load image and tokenize the conversation for model input. + + Args: + entry: Dataset entry containing text conversation and image path + + Returns: + Tokenized inputs ready for model processing + """ + inputs = self.processor.apply_chat_template(entry['text'][0], + add_generation_prompt=True, tokenize=True, + return_tensors="pt", return_dict=True) + inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} + inputs["input_ids"] = inputs["input_ids"][0] + return inputs + @Registry.register_dataset() def gemma_dataset(model_id: str): - return GemmaDataset(model_id, first_n=5).get_train_dataset() + return GemmaDataset(model_id, first_n=200).get_train_dataset() diff --git a/olive/passes/pytorch/gptqmodel.py b/olive/passes/pytorch/gptqmodel.py index cb54385f3..eeedf8f62 100644 --- a/olive/passes/pytorch/gptqmodel.py +++ b/olive/passes/pytorch/gptqmodel.py @@ -189,8 +189,18 @@ def get_dataset( raise ValueError("Data config is required for PyTorch model.") data_config = validate_config(data_config, DataConfig) dataloader = data_config.to_data_container().create_dataloader() - # each batch consists of (input_data, labels) - dataset = [data[0] for data in dataloader] + # each batch consists of (input_data, labels) or just input_data + dataset = [] + for data in dataloader: + if isinstance(data, (tuple, list)) and len(data) > 0: + # Standard format: (input_data, labels) + dataset.append(data[0]) + elif isinstance(data, dict): + # Data is already in the expected dictionary format + dataset.append(data) + else: + # Data is the input data directly + dataset.append(data) if ( not dataset or not isinstance(dataset, list) From 6a6f67dd3afda4903b4654d22c7ded23ad5ece97 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Fri, 15 Aug 2025 15:12:22 -0700 Subject: [PATCH 06/24] Fix lint issues and cleanup --- examples/gemma3/qnn/gemma3-4b-qnn-config.json | 2 +- examples/gemma3/qnn/user_script.py | 205 +++++++----------- examples/gemma3/requirements.txt | 6 +- olive/common/hf/wrapper.py | 14 +- 4 files changed, 93 insertions(+), 134 deletions(-) diff --git a/examples/gemma3/qnn/gemma3-4b-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-qnn-config.json index 39c2b40ea..71986d135 100644 --- a/examples/gemma3/qnn/gemma3-4b-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-qnn-config.json @@ -1,5 +1,5 @@ { - "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" , "model_attributes": {"head_dim": 256}}, + "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it", "model_attributes": { "head_dim": 256 } }, "systems": { "qnn_system": { "type": 
"PythonEnvironment", diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 1c10e08c0..4c62fa735 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -4,185 +4,141 @@ # -------------------------------------------------------------------------- import logging -import numpy as np import os import subprocess import zipfile -import torch - -from huggingface_hub import hf_hub_download +from pathlib import Path from typing import Optional -from transformers import pipeline -import requests -from PIL import Image - -from transformers import AutoProcessor, LlavaForConditionalGeneration -from transformers import AutoProcessor, Gemma3ForConditionalGeneration -from transformers import AutoConfig, AutoTokenizer -from itertools import chain -from torch.utils.data import DataLoader, Dataset -from datasets import IterableDataset, load_dataset -from transformers import default_data_collator -from PIL import Image as PILImage -from transformers.feature_extraction_utils import BatchFeature -from transformers.image_utils import make_nested_list_of_images - -import numpy as np from datasets import load_dataset -from torch.utils.data import Dataset -from transformers import CLIPProcessor +from huggingface_hub import hf_hub_download +from PIL import Image as PILImage +from transformers import ( + AutoProcessor, + AutoTokenizer, +) from olive.data.registry import Registry -from torch.utils.data import DataLoader logger = logging.getLogger(__name__) - -def get_gemma3_dataset(tokenzier, processor, data_files, dataset_path, cache_dir): - def _map1(example): - example['text'] = [_convert_one_conversation(conversation=conversation) for conversation in - example['conversations']] - return example - - def _map2(example): - image = PILImage.open(fp=os.path.join(dataset_path, example["image"])) - example['image_mode'] = image.mode - return example - - def _load_image_and_tokenize(example): - # try: - #print(example['text']) - inputs = processor.apply_chat_template(example['text'][0], - add_generation_prompt=True, tokenize=True, - return_tensors="pt", return_dict=True) - # print("image=", example["image"][0]) - inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} - # inputs.update({"pixel_values": torch.tensor(processor(text="", images=PILImage.open(fp=os.path.join(dataset_path, example["image"][0]))).pixel_values).unsqueeze(0)}) - #print(inputs.keys()) - inputs["input_ids"] = inputs["input_ids"][0] - #print(inputs["input_ids"]) - return inputs - - # except Exception as e: - # print(f"Skipping example due to error: {e}") - # return None - - - dataset = load_dataset("json", data_files=data_files, cache_dir=cache_dir, split='train') - - dataset = dataset.map(_map1) - dataset = dataset.map(_map2) - - dataset = dataset.filter(lambda x: x["image_mode"] == 'RGB') - - return dataset.with_transform(_load_image_and_tokenize) - class GemmaDataset: - CACHE_DIR = os.getenv("CACHE_DIR", ".cache") - + def __init__(self, model_id: str, first_n: Optional[int] = None): self.model_id = model_id self.first_n = first_n - + self.processor = AutoProcessor.from_pretrained(self.model_id) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) self.setup_dataset() def _download_and_extract_images(self): - """ - Downloads the coco train2017 image dataset and extracts 
them to the cache directory - """ + """Download the COCO train2017 image dataset and extract to the cache directory.""" zip_filename = "train2017.zip" zip_path = os.path.join(self.CACHE_DIR, zip_filename) extract_path = os.path.join(self.CACHE_DIR, "train2017") - + # Create cache directory if it doesn't exist os.makedirs(self.CACHE_DIR, exist_ok=True) - + # Check if images are already downloaded and extracted - if os.path.exists(extract_path) and os.listdir(extract_path): - logger.info(f"Images already exist at {extract_path}") + extract_path_obj = Path(extract_path) + if extract_path_obj.exists() and any(extract_path_obj.iterdir()): + logger.info("Images already exist at %s", extract_path) return extract_path - + # Download the dataset if zip doesn't exist if not os.path.exists(zip_path): - logger.info(f"Downloading COCO train2017 dataset to {zip_path}") + logger.info("Downloading COCO train2017 dataset to %s", zip_path) try: - subprocess.run([ - "wget", - "https://images.cocodataset.org/zips/train2017.zip", - "--no-check-certificate", - "-O", zip_path - ], check=True, cwd=self.CACHE_DIR) + subprocess.run( + [ + "wget", + "https://images.cocodataset.org/zips/train2017.zip", + "--no-check-certificate", + "-O", + zip_path, + ], + check=True, + cwd=self.CACHE_DIR, + ) logger.info("Download completed successfully") - except subprocess.CalledProcessError as e: - logger.error(f"Failed to download dataset: {e}") + except subprocess.CalledProcessError: + logger.exception("Failed to download dataset") raise except FileNotFoundError: - logger.error("wget command not found. Please install wget or use an alternative download method.") + logger.exception("wget command not found. Please install wget or use an alternative download method.") raise - + # Extract the zip file - logger.info(f"Extracting {zip_path} to {self.CACHE_DIR}") + logger.info("Extracting %s to %s", zip_path, self.CACHE_DIR) try: - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(self.CACHE_DIR) logger.info("Extraction completed successfully") - except zipfile.BadZipFile as e: - logger.error(f"Failed to extract zip file: {e}") + except zipfile.BadZipFile: + logger.exception("Failed to extract zip file") # Remove corrupted zip file so it can be re-downloaded if os.path.exists(zip_path): os.remove(zip_path) raise - + return extract_path def setup_dataset(self): - # Uses a LlaVA dataset and transforms it to something Gemma-compatible + # Uses a LlaVA dataset and transforms it to something Gemma-compatible - # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K - file_path = hf_hub_download(repo_id="liuhaotian/LLaVA-Instruct-150K", filename="llava_instruct_80k.json", repo_type="dataset", cache_dir=self.CACHE_DIR) + # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K + file_path = hf_hub_download( + repo_id="liuhaotian/LLaVA-Instruct-150K", + filename="llava_instruct_80k.json", + repo_type="dataset", + cache_dir=self.CACHE_DIR, + ) - self.image_data_path = self._download_and_extract_images() - self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") + self.image_data_path = self._download_and_extract_images() + self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") - # Limit data processing to the first_n rows - self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) + # 
Limit data processing to the first_n rows + self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) - # Convert the Llava-style conversation to Gemma-style conversation - self.raw_datasets = self.raw_datasets.map(self._convert_llava_to_gemma_conversation) + # Convert the Llava-style conversation to Gemma-style conversation + self.raw_datasets = self.raw_datasets.map(self._convert_llava_to_gemma_conversation) - # Extract image details using a lambda to pass the dataset_path - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + # Extract image details using a lambda to pass the dataset_path + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == 'RGB') + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - # Loads the images and tokenizes the text - self.raw_datasets = self.raw_datasets.with_transform(self._load_image_and_tokenize) + # Loads the images and tokenizes the text + self.raw_datasets = self.raw_datasets.with_transform(self._load_image_and_tokenize) - for entry in self.raw_datasets: - logger.error(entry) + for entry in self.raw_datasets: + logger.error(entry) def get_train_dataset(self): return self.raw_datasets - + @staticmethod def _convert_llava_to_gemma_conversation(entry: dict[str, any]): - entry['text'] = [GemmaDataset._convert_single_llava_to_gemma_conversation(conversation) for conversation in entry["conversations"]] - del entry['conversations'] + entry["text"] = [ + GemmaDataset._convert_single_llava_to_gemma_conversation(conversation) + for conversation in entry["conversations"] + ] + del entry["conversations"] return entry - + @staticmethod def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str]]) -> dict[str, str | list[dict]]: """Convert a single llava-style conversation entry to Gemma-style. Examples: - >>> conversation = {"from": "human", "value": "What are the colors of the bus in the image?"} >>> _convert_llava_to_gemma_conversation(conversation) { @@ -195,6 +151,7 @@ def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The bus in the image is white and red.'}] } + """ who = conversation.get("from") match who: @@ -219,29 +176,29 @@ def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str [{"type": "image"}, {"type": "text", "text": text}] if has_image else [{"type": "text", "text": text}] ), } - + def _extract_image_details(self, entry: dict[str, any]): - """ - Extract image details from the dataset example. + """Extract image details from the dataset example. + Opens the image file and adds image mode information to the example. """ image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) - entry['image_mode'] = image.mode + entry["image_mode"] = image.mode return entry def _load_image_and_tokenize(self, entry: dict[str, any]): - """ - Load image and tokenize the conversation for model input. - + """Load image and tokenize the conversation for model input. 
+ Args: entry: Dataset entry containing text conversation and image path - + Returns: Tokenized inputs ready for model processing + """ - inputs = self.processor.apply_chat_template(entry['text'][0], - add_generation_prompt=True, tokenize=True, - return_tensors="pt", return_dict=True) + inputs = self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} inputs["input_ids"] = inputs["input_ids"][0] return inputs diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt index c51bff135..0b56b7908 100644 --- a/examples/gemma3/requirements.txt +++ b/examples/gemma3/requirements.txt @@ -1,5 +1,5 @@ datasets -transformers -optimum -onnxruntime-gpu==1.21.1 onnxruntime-genai-cuda==0.7.1 +onnxruntime-gpu==1.21.1 +optimum +transformers diff --git a/olive/common/hf/wrapper.py b/olive/common/hf/wrapper.py index 8bb832c21..6877d7720 100644 --- a/olive/common/hf/wrapper.py +++ b/olive/common/hf/wrapper.py @@ -9,8 +9,8 @@ from torch import nn from transformers import PretrainedConfig -from olive.common.utils import find_first_matched_value, get_attr, replace_submodules, set_attr from olive.common.hf.utils import get_model_attributes_config +from olive.common.utils import find_first_matched_value, get_attr, replace_submodules, set_attr if TYPE_CHECKING: from transformers import PreTrainedModel @@ -213,9 +213,9 @@ class ModelWrapper: LM_HEAD = {"default": "lm_head"} PRE_HEAD_LAYERNORM = { "default": "model.norm", - "gemma3": "model.language_model.norm", - "gpt2": "transformer.ln_f", - "qwen": "transformer.ln_f" + "gemma3": "model.language_model.norm", + "gpt2": "transformer.ln_f", + "qwen": "transformer.ln_f", } LAYERS = { "default": "model.layers", @@ -240,10 +240,12 @@ def __init__(self, config: Union[PretrainedConfig, dict]): self.hidden_size = find_first_matched_value(model_attributes_config, self.HIDDEN_SIZE_NAMES) self.num_attention_heads = find_first_matched_value(model_attributes_config, self.NUM_ATTENTION_HEADS_NAMES) self.num_key_value_heads = ( - find_first_matched_value(model_attributes_config, self.NUM_KEY_VALUE_HEADS_NAMES) or self.num_attention_heads + find_first_matched_value(model_attributes_config, self.NUM_KEY_VALUE_HEADS_NAMES) + or self.num_attention_heads ) self.head_dim = ( - find_first_matched_value(model_attributes_config, self.HEAD_DIM_NAMES) or self.hidden_size // self.num_attention_heads + find_first_matched_value(model_attributes_config, self.HEAD_DIM_NAMES) + or self.hidden_size // self.num_attention_heads ) self.num_hidden_layers = find_first_matched_value(model_attributes_config, self.NUM_HIDDEN_LAYER_NAMES) self.max_length = find_first_matched_value(model_attributes_config, self.MAX_LENGTH) From cd24ddf938cde13b922e038515fb50568fbc1901 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Mon, 18 Aug 2025 13:43:56 -0700 Subject: [PATCH 07/24] Adding vision resources --- examples/gemma3/qnn/README.md | 12 +++-- ...ig.json => gemma3-4b-text-qnn-config.json} | 12 ++--- .../qnn/gemma3-4b-vision-qnn-config.json | 45 +++++++++++++++++++ examples/gemma3/qnn/user_script.py | 11 ++++- 4 files changed, 70 insertions(+), 10 deletions(-) rename examples/gemma3/qnn/{gemma3-4b-qnn-config.json => gemma3-4b-text-qnn-config.json} (88%) create mode 100644 examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md index edfc0ac0a..b7ff54fb4 100644 --- 
a/examples/gemma3/qnn/README.md +++ b/examples/gemma3/qnn/README.md @@ -6,7 +6,7 @@ This repository demonstrates the optimization of the [Google Gemma-3-4B](https:/ Requirements: * Python 3.10 -* uv +* uv - Used throughout the setup scripts, please follow the [publically available installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the tutorial above: @@ -16,8 +16,14 @@ source env_setup.sh ## Optimization Process -Run the following command in your Olive environment after completing the above setup steps: +Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work in concert at the onnxruntime-genai stage. + +Thus, the following commands should be used to separately produce context binaries for the text and vision portions of the model, respectively. + +```bash +olive run --config gemma3-4b-text-qnn-config.json +``` ```bash -olive run --config gemma3-4b-qnn-config.json +olive run --config gemma3-4b-vision-qnn-config.json ``` diff --git a/examples/gemma3/qnn/gemma3-4b-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json similarity index 88% rename from examples/gemma3/qnn/gemma3-4b-qnn-config.json rename to examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 71986d135..d2eff5678 100644 --- a/examples/gemma3/qnn/gemma3-4b-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -1,5 +1,5 @@ { - "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it", "model_attributes": { "head_dim": 256 } }, + "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" }, "systems": { "qnn_system": { "type": "PythonEnvironment", @@ -9,9 +9,9 @@ }, "data_configs": [ { - "name": "gemma_data_config", + "name": "gemma_text_data_config", "user_script": "user_script.py", - "load_dataset_config": { "type": "gemma_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { "type": "gemma_text_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { @@ -23,7 +23,7 @@ "group_size": -1, "lm_head": false, "device": "cuda", - "data_config": "gemma_data_config" + "data_config": "gemma_text_data_config" }, "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, "mb": { @@ -51,7 +51,7 @@ }, "sq": { "type": "OnnxStaticQuantization", - "data_config": "gemma_data_config", + "data_config": "gemma_text_data_config", "activation_type": "uint16", "precision": "uint8", "calibration_providers": [ "CUDAExecutionProvider" ], @@ -74,7 +74,7 @@ }, "target": "qnn_system", "log_severity_level": 1, - "output_dir": "models/gemma-3-4b-it", + "output_dir": "models/gemma-3-4b-it-text", "cache_dir": "cache", "no_artifacts": true } diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json new file mode 100644 index 000000000..803000d36 --- /dev/null +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -0,0 +1,45 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google/gemma-3-4b-it", + "io_config": { + "input_names": [ "input_ids", "pixel_values", "attention_mask" ], + "input_shapes": [ [ 10, 77 ], [ 1, 3, 224, 224 ], [ 10, 77 ] ], + "input_types": [ "int64", "float32", "int64" ], + "output_names": [ 
"logits_per_image" ], + "output_shapes": [ [ 1, 2 ] ] + } + }, + "systems": { + "qnn_system": { + "type": "PythonEnvironment", + "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "gemma_vision_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "gemma_vision_dataset", "model_id": "google/gemma-3-4b-it" } + } + ], + "passes": { + "conversion": { "type": "OnnxConversion", "target_opset": 17 }, + "quantization": { + "type": "OnnxStaticQuantization", + "quant_preprocess": true, + "data_config": "gemma_vision_data_config", + "op_types_to_quantize": [ "MatMul", "LayerNormalization", "Gemm", "Sigmoid", "Gelu" ], + "activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax" + }, + "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } + }, + "target": "qnn_system", + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it-vision", + "cache_dir": "cache", + "no_artifacts": true +} diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 4c62fa735..816db2e12 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -203,7 +203,16 @@ def _load_image_and_tokenize(self, entry: dict[str, any]): inputs["input_ids"] = inputs["input_ids"][0] return inputs +SHORTCUT_FIRST_N = 256 @Registry.register_dataset() def gemma_dataset(model_id: str): - return GemmaDataset(model_id, first_n=200).get_train_dataset() + return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N).get_train_dataset() + +@Registry.register_dataset() +def gemma_text_dataset(model_id: str): + return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter='text').get_train_dataset + +@Registry.register_dataset() +def gemma_vision_dataset(model_id: str): + return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter='images').get_train_dataset() From 636e982f77e478d4dcd592b86d19cfc634b52bec Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Mon, 18 Aug 2025 18:22:19 -0700 Subject: [PATCH 08/24] Add Gemma3 vision configurations --- .../gemma3/qnn/custom_gemma3_4b_it_vision.py | 18 ++++++++++++++++++ .../qnn/gemma3-4b-vision-qnn-config.json | 16 +++++++++------- olive/model/handler/hf.py | 2 ++ olive/passes/onnx/conversion.py | 14 +++++++------- 4 files changed, 36 insertions(+), 14 deletions(-) create mode 100644 examples/gemma3/qnn/custom_gemma3_4b_it_vision.py diff --git a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py new file mode 100644 index 000000000..686de4395 --- /dev/null +++ b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py @@ -0,0 +1,18 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + + +import torch +from transformers import AutoModel + +def load_gemma3_model(model_path): + return AutoModel.from_pretrained("google/gemma-3-4b-it") + +def get_dummy_inputs(model_handler): + return { + "input_ids": torch.full((1, 256), 262144, dtype=torch.long), # Image token ID + "pixel_values": torch.randn(1, 3, 896, 896, dtype=torch.float32), + "attention_mask": torch.ones((1, 256), dtype=torch.long) + } \ No newline at end of file diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index 803000d36..f09a76885 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -1,13 +1,15 @@ { "input_model": { - "type": "HfModel", - "model_path": "google/gemma-3-4b-it", + "type": "PyTorchModel", + "model_script": "custom_gemma3_4b_it_vision.py", + "model_loader": "load_gemma3_model", + "dummy_inputs_func": "get_dummy_inputs", "io_config": { - "input_names": [ "input_ids", "pixel_values", "attention_mask" ], - "input_shapes": [ [ 10, 77 ], [ 1, 3, 224, 224 ], [ 10, 77 ] ], - "input_types": [ "int64", "float32", "int64" ], - "output_names": [ "logits_per_image" ], - "output_shapes": [ [ 1, 2 ] ] + "input_names": ["input_ids", "pixel_values", "attention_mask"], + "input_shapes": [[1, 256], [1, 3, 896, 896], [1, 256]], + "input_types": ["int64", "float32", "int64"], + "output_names": ["last_hidden_state"], + "output_shapes": [[1, 256, 2560]] } }, "systems": { diff --git a/olive/model/handler/hf.py b/olive/model/handler/hf.py index a56a1aab6..bf46d7417 100644 --- a/olive/model/handler/hf.py +++ b/olive/model/handler/hf.py @@ -82,6 +82,8 @@ def load_model(self, rank: int = None, cache_model: bool = True) -> "torch.nn.Mo self.model = model if cache_model else None + logger.error(self.model) + return model @property diff --git a/olive/passes/onnx/conversion.py b/olive/passes/onnx/conversion.py index 9af4c0187..e9ff76799 100644 --- a/olive/passes/onnx/conversion.py +++ b/olive/passes/onnx/conversion.py @@ -49,7 +49,7 @@ def forward(self, *input_data, **input_dict): return self.model(*input_data, **input_dict) -class OnnxConversion(Pass): +class (Pass): """Convert a PyTorch model to ONNX model using torch.onnx.export on CPU.""" @classmethod @@ -212,7 +212,7 @@ def _export_pytorch_model( pytorch_model = pytorch_model.to(torch_dtype) # Apply any necessary patches - OnnxConversion._patch_model_if_necessary(pytorch_model) + ._patch_model_if_necessary(pytorch_model) # get input and output names, and dynamic axes assert io_config is not None, "Cannot get io_config for the model." 
@@ -502,7 +502,7 @@ def _convert_model_on_device( dummy_inputs = self._get_dummy_inputs(model, config) io_config = model.io_config - converted_onnx_model = OnnxConversion._export_pytorch_model( + converted_onnx_model = ._export_pytorch_model( pytorch_model, dummy_inputs, io_config, config, device, torch_dtype, tempfile.tempdir ) @@ -570,11 +570,11 @@ def _export_ranked_model(params): input_model = DistributedHfModelHandler(**model_config) olive_pytorch_model = input_model.load_model(local_rank) - dummy_inputs = OnnxConversion._get_dummy_inputs(olive_pytorch_model, pass_config) + dummy_inputs = ._get_dummy_inputs(olive_pytorch_model, pass_config) io_config = None if pass_config.use_dynamo_exporter else olive_pytorch_model.io_config pytorch_model = olive_pytorch_model.prepare_session(rank=local_rank) - ranked_onnx_modelproto = OnnxConversion._export_pytorch_model( + ranked_onnx_modelproto = ._export_pytorch_model( pytorch_model, dummy_inputs, io_config, @@ -621,11 +621,11 @@ def _convert_distributed_model_on_device( max_parallel_jobs = min(world_size, config.parallel_jobs or multiprocessing.cpu_count()) if max_parallel_jobs <= 1: - results = [OnnxConversion._export_ranked_model(_) for _ in params] + results = [._export_ranked_model(_) for _ in params] else: context = multiprocessing.get_context("spawn") with context.Pool(processes=max_parallel_jobs) as pool: - results = pool.map(OnnxConversion._export_ranked_model, params) + results = pool.map(._export_ranked_model, params) if world_size != sum(results): raise RuntimeError("Failed to convert models") From b4ea7a3509e18d7723deea87207edf5ca218b412 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Mon, 18 Aug 2025 18:27:35 -0700 Subject: [PATCH 09/24] Fix linting error --- examples/gemma3/qnn/custom_gemma3_4b_it_vision.py | 6 ++++-- .../gemma3/qnn/gemma3-4b-vision-qnn-config.json | 14 +++++++------- examples/gemma3/qnn/user_script.py | 8 ++++++-- olive/passes/onnx/conversion.py | 14 +++++++------- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py index 686de4395..a969adecb 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py @@ -7,12 +7,14 @@ import torch from transformers import AutoModel + def load_gemma3_model(model_path): return AutoModel.from_pretrained("google/gemma-3-4b-it") + def get_dummy_inputs(model_handler): return { "input_ids": torch.full((1, 256), 262144, dtype=torch.long), # Image token ID "pixel_values": torch.randn(1, 3, 896, 896, dtype=torch.float32), - "attention_mask": torch.ones((1, 256), dtype=torch.long) - } \ No newline at end of file + "attention_mask": torch.ones((1, 256), dtype=torch.long), + } diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index f09a76885..fb8dba200 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -1,16 +1,16 @@ { - "input_model": { + "input_model": { "type": "PyTorchModel", "model_script": "custom_gemma3_4b_it_vision.py", "model_loader": "load_gemma3_model", "dummy_inputs_func": "get_dummy_inputs", "io_config": { - "input_names": ["input_ids", "pixel_values", "attention_mask"], - "input_shapes": [[1, 256], [1, 3, 896, 896], [1, 256]], - "input_types": ["int64", "float32", "int64"], - "output_names": ["last_hidden_state"], - "output_shapes": [[1, 256, 2560]] - } + 
"input_names": [ "input_ids", "pixel_values", "attention_mask" ], + "input_shapes": [ [ 1, 256 ], [ 1, 3, 896, 896 ], [ 1, 256 ] ], + "input_types": [ "int64", "float32", "int64" ], + "output_names": [ "last_hidden_state" ], + "output_shapes": [ [ 1, 256, 2560 ] ] + } }, "systems": { "qnn_system": { diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 816db2e12..b5d36ead1 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -203,16 +203,20 @@ def _load_image_and_tokenize(self, entry: dict[str, any]): inputs["input_ids"] = inputs["input_ids"][0] return inputs + SHORTCUT_FIRST_N = 256 + @Registry.register_dataset() def gemma_dataset(model_id: str): return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N).get_train_dataset() + @Registry.register_dataset() def gemma_text_dataset(model_id: str): - return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter='text').get_train_dataset + return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter="text").get_train_dataset + @Registry.register_dataset() def gemma_vision_dataset(model_id: str): - return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter='images').get_train_dataset() + return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter="images").get_train_dataset() diff --git a/olive/passes/onnx/conversion.py b/olive/passes/onnx/conversion.py index e9ff76799..9af4c0187 100644 --- a/olive/passes/onnx/conversion.py +++ b/olive/passes/onnx/conversion.py @@ -49,7 +49,7 @@ def forward(self, *input_data, **input_dict): return self.model(*input_data, **input_dict) -class (Pass): +class OnnxConversion(Pass): """Convert a PyTorch model to ONNX model using torch.onnx.export on CPU.""" @classmethod @@ -212,7 +212,7 @@ def _export_pytorch_model( pytorch_model = pytorch_model.to(torch_dtype) # Apply any necessary patches - ._patch_model_if_necessary(pytorch_model) + OnnxConversion._patch_model_if_necessary(pytorch_model) # get input and output names, and dynamic axes assert io_config is not None, "Cannot get io_config for the model." 
@@ -502,7 +502,7 @@ def _convert_model_on_device( dummy_inputs = self._get_dummy_inputs(model, config) io_config = model.io_config - converted_onnx_model = ._export_pytorch_model( + converted_onnx_model = OnnxConversion._export_pytorch_model( pytorch_model, dummy_inputs, io_config, config, device, torch_dtype, tempfile.tempdir ) @@ -570,11 +570,11 @@ def _export_ranked_model(params): input_model = DistributedHfModelHandler(**model_config) olive_pytorch_model = input_model.load_model(local_rank) - dummy_inputs = ._get_dummy_inputs(olive_pytorch_model, pass_config) + dummy_inputs = OnnxConversion._get_dummy_inputs(olive_pytorch_model, pass_config) io_config = None if pass_config.use_dynamo_exporter else olive_pytorch_model.io_config pytorch_model = olive_pytorch_model.prepare_session(rank=local_rank) - ranked_onnx_modelproto = ._export_pytorch_model( + ranked_onnx_modelproto = OnnxConversion._export_pytorch_model( pytorch_model, dummy_inputs, io_config, @@ -621,11 +621,11 @@ def _convert_distributed_model_on_device( max_parallel_jobs = min(world_size, config.parallel_jobs or multiprocessing.cpu_count()) if max_parallel_jobs <= 1: - results = [._export_ranked_model(_) for _ in params] + results = [OnnxConversion._export_ranked_model(_) for _ in params] else: context = multiprocessing.get_context("spawn") with context.Pool(processes=max_parallel_jobs) as pool: - results = pool.map(._export_ranked_model, params) + results = pool.map(OnnxConversion._export_ranked_model, params) if world_size != sum(results): raise RuntimeError("Failed to convert models") From 1f69af3939e780fea976676dddf254fa21ede420 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Tue, 19 Aug 2025 16:08:52 -0700 Subject: [PATCH 10/24] Vision model onnx conversion working --- examples/gemma3/qnn/README.md | 4 +- .../gemma3/qnn/custom_gemma3_4b_it_vision.py | 32 +- examples/gemma3/qnn/env_setup.sh | 5 + .../qnn/gemma3-4b-vision-qnn-config.json | 20 +- examples/gemma3/qnn/user_script.py | 277 +++++++++++++----- 5 files changed, 247 insertions(+), 91 deletions(-) diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md index b7ff54fb4..6fb3e3cb6 100644 --- a/examples/gemma3/qnn/README.md +++ b/examples/gemma3/qnn/README.md @@ -8,7 +8,7 @@ Requirements: * Python 3.10 * uv - Used throughout the setup scripts, please follow the [publically available installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) -This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the tutorial above: +This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the Phi-3.5 tutorial above: ```bash source env_setup.sh @@ -16,7 +16,7 @@ source env_setup.sh ## Optimization Process -Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work in concert at the onnxruntime-genai stage. +Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models before configuring them to work in concert at the onnxruntime-genai stage. Thus, the following commands should be used to separately produce context binaries for the text and vision portions of the model, respectively. 
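As a quick sanity check before running the full pipeline, the vision-only wrapper introduced in this patch can be exercised directly in PyTorch to confirm it emits the `[1, 256, 2560]` image features declared in the vision config. This is a minimal sketch, not part of the committed example; it assumes it is run from `examples/gemma3/qnn/` with enough memory to load the full checkpoint, and uses the `load_gemma3_model` entry point as it is named at this point in the series:

```python
import torch

# Loader added in this patch; returns the Gemma3VisualEmbeddingGenerator wrapper
from custom_gemma3_4b_it_vision import load_gemma3_model

vision_model = load_gemma3_model(None).eval()

# Dummy input matching the io_config: pixel_values of shape [1, 3, 896, 896]
pixel_values = torch.randn(1, 3, 896, 896, dtype=torch.float32)

with torch.no_grad():
    image_features = vision_model(pixel_values)

# Expected to report torch.Size([1, 256, 2560]), matching the declared output shape
print(image_features.shape)
```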
diff --git a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py index a969adecb..c0d35ecb5 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py @@ -4,17 +4,33 @@ # -------------------------------------------------------------------------- +import logging + import torch from transformers import AutoModel +logger = logging.getLogger(__name__) -def load_gemma3_model(model_path): - return AutoModel.from_pretrained("google/gemma-3-4b-it") +class Gemma3VisualEmbeddingGenerator(torch.nn.Module): + def __init__(self, full_model): + super().__init__() + # Extract only the vision components + self.vision_tower = full_model.vision_tower + self.multi_modal_projector = full_model.multi_modal_projector + + def forward(self, pixel_values): + # Process images through vision tower + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + # Project to final embedding space + return self.multi_modal_projector(selected_image_feature) + + +def load_gemma3_model(model_path): + full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") + logger.info("Loaded full model: %s", full_model) -def get_dummy_inputs(model_handler): - return { - "input_ids": torch.full((1, 256), 262144, dtype=torch.long), # Image token ID - "pixel_values": torch.randn(1, 3, 896, 896, dtype=torch.float32), - "attention_mask": torch.ones((1, 256), dtype=torch.long), - } + vision_model = Gemma3VisualEmbeddingGenerator(full_model) + logger.info("Created vision-only model: %s", vision_model) + return vision_model diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh index 03a3a9993..bc799d110 100644 --- a/examples/gemma3/qnn/env_setup.sh +++ b/examples/gemma3/qnn/env_setup.sh @@ -1,3 +1,8 @@ +#!/bin/bash +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- # Installing setuptools to build Olive from source uv pip install setuptools diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index fb8dba200..cb2860fd7 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -3,12 +3,11 @@ "type": "PyTorchModel", "model_script": "custom_gemma3_4b_it_vision.py", "model_loader": "load_gemma3_model", - "dummy_inputs_func": "get_dummy_inputs", "io_config": { - "input_names": [ "input_ids", "pixel_values", "attention_mask" ], - "input_shapes": [ [ 1, 256 ], [ 1, 3, 896, 896 ], [ 1, 256 ] ], - "input_types": [ "int64", "float32", "int64" ], - "output_names": [ "last_hidden_state" ], + "input_names": [ "pixel_values" ], + "input_shapes": [ [ 1, 3, 896, 896 ] ], + "input_types": [ "float32" ], + "output_names": [ "image_features" ], "output_shapes": [ [ 1, 256, 2560 ] ] } }, @@ -27,16 +26,23 @@ } ], "passes": { - "conversion": { "type": "OnnxConversion", "target_opset": 17 }, + "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] }, "quantization": { "type": "OnnxStaticQuantization", "quant_preprocess": true, "data_config": "gemma_vision_data_config", - "op_types_to_quantize": [ "MatMul", "LayerNormalization", "Gemm", "Sigmoid", "Gelu" ], "activation_type": "uint16", "precision": "uint8", "calibrate_method": "MinMax" }, + "cb": { + "type": "EPContextBinaryGenerator", + "provider_options": { + "htp_graph_finalization_optimization_mode": "3", + "offload_graph_io_quantization": "0" + } + }, "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } }, "target": "qnn_system", diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index b5d36ead1..2388cbe7f 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -7,6 +7,7 @@ import os import subprocess import zipfile +from abc import ABC, abstractmethod from pathlib import Path from typing import Optional @@ -23,20 +24,101 @@ logger = logging.getLogger(__name__) -class GemmaDataset: +class BaseGemmaDataset(ABC): + """Abstract base class for Gemma dataset implementations.""" + CACHE_DIR = os.getenv("CACHE_DIR", ".cache") def __init__(self, model_id: str, first_n: Optional[int] = None): self.model_id = model_id self.first_n = first_n - self.processor = AutoProcessor.from_pretrained(self.model_id) - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True - ) + + # Initialize attributes that will be set during dataset loading + self.image_data_path = None + self.raw_datasets = None + + # Initialize processor components based on subclass requirements + self._initialize_processor_components() self.setup_dataset() + @abstractmethod + def _initialize_processor_components(self): + """Initialize processor components specific to the dataset type.""" + + @abstractmethod + def _process_dataset_entry(self, entry: dict[str, any]): + """Process a single dataset entry according to the dataset type.""" + + def _convert_single_llava_to_gemma_conversation( + self, conversation: list[dict[str, str]], strip_images: bool = False + ) -> dict[str, str | list[dict]]: + """Convert a single llava-style conversation entry to Gemma-style. 
+ + Args: + conversation: The conversation entry to convert + strip_images: If True, remove tokens and create text-only content. + If False, preserve tokens and create multimodal content. + + Examples: + >>> conversation = {"from": "human", "value": "What are the colors of the bus in the image?"} + >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=False) + { + 'role': 'user', + 'content': [{'type': 'image'}, {'type': 'text', 'text': 'What are the colors of the bus in the image?'}] + } + >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=True) + { + 'role': 'user', + 'content': [{'type': 'text', 'text': 'What are the colors of the bus in the image?'}] + } + + """ + who = conversation.get("from") + match who: + case "human": + role = "user" + case "gpt": + role = "assistant" + case _: + raise ValueError(f"Unknown role: {who}") + + text = conversation.get("value") + + if strip_images: + # Text-only: remove image references completely + text = text.replace("", "").strip() + return { + "role": role, + "content": [{"type": "text", "text": text}], + } + else: + # Multimodal: preserve image references + if "" in text: + has_image = True + text = text.replace("", "") + else: + has_image = False + + return { + "role": role, + "content": ( + [{"type": "image"}, {"type": "text", "text": text}] + if has_image + else [{"type": "text", "text": text}] + ), + } + + def _convert_llava_to_gemma_conversation(self, entry: dict[str, any], strip_images: bool = False): + """Convert LlaVA-style conversations to Gemma-style.""" + entry["text"] = [ + self._convert_single_llava_to_gemma_conversation(conversation, strip_images=strip_images) + for conversation in entry["conversations"] + ] + del entry["conversations"] + return entry + def _download_and_extract_images(self): """Download the COCO train2017 image dataset and extract to the cache directory.""" zip_filename = "train2017.zip" @@ -90,9 +172,8 @@ def _download_and_extract_images(self): return extract_path - def setup_dataset(self): - # Uses a LlaVA dataset and transforms it to something Gemma-compatible - + def _load_base_dataset(self): + """Load the base LlaVA dataset.""" # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K file_path = hf_hub_download( repo_id="liuhaotian/LLaVA-Instruct-150K", @@ -107,86 +188,67 @@ def setup_dataset(self): # Limit data processing to the first_n rows self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) - # Convert the Llava-style conversation to Gemma-style conversation - self.raw_datasets = self.raw_datasets.map(self._convert_llava_to_gemma_conversation) + def _extract_image_details(self, entry: dict[str, any]): + """Extract image details from the dataset example. + + Opens the image file and adds image mode information to the example. 
+ """ + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) + entry["image_mode"] = image.mode + return entry + + def setup_dataset(self): + """Set up the dataset with common preprocessing steps.""" + self._load_base_dataset() - # Extract image details using a lambda to pass the dataset_path + # Extract image details self.raw_datasets = self.raw_datasets.map(self._extract_image_details) # Filter out any images that are not RGB self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - # Loads the images and tokenizes the text - self.raw_datasets = self.raw_datasets.with_transform(self._load_image_and_tokenize) + # Apply dataset-specific processing + logger.error(self.raw_datasets[0]) + logger.error(self.raw_datasets[1]) - for entry in self.raw_datasets: - logger.error(entry) + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - def get_train_dataset(self): - return self.raw_datasets + logger.error(self.raw_datasets[0]) + logger.error(self.raw_datasets[1]) - @staticmethod - def _convert_llava_to_gemma_conversation(entry: dict[str, any]): - entry["text"] = [ - GemmaDataset._convert_single_llava_to_gemma_conversation(conversation) - for conversation in entry["conversations"] - ] - del entry["conversations"] - return entry + def get_dataset(self): + """Return the processed dataset.""" + return self.raw_datasets - @staticmethod - def _convert_single_llava_to_gemma_conversation(conversation: list[dict[str, str]]) -> dict[str, str | list[dict]]: - """Convert a single llava-style conversation entry to Gemma-style. - Examples: - >>> conversation = {"from": "human", "value": "What are the colors of the bus in the image?"} - >>> _convert_llava_to_gemma_conversation(conversation) - { - 'role': 'user', - 'content': [{'type': 'image'}, {'type': 'text', 'text': 'What are the colors of the bus in the image?'}] - } - >>> conversation = {"from": "gpt", "value": "The bus in the image is white and red."} - >>> _convert_llava_to_gemma_conversation(conversation) - { - 'role': 'assistant', - 'content': [{'type': 'text', 'text': 'The bus in the image is white and red.'}] - } +class GemmaMultimodalDataset(BaseGemmaDataset): + """Dataset for full E2E Gemma 3 multi-modal model including both image and text.""" - """ - who = conversation.get("from") - match who: - case "human": - role = "user" - case "gpt": - role = "assistant" - case _: - raise ValueError(f"Unknown role: {who}") + def _initialize_processor_components(self): + """Initialize tokenizer for multimodal processing.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) - text = conversation.get("value") + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() - if "" in text: - has_image = True - text = text.replace("", "") - else: - has_image = False + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) - return { - "role": role, - "content": ( - [{"type": "image"}, {"type": "text", "text": text}] if has_image else [{"type": "text", "text": text}] - ), - } + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - def _extract_image_details(self, entry: dict[str, any]): - """Extract image details from the dataset 
example. + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - Opens the image file and adds image mode information to the example. - """ - image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) - entry["image_mode"] = image.mode - return entry + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - def _load_image_and_tokenize(self, entry: dict[str, any]): + def _process_dataset_entry(self, entry: dict[str, any]): """Load image and tokenize the conversation for model input. Args: @@ -204,19 +266,86 @@ def _load_image_and_tokenize(self, entry: dict[str, any]): return inputs -SHORTCUT_FIRST_N = 256 +class GemmaTextOnlyDataset(BaseGemmaDataset): + """Dataset for only the text portion of the Gemma 3 model.""" + + def _initialize_processor_components(self): + """Initialize tokenizer for text-only processing.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def setup_dataset(self): + """Set up the text-only dataset with conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (strip images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=True) + ) + + # Extract image details (still needed for filtering) + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply text-only processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Extract and tokenize only the text content. 
+ + Args: + entry: Dataset entry containing text conversation + + Returns: + Tokenized text inputs ready for model processing + + """ + # Apply chat template without images, text-only + inputs = self.tokenizer.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + return {k: v.squeeze(0) for k, v in inputs.items()} # Remove batch dimension + + +class GemmaVisionOnlyDataset(BaseGemmaDataset): + """Dataset for only the vision tower of the Gemma 3 model.""" + + def _initialize_processor_components(self): + """No additional components needed for vision-only processing.""" + + def _process_dataset_entry(self, entry: dict[str, any]): + """Load image and extract only pixel_values for vision-only processing.""" + # Load and process the image + logger.error("PROCESSING IMAGE") + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + + # Process image to get pixel_values + inputs = self.processor(text="", images=image, return_tensors="pt") + + # Return only pixel_values + return {"pixel_values": inputs["pixel_values"]} + + +# Remove this when submitting for review +SHORTCUT_FIRST_N = 2 @Registry.register_dataset() def gemma_dataset(model_id: str): - return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N).get_train_dataset() + """Full E2E Gemma 3 multi-modal dataset (image + text).""" + return GemmaMultimodalDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() @Registry.register_dataset() def gemma_text_dataset(model_id: str): - return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter="text").get_train_dataset + """Text-only Gemma 3 dataset.""" + return GemmaTextOnlyDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() @Registry.register_dataset() def gemma_vision_dataset(model_id: str): - return GemmaDataset(model_id, first_n=SHORTCUT_FIRST_N, filter="images").get_train_dataset() + """Vision-only Gemma 3 dataset.""" + return GemmaVisionOnlyDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() From aed20eccf1d64dde6f33124d343ca25ac1faf01d Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Tue, 19 Aug 2025 17:02:26 -0700 Subject: [PATCH 11/24] Enable quant on text model --- examples/gemma3/qnn/README.md | 2 ++ examples/gemma3/qnn/env_setup.sh | 4 ++-- examples/gemma3/qnn/gemma3-4b-text-qnn-config.json | 2 +- .../gemma3/qnn/gemma3-4b-vision-qnn-config.json | 3 ++- examples/gemma3/qnn/user_script.py | 2 +- olive/passes/pytorch/rotate.py | 13 +++++++++++-- 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md index 6fb3e3cb6..275f1816f 100644 --- a/examples/gemma3/qnn/README.md +++ b/examples/gemma3/qnn/README.md @@ -14,6 +14,8 @@ This repository contains an automated setup script for Linux that can be used to source env_setup.sh ``` +> **Warning:** The above script uses a different commit hash (558449bed3ef2653c36041650d30da6bbbca440d) for building GPTQModel than the Phi-3.5 tutorial due to a [memory leak issue](https://github.com/ModelCloud/GPTQModel/commit/558449bed3ef2653c36041650d30da6bbbca440d) with Gemma3. + ## Optimization Process Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models before configuring them to work in concert at the onnxruntime-genai stage. 
diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh index bc799d110..aa117afc0 100644 --- a/examples/gemma3/qnn/env_setup.sh +++ b/examples/gemma3/qnn/env_setup.sh @@ -20,9 +20,9 @@ export BUILD_CUDA_EXT=0 uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git # Install GptqModel from source -uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@5d2911a4b2a709afb0941d53c3882d0cd80b9649 +# Note: Commit hash corresponds to commit which fixes Gemma 3 memory leak issue. See README.md for additional details. +uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d # Install onnxruntime-qnn without installing onnxruntime -# Note: Installing both at the same time may cause conflicts uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index d2eff5678..675d991bb 100644 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -15,7 +15,7 @@ } ], "passes": { - "q": { "type": "QuaRot" }, + "q": { "type": "QuaRot", "device": "cpu" }, "g": { "type": "GptqModel", "bits": 4, diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index cb2860fd7..42b775087 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -34,7 +34,8 @@ "data_config": "gemma_vision_data_config", "activation_type": "uint16", "precision": "uint8", - "calibrate_method": "MinMax" + "calibrate_method": "MinMax", + "calibration_providers": [ "CUDAExecutionProvider" ] }, "cb": { "type": "EPContextBinaryGenerator", diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 2388cbe7f..1ddbc1839 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -330,7 +330,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Remove this when submitting for review -SHORTCUT_FIRST_N = 2 +SHORTCUT_FIRST_N = 256 @Registry.register_dataset() diff --git a/olive/passes/pytorch/rotate.py b/olive/passes/pytorch/rotate.py index 470eb619a..d82fe947d 100644 --- a/olive/passes/pytorch/rotate.py +++ b/olive/passes/pytorch/rotate.py @@ -44,6 +44,11 @@ class RotateMode(StrEnumBase): @classmethod def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassConfigParam]: return { + "device": PassConfigParam( + type_=str, + default_value="cpu", + description="Whether to run rotation on cpu or gpu. 
Accepted values are 'cpu' and 'cuda'.", + ), "seed": PassConfigParam( type_=int, default_value=0, @@ -60,6 +65,7 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon def rotate_model( self, model: HfModelHandler, + device: str, rotate_mode: str, seed: int, training_args: Optional[BaseHFTrainingArguments] = None, @@ -157,10 +163,13 @@ def rotate_model( count_trainable_parameters(model_wrapper.model), ) + if device == "cuda" and not torch.cuda.is_available(): + raise ValueError("Please install CUDA to rotate with it.") + return ( model_wrapper, rotation_params, - [((RotateEmbed, RotateLinear), lambda x: x.create_merged("cuda" if torch.cuda.is_available() else "cpu"))], + [((RotateEmbed, RotateLinear), lambda x: x.create_merged(device))], ) @classmethod @@ -246,7 +255,7 @@ class QuaRot(RotateBase): def _run_for_config( self, model: HfModelHandler, config: type[BasePassConfig], output_model_path: str ) -> HfModelHandler: - model_wrapper, _, save_replacements = self.rotate_model(model, config.rotate_mode, config.seed) + model_wrapper, _, save_replacements = self.rotate_model(model, config.device, config.rotate_mode, config.seed) # save the model model_wrapper.save_model(output_model_path, replacements=save_replacements) From ba0633c8bb354c705b326cc73cd12d73421930b9 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Mon, 25 Aug 2025 20:11:32 -0700 Subject: [PATCH 12/24] Improve README --- examples/gemma3/qnn/README.md | 109 ++++++++++++++++-- .../qnn/gemma3-4b-vision-qnn-config.json | 3 +- examples/gemma3/qnn/user_script.py | 2 +- 3 files changed, 102 insertions(+), 12 deletions(-) diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md index 275f1816f..93c347fbe 100644 --- a/examples/gemma3/qnn/README.md +++ b/examples/gemma3/qnn/README.md @@ -1,26 +1,92 @@ # Gemma-3-4B Model Optimization -This repository demonstrates the optimization of the [Google Gemma-3-4B](https://huggingface.co/google/gemma-3-4b-it) model using **post-training quantization (PTQ)** techniques. The optimization process utilizes an environment based heavily upon the [PTQ tutorial for Phi-3.5](https://github.com/CodeLinaro/Olive/blob/main/examples/phi3_5/README.md) +This repository demonstrates the optimization of the [Google Gemma-3-4B](https://huggingface.co/google/gemma-3-4b-it) model using **post-training quantization (PTQ)** techniques for QNN (Qualcomm Neural Network) execution. 
The optimization process utilizes an environment based heavily upon the [PTQ tutorial for Phi-3.5](https://github.com/CodeLinaro/Olive/blob/main/examples/phi3_5/README.md) -## Automated Setup (Linux Only) +## File Overview -Requirements: -* Python 3.10 -* uv - Used throughout the setup scripts, please follow the [publically available installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) +This example contains the following key files: -This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the Phi-3.5 tutorial above: +- **`env_setup.sh`** - Automated environment setup script (Linux only) +- **`gemma3-4b-text-qnn-config.json`** - Olive configuration for optimizing the text component +- **`gemma3-4b-vision-qnn-config.json`** - Olive configuration for optimizing the vision component +- **`user_script.py`** - Dataset handling and preprocessing utilities +- **`custom_gemma3_4b_it_vision.py`** - Vision model loader for the optimization pipeline +## Prerequisites + +### System Requirements +- **Operating System**: Linux (automated setup script is Linux-only) +- **Python**: 3.10 +- **Package Manager**: [uv](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) +- **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically) + +### Dependencies Installed by Setup Script +The `env_setup.sh` script installs the following components: +- setuptools (for building Olive from source) +- Olive requirements and dependencies +- AutoGPTQ (from source) +- GPTQModel (specific commit: `558449bed3ef2653c36041650d30da6bbbca440d`) +- onnxruntime-qnn (pre-release version) + +## Setup Instructions + +### Automated Setup (Recommended) ```bash source env_setup.sh ``` -> **Warning:** The above script uses a different commit hash (558449bed3ef2653c36041650d30da6bbbca440d) for building GPTQModel than the Phi-3.5 tutorial due to a [memory leak issue](https://github.com/ModelCloud/GPTQModel/commit/558449bed3ef2653c36041650d30da6bbbca440d) with Gemma3. +### Manual Setup (Alternative) +If you prefer to set up manually or need to troubleshoot: + +1. Install setuptools: + ```bash + uv pip install setuptools + ``` + +2. Install requirements: + ```bash + uv pip install -r ../requirements.txt + uv pip install -r ../../../requirements.txt + ``` + +3. Install AutoGPTQ from source: + ```bash + export BUILD_CUDA_EXT=0 + uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git + ``` + +4. Install GPTQModel with Gemma3 fix: + ```bash + uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d + ``` + +5. Install onnxruntime-qnn: + ```bash + uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt + uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps + ``` + +> **Important:** The setup uses a specific commit hash for GPTQModel (`558449bed3ef2653c36041650d30da6bbbca440d`) to address a [memory leak issue](https://github.com/ModelCloud/GPTQModel/commit/558449bed3ef2653c36041650d30da6bbbca440d) with Gemma3 models. 
## Optimization Process -Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models before configuring them to work in concert at the onnxruntime-genai stage. +Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work together at the onnxruntime-genai stage. + +### Configuration Differences + +**Text Configuration (`gemma3-4b-text-qnn-config.json`)**: +- Uses HuggingFace model directly (`google/gemma-3-4b-it`) +- Applies comprehensive optimization pipeline: QuaRot → GptqModel → ModelBuilder → Quantization +- Outputs to: `models/gemma-3-4b-it-text/` + +**Vision Configuration (`gemma3-4b-vision-qnn-config.json`)**: +- Uses custom PyTorch model loader (`custom_gemma3_4b_it_vision.py`) +- Simpler pipeline: ONNX Conversion → Graph Surgery → Quantization +- Outputs to: `models/gemma-3-4b-it-vision/` -Thus, the following commands should be used to separately produce context binaries for the text and vision portions of the model, respectively. +### Running Optimization + +Execute the following commands to separately produce optimized binaries for each component: ```bash olive run --config gemma3-4b-text-qnn-config.json @@ -29,3 +95,28 @@ olive run --config gemma3-4b-text-qnn-config.json ```bash olive run --config gemma3-4b-vision-qnn-config.json ``` + +## Expected Outputs + +After successful optimization, you will find: + +- **Text model outputs**: `models/gemma-3-4b-it-text/` +- **Vision model outputs**: `models/gemma-3-4b-it-vision/` +- **Cache directory**: `cache/` (intermediate files and downloaded datasets) +- **Dataset**: `.cache/train2017/` (COCO train2017 images, ~13GB) + +Both configurations use `"no_artifacts": true`, meaning only the final optimized models are retained. + +## Troubleshooting + +### Common Issues + +**Insufficient Storage**: The COCO train2017 dataset requires ~13GB of storage and is downloaded automatically to `.cache/train2017/`. + +**Memory Requirements**: The optimization process, particularly for the text model with its comprehensive pipeline, requires substantial memory. + +**QNN Provider**: Ensure the QNNExecutionProvider is properly installed and configured in your environment. + +**Platform Limitation**: The current setup script is designed for Linux only. Windows/macOS users will need to adapt the manual setup steps. + +**Dataset Download**: If the COCO dataset download fails, check your internet connection and available storage. The script uses `wget` which must be available on your system. 
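Once a run completes, the generated context binary can be opened with onnxruntime-qnn to confirm it exposes the expected inputs and outputs before it is wired into a genai configuration. The sketch below is an assumption-laden example: the exact `*_ctx.onnx` filename under `models/gemma-3-4b-it-vision/` and the HTP backend path may differ on your machine.

```python
import onnxruntime as ort

# Hypothetical filename; substitute the *_ctx.onnx that Olive actually produced
session = ort.InferenceSession(
    "models/gemma-3-4b-it-vision/model_ctx.onnx",
    providers=["QNNExecutionProvider"],
    provider_options=[{"backend_path": "libQnnHtp.so"}],  # QnnHtp.dll on Windows
)

# List the graph I/O so names and shapes can be checked against the vision io_config
for tensor in session.get_inputs() + session.get_outputs():
    print(tensor.name, tensor.type, tensor.shape)
```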
diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index 42b775087..cb2860fd7 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -34,8 +34,7 @@ "data_config": "gemma_vision_data_config", "activation_type": "uint16", "precision": "uint8", - "calibrate_method": "MinMax", - "calibration_providers": [ "CUDAExecutionProvider" ] + "calibrate_method": "MinMax" }, "cb": { "type": "EPContextBinaryGenerator", diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py index 1ddbc1839..5df7d2ad2 100644 --- a/examples/gemma3/qnn/user_script.py +++ b/examples/gemma3/qnn/user_script.py @@ -330,7 +330,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Remove this when submitting for review -SHORTCUT_FIRST_N = 256 +SHORTCUT_FIRST_N = 20 @Registry.register_dataset() From acbdfdca0b99b157efc5e18447892302243b435c Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Thu, 28 Aug 2025 14:49:37 -0700 Subject: [PATCH 13/24] Add files from Prudvhi --- examples/gemma3/qnn/app.py | 61 +++ ...script.py => custom_gemma3_4b_datasets.py} | 0 .../gemma3/qnn/custom_gemma3_4b_embedding.py | 38 ++ ...t_vision.py => custom_gemma3_4b_vision.py} | 2 +- .../qnn/gemma3-4b-embedding-qnn-config.json | 40 ++ .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 5 +- .../qnn/gemma3-4b-vision-qnn-config.json | 4 +- examples/gemma3/qnn/genai_config.json | 422 ++++++++++++++++++ 8 files changed, 567 insertions(+), 5 deletions(-) create mode 100644 examples/gemma3/qnn/app.py rename examples/gemma3/qnn/{user_script.py => custom_gemma3_4b_datasets.py} (100%) create mode 100644 examples/gemma3/qnn/custom_gemma3_4b_embedding.py rename examples/gemma3/qnn/{custom_gemma3_4b_it_vision.py => custom_gemma3_4b_vision.py} (96%) create mode 100644 examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json create mode 100644 examples/gemma3/qnn/genai_config.json diff --git a/examples/gemma3/qnn/app.py b/examples/gemma3/qnn/app.py new file mode 100644 index 000000000..380e15220 --- /dev/null +++ b/examples/gemma3/qnn/app.py @@ -0,0 +1,61 @@ +# app.py +# ruff: noqa: T201 +from argparse import ArgumentParser +import numpy as np + +import onnxruntime_genai as og + +parser = ArgumentParser(description="Run a simple chat application with the Gemma3 model.") +parser.add_argument( + "-m", + "--model_folder", + type=str, + default="", + help="Path to the folder containing the outputs of Olive run", +) +args = parser.parse_args() + +# Load the base model and tokenizer +model = og.Model(f"{args.model_folder}/model") +tokenizer = og.Tokenizer(model) +tokenizer_stream = tokenizer.create_stream() + +# Set the max length to something sensible by default, +# since otherwise it will be set to the entire context length +search_options = {} +search_options["max_length"] = 512 + + +text = "Write a Python function to reverse a string." 
+
+# Generate prompt (prompt template + input)
+prompt = tokenizer.apply_chat_template(
+    messages=f"""[{{"role": "user", "content": "{text}"}}]""", add_generation_prompt=True
+)
+
+# Encode the prompt using the tokenizer
+input_tokens = tokenizer.encode(prompt)
+
+# Create params and generator
+params = og.GeneratorParams(model)
+params.set_search_options(**search_options)
+generator = og.Generator(model, params)
+
+# Append input tokens to the generator
+generator.append_tokens(input_tokens)
+
+print("")
+print("Output: ", end="", flush=True)
+# Stream the output
+try:
+    while not generator.is_done():
+        generator.generate_next_token()
+
+        new_token = generator.get_next_tokens()[0]
+        print(tokenizer_stream.decode(new_token), end="", flush=True)
+except KeyboardInterrupt:
+    print(" --control+c pressed, aborting generation--")
+print()
+print()
+
+del generator
diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py
similarity index 100%
rename from examples/gemma3/qnn/user_script.py
rename to examples/gemma3/qnn/custom_gemma3_4b_datasets.py
diff --git a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py
new file mode 100644
index 000000000..414756808
--- /dev/null
+++ b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py
@@ -0,0 +1,38 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+
+import logging
+
+import torch
+from transformers import AutoModel
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingLayer(torch.nn.Module):
+    def __init__(self, full_model):
+        super().__init__()
+        self.embedding_layer = full_model.language_model.embed_tokens
+
+    def forward(self, input_ids, image_features):
+        image_token_index=262144
+        inputs_embeds = self.embedding_layer(input_ids)
+
+        special_image_mask = (input_ids == image_token_index).unsqueeze(-1)
+        special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+        return inputs_embeds
+
+
+def load_gemma3_embedding_model(model_path):
+    full_model = AutoModel.from_pretrained("google/gemma-3-4b-it")
+    logger.info("Loaded full model: %s", full_model)
+
+    embedding_layer = EmbeddingLayer(full_model)
+
+    logger.info("Created embedding-only model: %s", embedding_layer)
+    return embedding_layer
diff --git a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_vision.py
similarity index 96%
rename from examples/gemma3/qnn/custom_gemma3_4b_it_vision.py
rename to examples/gemma3/qnn/custom_gemma3_4b_vision.py
index c0d35ecb5..1eb7f8f33 100644
--- a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py
+++ b/examples/gemma3/qnn/custom_gemma3_4b_vision.py
@@ -27,7 +27,7 @@ def forward(self, pixel_values):
         return self.multi_modal_projector(selected_image_feature)
 
 
-def load_gemma3_model(model_path):
+def load_gemma3_vision_model(model_path):
     full_model = AutoModel.from_pretrained("google/gemma-3-4b-it")
     logger.info("Loaded full model: %s", full_model)
 
diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json
new file mode 100644
index
000000000..dc2acc3ed --- /dev/null +++ b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json @@ -0,0 +1,40 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_script": "custom_gemma3_4b_embedding.py", + "model_loader": "load_gemma3_embedding_model", + "io_config": { + "input_names": [ "input_ids", "image_features" ], + "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], + "input_types": [ "int32", "float32" ], + "output_names": [ "inputs_embeds" ], + "output_shapes": [ [ 1, 64, 2560 ] ], + "dynamic_axes": { + "input_ids": {"0": "batch_size", "1": "seq_length"}, + "image_features": {"0": "batch_size"} + } + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [{"device": "cpu", "execution_providers": ["CPUExecutionProvider"]}], + } + }, + "data_configs": [ + { + "name": "gemma_embedding_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "gemma_embedding_layer_dataset", "model_id": "google/gemma-3-4b-it" } + } + ], + "passes": { + "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } + }, + "target": "local_system", + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it-embed", + "cache_dir": "cache-embd", + "no_artifacts": true +} \ No newline at end of file diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 675d991bb..630a86b78 100644 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -10,7 +10,7 @@ "data_configs": [ { "name": "gemma_text_data_config", - "user_script": "user_script.py", + "user_script": "custom_gemma3_4b_datasets.py", "load_dataset_config": { "type": "gemma_text_dataset", "model_id": "google/gemma-3-4b-it" } } ], @@ -25,7 +25,7 @@ "device": "cuda", "data_config": "gemma_text_data_config" }, - "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, + "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true }, "mb": { "type": "ModelBuilder", "precision": "int4", @@ -68,6 +68,7 @@ "htp_graph_finalization_optimization_mode": "3", "soc_model": "60" }, + "session_options": {"intra_op_num_threads": 2, "inter_op_num_threads": 1}, "weight_sharing": true }, "cp": { "type": "ComposeOnnxModels" } diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index cb2860fd7..1ce2126f1 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -2,7 +2,7 @@ "input_model": { "type": "PyTorchModel", "model_script": "custom_gemma3_4b_it_vision.py", - "model_loader": "load_gemma3_model", + "model_loader": "load_gemma3_vision_model", "io_config": { "input_names": [ "pixel_values" ], "input_shapes": [ [ 1, 3, 896, 896 ] ], @@ -21,7 +21,7 @@ "data_configs": [ { "name": "gemma_vision_data_config", - "user_script": "user_script.py", + "user_script": "custom_gemma3_4b_datasets.py", "load_dataset_config": { "type": "gemma_vision_dataset", "model_id": "google/gemma-3-4b-it" } } ], diff --git a/examples/gemma3/qnn/genai_config.json b/examples/gemma3/qnn/genai_config.json new file mode 100644 index 000000000..0605cff14 --- /dev/null +++ b/examples/gemma3/qnn/genai_config.json @@ -0,0 +1,422 @@ +{ + "model": { + "bos_token_id": 2, + "context_length": 131072, + "decoder": { + "session_options": { 
+ "log_id": "onnxruntime-genai", + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + }, + "head_size": 256, + "hidden_size": 2560, + "inputs": { + "input_ids": "inputs_embeds", + "attention_mask": "attention_mask", + "past_key_names": "past_key_values.%d.key", + "past_value_names": "past_key_values.%d.value", + "past_sequence_length": "past_seq_len", + "total_sequence_length": "total_seq_len" + }, + "outputs": { + "logits": "logits", + "present_key_names": "present.%d.key", + "present_value_names": "present.%d.value" + }, + "num_attention_heads": 8, + "num_hidden_layers": 34, + "num_key_value_heads": 4, + "sliding_window": { + "window_size": 64, + "slide_key_value_cache": false, + "slide_inputs": true, + "pad_value": 0, + "alignment": "left" + }, + "pipeline": [ + { + "context_ctx": { + "filename": "context_ctx.onnx", + "inputs": [ + "inputs_embeds", + "past_key_values.0.key", + "past_key_values.0.value", + "past_seq_len", + "total_seq_len", + "past_key_values.1.key", + "past_key_values.1.value", + "past_key_values.2.key", + "past_key_values.2.value", + "past_key_values.3.key", + "past_key_values.3.value", + "past_key_values.4.key", + "past_key_values.4.value", + "past_key_values.5.key", + "past_key_values.5.value", + "past_key_values.6.key", + "past_key_values.6.value", + "past_key_values.7.key", + "past_key_values.7.value", + "past_key_values.8.key", + "past_key_values.8.value", + "past_key_values.9.key", + "past_key_values.9.value", + "past_key_values.10.key", + "past_key_values.10.value", + "past_key_values.11.key", + "past_key_values.11.value", + "past_key_values.12.key", + "past_key_values.12.value", + "past_key_values.13.key", + "past_key_values.13.value", + "past_key_values.14.key", + "past_key_values.14.value", + "past_key_values.15.key", + "past_key_values.15.value", + "past_key_values.16.key", + "past_key_values.16.value", + "past_key_values.17.key", + "past_key_values.17.value", + "past_key_values.18.key", + "past_key_values.18.value", + "past_key_values.19.key", + "past_key_values.19.value", + "past_key_values.20.key", + "past_key_values.20.value", + "past_key_values.21.key", + "past_key_values.21.value", + "past_key_values.22.key", + "past_key_values.22.value", + "past_key_values.23.key", + "past_key_values.23.value", + "past_key_values.24.key", + "past_key_values.24.value", + "past_key_values.25.key", + "past_key_values.25.value", + "past_key_values.26.key", + "past_key_values.26.value", + "past_key_values.27.key", + "past_key_values.27.value", + "past_key_values.28.key", + "past_key_values.28.value", + "past_key_values.29.key", + "past_key_values.29.value", + "past_key_values.30.key", + "past_key_values.30.value", + "past_key_values.31.key", + "past_key_values.31.value", + "past_key_values.32.key", + "past_key_values.32.value", + "past_key_values.33.key", + "past_key_values.33.value" + ], + "outputs": [ + "present.0.key", + "present.0.value", + "present.1.key", + "present.1.value", + "present.2.key", + "present.2.value", + "present.3.key", + "present.3.value", + "present.4.key", + "present.4.value", + "present.5.key", + "present.5.value", + "present.6.key", + "present.6.value", + "present.7.key", + "present.7.value", + "present.8.key", + "present.8.value", + "present.9.key", + "present.9.value", + "present.10.key", + "present.10.value", + "present.11.key", + "present.11.value", + "present.12.key", + "present.12.value", + "present.13.key", + "present.13.value", + 
"present.14.key", + "present.14.value", + "present.15.key", + "present.15.value", + "present.16.key", + "present.16.value", + "present.17.key", + "present.17.value", + "present.18.key", + "present.18.value", + "present.19.key", + "present.19.value", + "present.20.key", + "present.20.value", + "present.21.key", + "present.21.value", + "present.22.key", + "present.22.value", + "present.23.key", + "present.23.value", + "present.24.key", + "present.24.value", + "present.25.key", + "present.25.value", + "present.26.key", + "present.26.value", + "present.27.key", + "present.27.value", + "present.28.key", + "present.28.value", + "present.29.key", + "present.29.value", + "present.30.key", + "present.30.value", + "present.31.key", + "present.31.value", + "present.32.key", + "present.32.value", + "present.33.key", + "present.33.value", + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" + ], + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1, + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + }, + "run_on_token_gen": false + }, + "iterator_ctx": { + "filename": "iterator_ctx.onnx", + "inputs": [ + "inputs_embeds", + "past_key_values.0.key", + "past_key_values.0.value", + "past_seq_len", + "total_seq_len", + "past_key_values.1.key", + "past_key_values.1.value", + "past_key_values.2.key", + "past_key_values.2.value", + "past_key_values.3.key", + "past_key_values.3.value", + "past_key_values.4.key", + "past_key_values.4.value", + "past_key_values.5.key", + "past_key_values.5.value", + "past_key_values.6.key", + "past_key_values.6.value", + "past_key_values.7.key", + "past_key_values.7.value", + "past_key_values.8.key", + "past_key_values.8.value", + "past_key_values.9.key", + "past_key_values.9.value", + "past_key_values.10.key", + "past_key_values.10.value", + "past_key_values.11.key", + "past_key_values.11.value", + "past_key_values.12.key", + "past_key_values.12.value", + "past_key_values.13.key", + "past_key_values.13.value", + "past_key_values.14.key", + "past_key_values.14.value", + "past_key_values.15.key", + "past_key_values.15.value", + "past_key_values.16.key", + "past_key_values.16.value", + "past_key_values.17.key", + "past_key_values.17.value", + "past_key_values.18.key", + "past_key_values.18.value", + "past_key_values.19.key", + "past_key_values.19.value", + "past_key_values.20.key", + "past_key_values.20.value", + "past_key_values.21.key", + "past_key_values.21.value", + "past_key_values.22.key", + "past_key_values.22.value", + "past_key_values.23.key", + "past_key_values.23.value", + "past_key_values.24.key", + "past_key_values.24.value", + "past_key_values.25.key", + "past_key_values.25.value", + "past_key_values.26.key", + "past_key_values.26.value", + "past_key_values.27.key", + "past_key_values.27.value", + "past_key_values.28.key", + "past_key_values.28.value", + "past_key_values.29.key", + "past_key_values.29.value", + "past_key_values.30.key", + "past_key_values.30.value", + "past_key_values.31.key", + "past_key_values.31.value", + "past_key_values.32.key", + "past_key_values.32.value", + "past_key_values.33.key", + "past_key_values.33.value" + ], + "outputs": [ + "present.0.key", + "present.0.value", + "present.1.key", + "present.1.value", + "present.2.key", + "present.2.value", + "present.3.key", + "present.3.value", + "present.4.key", + "present.4.value", + "present.5.key", + "present.5.value", + 
"present.6.key", + "present.6.value", + "present.7.key", + "present.7.value", + "present.8.key", + "present.8.value", + "present.9.key", + "present.9.value", + "present.10.key", + "present.10.value", + "present.11.key", + "present.11.value", + "present.12.key", + "present.12.value", + "present.13.key", + "present.13.value", + "present.14.key", + "present.14.value", + "present.15.key", + "present.15.value", + "present.16.key", + "present.16.value", + "present.17.key", + "present.17.value", + "present.18.key", + "present.18.value", + "present.19.key", + "present.19.value", + "present.20.key", + "present.20.value", + "present.21.key", + "present.21.value", + "present.22.key", + "present.22.value", + "present.23.key", + "present.23.value", + "present.24.key", + "present.24.value", + "present.25.key", + "present.25.value", + "present.26.key", + "present.26.value", + "present.27.key", + "present.27.value", + "present.28.key", + "present.28.value", + "present.29.key", + "present.29.value", + "present.30.key", + "present.30.value", + "present.31.key", + "present.31.value", + "present.32.key", + "present.32.value", + "present.33.key", + "present.33.value", + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" + ], + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1, + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + }, + "run_on_prompt": false + }, + "lm_head": { + "filename": "lm_head.onnx", + "inputs": [ + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" + ], + "outputs": [ + "logits" + ] + } + } + ] + }, + "embedding": { + "filename": "embeddings_combined.onnx", + "inputs": { + "input_ids": "input_ids", + "image_features": "image_features" + }, + "outputs": { + "inputs_embeds": "inputs_embeds" + } + }, + "vision": { + "filename": "model_ctx.onnx", + "inputs": { + "pixel_values": "pixel_values" + }, + "outputs": { + "image_features": "image_features" + } + }, + "eos_token_id": [ + 1, + 106 + ], + "pad_token_id": 0, + "type": "gemma3", + "vocab_size": 262208 + }, + "search": { + "diversity_penalty": 0.0, + "do_sample": true, + "early_stopping": true, + "length_penalty": 1.0, + "max_length": 131072, + "min_length": 0, + "no_repeat_ngram_size": 0, + "num_beams": 1, + "num_return_sequences": 1, + "past_present_share_buffer": true, + "repetition_penalty": 1.0, + "temperature": 1.0, + "top_k": 64, + "top_p": 0.95 + } +} \ No newline at end of file From f7178ae55ced96f6be96d84af9392dcffb229670 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Tue, 2 Sep 2025 10:42:13 -0700 Subject: [PATCH 14/24] Updates --- examples/gemma3/qnn/app.py | 1 - .../gemma3/qnn/custom_gemma3_4b_datasets.py | 160 +++++++++++++++++- .../gemma3/qnn/custom_gemma3_4b_embedding.py | 9 +- .../qnn/gemma3-4b-embedding-qnn-config.json | 30 +++- .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 10 +- .../qnn/gemma3-4b-vision-qnn-config.json | 2 +- examples/gemma3/qnn/genai_config.json | 28 +-- 7 files changed, 190 insertions(+), 50 deletions(-) diff --git a/examples/gemma3/qnn/app.py b/examples/gemma3/qnn/app.py index 380e15220..13a0fe4b4 100644 --- a/examples/gemma3/qnn/app.py +++ b/examples/gemma3/qnn/app.py @@ -1,7 +1,6 @@ # app.py # ruff: noqa: T201 from argparse import ArgumentParser -import numpy as np import onnxruntime_genai as og diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py 
b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 5df7d2ad2..88cda6d8c 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- +import copy import logging import os import subprocess @@ -11,10 +12,12 @@ from pathlib import Path from typing import Optional +import torch from datasets import load_dataset from huggingface_hub import hf_hub_download from PIL import Image as PILImage from transformers import ( + AutoModel, AutoProcessor, AutoTokenizer, ) @@ -310,16 +313,15 @@ def _process_dataset_entry(self, entry: dict[str, any]): return {k: v.squeeze(0) for k, v in inputs.items()} # Remove batch dimension -class GemmaVisionOnlyDataset(BaseGemmaDataset): - """Dataset for only the vision tower of the Gemma 3 model.""" +class GemmaImageDataset(BaseGemmaDataset): + """Dataset for only the image processing of the Gemma 3 model.""" def _initialize_processor_components(self): - """No additional components needed for vision-only processing.""" + """No additional components needed for image-only processing.""" def _process_dataset_entry(self, entry: dict[str, any]): - """Load image and extract only pixel_values for vision-only processing.""" + """Load image and extract only pixel_values for image-only processing.""" # Load and process the image - logger.error("PROCESSING IMAGE") image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) # Process image to get pixel_values @@ -329,6 +331,136 @@ def _process_dataset_entry(self, entry: dict[str, any]): return {"pixel_values": inputs["pixel_values"]} +class GemmaImageEmbeddingDataset(BaseGemmaDataset): + """Dataset that pre-computes and caches image embeddings as numpy arrays.""" + + def __init__(self, model_id, first_n=None): + # Initialize lazy-loaded model components + self._vision_tower = None + self._multi_modal_projector = None + + super().__init__(model_id, first_n) + + def _initialize_processor_components(self): + """Initialize only standard processor components.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def _get_vision_components(self): + """Lazy-load vision model components when first needed.""" + if self._vision_tower is None: + logger.info("Loading vision model components for cached embedding dataset") + full_model = AutoModel.from_pretrained(self.model_id) + + # Extract vision components (equivalent to Gemma3VisualEmbeddingGenerator) + self._vision_tower = full_model.vision_tower + self._multi_modal_projector = full_model.multi_modal_projector + + # Clean up full model to save memory + del full_model.language_model + + return self._vision_tower, self._multi_modal_projector + + def _process_dataset_entry(self, entry: dict[str, any]): + """Process entry to return input_ids and cached image features.""" + # Convert conversation and tokenize + inputs = self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + + # Load and process image + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) + + # Get vision components and extract features + vision_tower, projector = self._get_vision_components() + pixel_values = pixel_values.to(device="cuda") + + 
with torch.no_grad(): + # Process through vision tower + image_outputs = vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + # Project to final embedding space + image_features = projector(selected_image_feature) + # Convert to numpy for caching + image_features = image_features.cpu().detach().numpy() + + return {"input_ids": inputs["input_ids"].squeeze(0), "image_features": image_features} + + +class GemmaEmbeddingDataset(BaseGemmaDataset): + """Dataset that pre-merges text and image embeddings.""" + + def __init__(self, model_id, first_n=None): + # Initialize lazy-loaded model components + self._vision_tower = None + self._multi_modal_projector = None + self._embedding_layer = None + + super().__init__(model_id, first_n) + + def _initialize_processor_components(self): + """Initialize only standard processor components.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def _get_model_components(self): + """Lazy-load all required model components when first needed.""" + if self._embedding_layer is None: + logger.info("Loading model components for merged embedding dataset") + full_model = AutoModel.from_pretrained(self.model_id) + + # Extract components + self._vision_tower = full_model.vision_tower + self._multi_modal_projector = full_model.multi_modal_projector + self._embedding_layer = copy.deepcopy(full_model.language_model.embed_tokens) + + # Clean up full model + del full_model.language_model + + return self._vision_tower, self._multi_modal_projector, self._embedding_layer + + def _merge_embeddings(self, input_ids: torch.Tensor, pixel_values: torch.Tensor): + """Merge text and image embeddings at special token positions.""" + vision_tower, projector, embedding_layer = self._get_model_components() + + # Get text embeddings + inputs_embeds = embedding_layer(input_ids.to(device="cuda")) + + # Process image + pixel_values = pixel_values.to(dtype=inputs_embeds.dtype, device="cuda") + with torch.no_grad(): + image_outputs = vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + image_features = projector(selected_image_feature) + + # Merge at special token positions (image_token_index = 262144) + image_token_index = 262144 + special_image_mask = (input_ids == image_token_index).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + return inputs_embeds.masked_scatter(special_image_mask, image_features) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Process entry to return merged embeddings.""" + # Convert conversation and tokenize + inputs = self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + + # Load and process image + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) + + # Merge embeddings + inputs_embeds = self._merge_embeddings(inputs["input_ids"], pixel_values) + + return {"inputs_embeds": inputs_embeds, "attention_mask": inputs["attention_mask"].squeeze(0)} + + # Remove this when submitting for review SHORTCUT_FIRST_N = 20 @@ -346,6 +478,18 @@ def gemma_text_dataset(model_id: str): @Registry.register_dataset() -def 
gemma_vision_dataset(model_id: str): - """Vision-only Gemma 3 dataset.""" - return GemmaVisionOnlyDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() +def gemma_image_dataset(model_id: str): + """Image-only Gemma 3 dataset.""" + return GemmaImageDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_embedding_dataset(model_id: str): + """Gemma 3 dataset with pre-merged text and image embeddings.""" + return GemmaEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_image_embedding_dataset(model_id: str): + """Gemma 3 dataset with pre-computed cached image embeddings.""" + return GemmaImageEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() diff --git a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py index 414756808..1af28cd55 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py @@ -16,16 +16,15 @@ class EmbeddingLayer(torch.nn.Module): def __init__(self, full_model): super().__init__() self.embedding_layer = full_model.language_model.embed_tokens - + def forward(self, input_ids, image_features): - image_token_index=262144 + image_token_index = 262144 inputs_embeds = self.embedding_layer(input_ids) - + special_image_mask = (input_ids == image_token_index).unsqueeze(-1) special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) - return inputs_embeds + return inputs_embeds.masked_scatter(special_image_mask, image_features) def load_gemma3_embedding_model(model_path): diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json index dc2acc3ed..365141dc4 100644 --- a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json @@ -7,29 +7,41 @@ "input_names": [ "input_ids", "image_features" ], "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], "input_types": [ "int32", "float32" ], - "output_names": [ "inputs_embeds" ], + "output_names": [ "/model/embed_tokens/Mul/output_0" ], "output_shapes": [ [ 1, 64, 2560 ] ], "dynamic_axes": { - "input_ids": {"0": "batch_size", "1": "seq_length"}, - "image_features": {"0": "batch_size"} + "input_ids": { "0": "batch_size", "1": "seq_length" }, + "image_features": { "0": "batch_size" } } } }, "systems": { - "local_system": { - "type": "LocalSystem", - "accelerators": [{"device": "cpu", "execution_providers": ["CPUExecutionProvider"]}], + "qnn_system": { + "type": "PythonEnvironment", + "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] } }, "data_configs": [ { "name": "gemma_embedding_data_config", - "user_script": "user_script.py", - "load_dataset_config": { "type": "gemma_embedding_layer_dataset", "model_id": "google/gemma-3-4b-it" } + "user_script": "custom_gemma3_4b_datasets.py", + "load_dataset_config": { "type": "gemma_image_embedding_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "quantization": { + "type": "OnnxStaticQuantization", + "quant_preprocess": false, + "data_config": "gemma_embedding_data_config", + 
"activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax", + "calibration_providers": [ "CUDAExecutionProvider" ], + "per_channel": true, + "weight_symmetric": true + }, "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } }, "target": "local_system", @@ -37,4 +49,4 @@ "output_dir": "models/gemma-3-4b-it-embed", "cache_dir": "cache-embd", "no_artifacts": true -} \ No newline at end of file +} diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 630a86b78..1cad472ab 100644 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -9,9 +9,9 @@ }, "data_configs": [ { - "name": "gemma_text_data_config", + "name": "gemma_embedding_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_text_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { "type": "gemma_embedding_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { @@ -23,7 +23,7 @@ "group_size": -1, "lm_head": false, "device": "cuda", - "data_config": "gemma_text_data_config" + "data_config": "gemma_embedding_data_config" }, "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true }, "mb": { @@ -51,7 +51,7 @@ }, "sq": { "type": "OnnxStaticQuantization", - "data_config": "gemma_text_data_config", + "data_config": "gemma_embedding_data_config", "activation_type": "uint16", "precision": "uint8", "calibration_providers": [ "CUDAExecutionProvider" ], @@ -68,7 +68,7 @@ "htp_graph_finalization_optimization_mode": "3", "soc_model": "60" }, - "session_options": {"intra_op_num_threads": 2, "inter_op_num_threads": 1}, + "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 }, "weight_sharing": true }, "cp": { "type": "ComposeOnnxModels" } diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index 1ce2126f1..d0d747170 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -22,7 +22,7 @@ { "name": "gemma_vision_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_vision_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { "type": "gemma_image_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { diff --git a/examples/gemma3/qnn/genai_config.json b/examples/gemma3/qnn/genai_config.json index 0605cff14..d1185aa08 100644 --- a/examples/gemma3/qnn/genai_config.json +++ b/examples/gemma3/qnn/genai_config.json @@ -369,36 +369,22 @@ "inputs": [ "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" ], - "outputs": [ - "logits" - ] + "outputs": [ "logits" ] } } ] }, "embedding": { "filename": "embeddings_combined.onnx", - "inputs": { - "input_ids": "input_ids", - "image_features": "image_features" - }, - "outputs": { - "inputs_embeds": "inputs_embeds" - } + "inputs": { "input_ids": "input_ids", "image_features": "image_features" }, + "outputs": { "inputs_embeds": "inputs_embeds" } }, "vision": { "filename": "model_ctx.onnx", - "inputs": { - "pixel_values": "pixel_values" - }, - "outputs": { - "image_features": "image_features" - } + "inputs": { "pixel_values": "pixel_values" }, + "outputs": { "image_features": "image_features" } }, - "eos_token_id": [ - 1, - 106 - ], + "eos_token_id": [ 1, 106 ], 
"pad_token_id": 0, "type": "gemma3", "vocab_size": 262208 @@ -419,4 +405,4 @@ "top_k": 64, "top_p": 0.95 } -} \ No newline at end of file +} From bd70ff40be8ce73ceee3ad0b8fefac533b71a268 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Wed, 3 Sep 2025 07:46:32 -0700 Subject: [PATCH 15/24] Updates --- examples/gemma3/qnn/app.py | 224 +++++++++++++----- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 70 ++++-- .../gemma3/qnn/custom_gemma3_4b_embedding.py | 2 +- .../qnn/gemma3-4b-embedding-qnn-config.json | 4 +- 4 files changed, 222 insertions(+), 78 deletions(-) diff --git a/examples/gemma3/qnn/app.py b/examples/gemma3/qnn/app.py index 13a0fe4b4..e83d6420f 100644 --- a/examples/gemma3/qnn/app.py +++ b/examples/gemma3/qnn/app.py @@ -1,60 +1,170 @@ -# app.py -# ruff: noqa: T201 -from argparse import ArgumentParser +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License + +import argparse +import glob +import json +import os +import time +from pathlib import Path import onnxruntime_genai as og -parser = ArgumentParser(description="Run a simple chat application with the Gemma3 model.") -parser.add_argument( - "-m", - "--model_folder", - type=str, - default="", - help="Path to the folder containing the outputs of Olive run", -) -args = parser.parse_args() - -# Load the base model and tokenizer -model = og.Model(f"{args.model_folder}/model") -tokenizer = og.Tokenizer(model) -tokenizer_stream = tokenizer.create_stream() - -# Set the max length to something sensible by default, -# since otherwise it will be set to the entire context length -search_options = {} -search_options["max_length"] = 512 - - -text = "Write a Python function to reverse a string." - -# Generate prompt (prompt template + input) -prompt = tokenizer.apply_chat_template( - messages=f"""[{{"role": "user", "content": "{text}"}}]""", add_generation_prompt=True -) - -# Encode the prompt using the tokenizer -input_tokens = tokenizer.encode(prompt) - -# Create params and generator -params = og.GeneratorParams(model) -params.set_search_options(**search_options) -generator = og.Generator(model, params) - -# Append input tokens to the generator -generator.append_tokens(input_tokens) - -print("") -print("Output: ", end="", flush=True) -# Stream the output -try: - while not generator.is_done(): - generator.generate_next_token() - - new_token = generator.get_next_tokens()[0] - print(tokenizer_stream.decode(new_token), end="", flush=True) -except KeyboardInterrupt: - print(" --control+c pressed, aborting generation--") -print() -print() - -del generator +# og.set_log_options(enabled=True, model_input_values=True, model_output_values=True) + + +def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): + curr_path = Path(current_dir).absolute() + target_dir = glob.glob(target_dir_name, root_dir=curr_path) + if target_dir: + return Path(curr_path / target_dir[0]).absolute() + else: + if curr_path.parent == curr_path: + # Root dir + return None + return _find_dir_contains_sub_dir(curr_path / "..", target_dir_name) + + +def _complete(text, state): + return (glob.glob(text + "*") + [None])[state] + + +def run(args: argparse.Namespace): + print("Loading model...") + config = og.Config(args.model_path) + if args.execution_provider != "follow_config": + config.clear_providers() + if args.execution_provider != "cpu": + print(f"Setting model to {args.execution_provider}...") + config.append_provider(args.execution_provider) + model = og.Model(config) + print("Model loaded") + + tokenizer = 
og.Tokenizer(model) + processor = model.create_multimodal_processor() + stream = processor.create_stream() + + interactive = not args.non_interactive + + while True: + if interactive: + try: + import readline + + readline.set_completer_delims(" \t\n;") + readline.parse_and_bind("tab: complete") + readline.set_completer(_complete) + except ImportError: + # Not available on some platforms. Ignore it. + pass + image_paths = [ + image_path.strip() + for image_path in input("Image Path (comma separated; leave empty if no image): ").split(",") + ] + else: + if args.image_paths: + image_paths = args.image_paths + else: + image_paths = [str(Path(__file__).parent / "images" / "dog.jpg")] + + image_paths = [image_path for image_path in image_paths if image_path] + print(image_paths) + + images = None + if len(image_paths) == 0: + print("No image provided") + else: + for i, image_path in enumerate(image_paths): + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + print(f"Using image: {image_path}") + + images = og.Images.open(*image_paths) + + if interactive: + text = input("Prompt: ") + else: + if args.prompt: + text = args.prompt + else: + text = "What is shown in this image?" + + # Construct the "messages" argument passed to apply_chat_template + messages = [] + if model.type == "phi3v": + # Combine all image tags and text into one user message + content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text + messages.append({"role": "user", "content": content}) + else: + # Gemma3-style multimodal: structured content + content_list = [{"type": "image"} for _ in image_paths] + content_list.append({"type": "text", "text": text}) + messages.append({"role": "user", "content": content_list}) + + # Apply the chat template using the tokenizer + message_json = json.dumps(messages) + print(message_json) + prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True) + + print("Processing images and prompt...") + inputs = processor(prompt, images=images) + + print("Generating response...") + params = og.GeneratorParams(model) + params.set_search_options(max_length=1024) + + print(inputs) + + generator = og.Generator(model, params) + generator.set_inputs(inputs) + start_time = time.time() + + while not generator.is_done(): + generator.generate_next_token() + + new_token = generator.get_next_tokens()[0] + print(stream.decode(new_token), end="", flush=True) + + print() + total_run_time = time.time() - start_time + print(f"Total Time : {total_run_time:.2f}") + + for _ in range(3): + print() + + # Delete the generator to free the captured graph before creating another one + del generator + + if not interactive: + break + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-m", "--model_path", type=str, default="", required=True, help="Path to the folder containing the model" + ) + parser.add_argument( + "-e", + "--execution_provider", + type=str, + required=False, + default="follow_config", + choices=["cpu", "cuda", "dml", "follow_config"], + help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", + ) + parser.add_argument( + "--image_paths", nargs="*", type=str, required=False, help="Path to the images, mainly for CI usage" + ) + parser.add_argument( + "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage" + ) + parser.add_argument( + "--non-interactive", + action=argparse.BooleanOptionalAction, + default=True, + required=False, + help="Non-interactive mode, mainly for CI usage", + ) + args = parser.parse_args() + run(args) diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 88cda6d8c..145ec2290 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -211,14 +211,8 @@ def setup_dataset(self): self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") # Apply dataset-specific processing - logger.error(self.raw_datasets[0]) - logger.error(self.raw_datasets[1]) - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - logger.error(self.raw_datasets[0]) - logger.error(self.raw_datasets[1]) - def get_dataset(self): """Return the processed dataset.""" return self.raw_datasets @@ -331,8 +325,8 @@ def _process_dataset_entry(self, entry: dict[str, any]): return {"pixel_values": inputs["pixel_values"]} -class GemmaImageEmbeddingDataset(BaseGemmaDataset): - """Dataset that pre-computes and caches image embeddings as numpy arrays.""" +class GemmaEmbeddingInputDataset(BaseGemmaDataset): + """Dataset that is the input to the embedding layer.""" def __init__(self, model_id, first_n=None): # Initialize lazy-loaded model components @@ -362,6 +356,24 @@ def _get_vision_components(self): return self._vision_tower, self._multi_modal_projector + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + def _process_dataset_entry(self, entry: dict[str, any]): """Process entry to return input_ids and cached image features.""" # Convert conversation and tokenize @@ -413,9 +425,9 @@ def _get_model_components(self): full_model = AutoModel.from_pretrained(self.model_id) # Extract components - self._vision_tower = full_model.vision_tower - self._multi_modal_projector = full_model.multi_modal_projector - self._embedding_layer = copy.deepcopy(full_model.language_model.embed_tokens) + self._vision_tower = full_model.vision_tower.cuda() + self._multi_modal_projector = full_model.multi_modal_projector.cuda() + self._embedding_layer = copy.deepcopy(full_model.language_model.embed_tokens).cuda() # Clean up full model del full_model.language_model @@ -444,6 +456,24 @@ def _merge_embeddings(self, input_ids: torch.Tensor, pixel_values: torch.Tensor) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) return 
inputs_embeds.masked_scatter(special_image_mask, image_features) + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + def _process_dataset_entry(self, entry: dict[str, any]): """Process entry to return merged embeddings.""" # Convert conversation and tokenize @@ -458,7 +488,11 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Merge embeddings inputs_embeds = self._merge_embeddings(inputs["input_ids"], pixel_values) - return {"inputs_embeds": inputs_embeds, "attention_mask": inputs["attention_mask"].squeeze(0)} + return { + "input_ids": inputs["input_ids"], + "inputs_embeds": inputs_embeds, + "attention_mask": inputs["attention_mask"].squeeze(0), + } # Remove this when submitting for review @@ -484,12 +518,12 @@ def gemma_image_dataset(model_id: str): @Registry.register_dataset() -def gemma_embedding_dataset(model_id: str): - """Gemma 3 dataset with pre-merged text and image embeddings.""" - return GemmaEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() +def gemma_embedding_input_dataset(model_id: str): + """Gemma 3 dataset with embedding layer input.""" + return GemmaEmbeddingInputDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() @Registry.register_dataset() -def gemma_image_embedding_dataset(model_id: str): - """Gemma 3 dataset with pre-computed cached image embeddings.""" - return GemmaImageEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() +def gemma_embedding_dataset(model_id: str): + """Gemma 3 dataset with pre-merged text and image embeddings.""" + return GemmaEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() diff --git a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py index 1af28cd55..97c9cf2ea 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py @@ -31,7 +31,7 @@ def load_gemma3_embedding_model(model_path): full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") logger.info("Loaded full model: %s", full_model) - embedding_layer = EmbeddingLayer(full_model.language_model.embed_tokens) + embedding_layer = EmbeddingLayer(full_model) logger.info("Created embedding-only model: %s", embedding_layer) return embedding_layer diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json index 365141dc4..8a70e359a 100644 --- a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json @@ -26,7 +26,7 @@ { "name": "gemma_embedding_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_image_embedding_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { "type": "gemma_embedding_input_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { @@ -44,7 +44,7 @@ }, 
"add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } }, - "target": "local_system", + "target": "qnn_system", "log_severity_level": 1, "output_dir": "models/gemma-3-4b-it-embed", "cache_dir": "cache-embd", From c962ceee8a1e0a5395a713d6a0f75d1d8537ca3f Mon Sep 17 00:00:00 2001 From: Alahari Prudhvi Akhil Date: Thu, 4 Sep 2025 03:36:30 -0700 Subject: [PATCH 16/24] Add olive requirements file --- examples/gemma3/qnn/olive_req.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100755 examples/gemma3/qnn/olive_req.txt diff --git a/examples/gemma3/qnn/olive_req.txt b/examples/gemma3/qnn/olive_req.txt new file mode 100755 index 000000000..8923fbfa7 --- /dev/null +++ b/examples/gemma3/qnn/olive_req.txt @@ -0,0 +1,7 @@ +transformers +datasets +optimum +onnxruntime-gpu==1.22.0 +onnxruntime-genai-cuda==0.9.0 +setuptools +tabulate \ No newline at end of file From 360d9c29995de5054dae3d177e04daff1f3511cb Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Thu, 4 Sep 2025 12:29:37 -0700 Subject: [PATCH 17/24] update --- examples/gemma3/qnn/custom_gemma3_4b_datasets.py | 2 +- examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 145ec2290..6d3c35c12 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -496,7 +496,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Remove this when submitting for review -SHORTCUT_FIRST_N = 20 +SHORTCUT_FIRST_N = 25 @Registry.register_dataset() diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index d0d747170..fe5328c6e 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -1,7 +1,7 @@ { "input_model": { "type": "PyTorchModel", - "model_script": "custom_gemma3_4b_it_vision.py", + "model_script": "custom_gemma3_4b_vision.py", "model_loader": "load_gemma3_vision_model", "io_config": { "input_names": [ "pixel_values" ], @@ -34,7 +34,9 @@ "data_config": "gemma_vision_data_config", "activation_type": "uint16", "precision": "uint8", - "calibrate_method": "MinMax" + "calibrate_method": "MinMax", + "per_channel": true, + "weight_symmetric": true }, "cb": { "type": "EPContextBinaryGenerator", From 5fcda5c05d1474d462fdacef7d2420d4b5aa3e5d Mon Sep 17 00:00:00 2001 From: Alahari Prudhvi Akhil Date: Thu, 4 Sep 2025 13:43:43 -0700 Subject: [PATCH 18/24] Update Olive scripts for gemma3 --- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 5 +- examples/gemma3/qnn/gemma-3-4b.ipynb | 344 ++++++++++++++++++ .../qnn/gemma3-4b-embedding-qnn-config.json | 81 ++++- .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 117 ++++-- .../qnn/gemma3-4b-vision-qnn-config.json | 75 +++- examples/gemma3/qnn/{ => genai}/app.py | 0 .../gemma3/qnn/{ => genai}/genai_config.json | 69 +++- .../gemma3/qnn/genai/processor_config.json | 53 +++ examples/gemma3/qnn/olive_req.txt | 7 - examples/gemma3/qnn/qnn_req.txt | 7 + examples/gemma3/requirements.txt | 8 +- olive/common/hf/utils.py | 10 +- olive/model/handler/hf.py | 8 +- 13 files changed, 693 insertions(+), 91 deletions(-) create mode 100755 examples/gemma3/qnn/gemma-3-4b.ipynb rename examples/gemma3/qnn/{ => genai}/app.py (100%) rename examples/gemma3/qnn/{ => genai}/genai_config.json (92%) mode change 100644 => 100755 create mode 
100755 examples/gemma3/qnn/genai/processor_config.json delete mode 100755 examples/gemma3/qnn/olive_req.txt create mode 100755 examples/gemma3/qnn/qnn_req.txt diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 6d3c35c12..987297cb8 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -150,7 +150,6 @@ def _download_and_extract_images(self): zip_path, ], check=True, - cwd=self.CACHE_DIR, ) logger.info("Download completed successfully") except subprocess.CalledProcessError: @@ -354,7 +353,7 @@ def _get_vision_components(self): # Clean up full model to save memory del full_model.language_model - return self._vision_tower, self._multi_modal_projector + return self._vision_tower.cuda(), self._multi_modal_projector.cuda() def setup_dataset(self): """Set up the multimodal dataset with text conversation conversion.""" @@ -398,7 +397,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Convert to numpy for caching image_features = image_features.cpu().detach().numpy() - return {"input_ids": inputs["input_ids"].squeeze(0), "image_features": image_features} + return {"input_ids": inputs["input_ids"], "image_features": image_features} class GemmaEmbeddingDataset(BaseGemmaDataset): diff --git a/examples/gemma3/qnn/gemma-3-4b.ipynb b/examples/gemma3/qnn/gemma-3-4b.ipynb new file mode 100755 index 000000000..8203288a3 --- /dev/null +++ b/examples/gemma3/qnn/gemma-3-4b.ipynb @@ -0,0 +1,344 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Gemma 3 4B QNN model conversion with Olive \n", + "### Task: Text + Vision Generation 📝\n", + "\n", + "In this notebook, you'll:\n", + "- Download the required datasets\n", + "- Convert LLM to QNN format\n", + "- Convert Vision to QNN format\n", + "- Convert Embedding layer with image to QNN format\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🐍 Python Virtual environments\n", + "Creates Olive and QNN python virtual environments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import venv\n", + "from pathlib import Path\n", + "import subprocess\n", + "import json\n", + "import shutil\n", + "import urllib.request\n", + "import onnx\n", + "from onnx import helper, TensorProto\n", + "\n", + "current_dir = os.getcwd()\n", + "MODEL=\"google/gemma-3-4b-it\"\n", + "OLIVE_PYTHON_PATH = './olive_venv'\n", + "OLIVE_PYTHON_BIN = './olive_venv/bin/python'\n", + "olive_pip_path = Path(OLIVE_PYTHON_PATH) / \"bin\" / \"pip\"\n", + "OLIVE_REPO_PATH = Path(\"../../../\")\n", + "OLIVE_REQ = \"../requirements.txt\"\n", + "QNN_REQ = \"./qnn_req.txt\"\n", + "\n", + "QNN_PYTHON_PATH = './qnn_venv'\n", + "QNN_PYTHON_BIN_PATH = './qnn_venv/bin'\n", + "qnn_pip_path = Path(QNN_PYTHON_PATH) / \"bin\" / \"pip\"\n", + "QNN_PYTHON_BIN_FULL_PATH = f\"{current_dir}/{QNN_PYTHON_BIN_PATH}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Olive Python Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if not os.path.exists(OLIVE_PYTHON_PATH):\n", + " print(\"Creating Olive Venv\")\n", + " builder = venv.EnvBuilder(with_pip=True)\n", + " builder.create(Path(OLIVE_PYTHON_PATH))\n", + "my_env = os.environ.copy()\n", + "my_env[\"BUILD_CUDA_EXT\"] = \"0\"\n", + 
"GPTQ=\"git+https://github.com/ModelCloud/GPTQModel.git\"\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"-U\", \"-r\" , OLIVE_REQ], env=my_env)\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"--no-build-isolation\", GPTQ], env=my_env)\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare QNN Python Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if not os.path.exists(QNN_PYTHON_PATH):\n", + " print(\"Creating QNN Venv\")\n", + " builder = venv.EnvBuilder(with_pip=True)\n", + " builder.create(Path(QNN_PYTHON_PATH))\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"--no-build-isolation\", \"-r\" , QNN_REQ], env=my_env)\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"-U\", \"--pre\", \"--extra-index-url\",\n", + " \"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple\",\n", + " \"onnxruntime-qnn==1.23.0.dev20250716009\", \"--no-deps\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 🤗 Login to Hugging Face\n", + "To access models, you'll need to log-in to Hugging Face with a [user access token](https://huggingface.co/docs/hub/security-tokens). The following command will run you through the steps to login:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli login --token <>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Apply few patches to Onnxruntime\n", + "\n", + "This is needed for running the Olive recipies for this model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "base_url = \"https://raw.githubusercontent.com/CodeLinaro/onnxruntime/326d9d30129bbad698e0306d24dcea0ec5a19e60\"\n", + "urls = [\n", + " base_url + \"/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py\",\n", + " base_url + \"/onnxruntime/python/tools/quantization/quant_utils.py\"\n", + "]\n", + "\n", + "destinations = [\n", + " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/execution_providers/qnn/quant_config.py\",\n", + " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/quant_utils.py\"\n", + "]\n", + "\n", + "for url, dest in zip(urls, destinations):\n", + " urllib.request.urlretrieve(url, dest)\n", + " print(f\"Downloaded and replaced: {dest}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Olive Recipes" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**GPU utilization observed during the run**\n", + "\n", + "\t\ta. Text GPTQModel quantization: 12gb\n", + "\t\tb. Text Onnx static quantization: 41gb\n", + "\t\tc. Vision Onnx static quantization: 68gb\n", + " d. 
Embedding Onnx static quantization: 3gb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1️⃣ LLM model generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config_path = Path(f\"./gemma3-4b-text-qnn-config.json\")\n", + "with open(config_path, \"r\") as file:\n", + " data = json.load(file)\n", + "\n", + "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", + "data[\"input_model\"][\"model_path\"] = MODEL\n", + "\n", + "with open(config_path, \"w\") as file:\n", + " json.dump(data, file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-text-qnn-config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2️⃣ Vision model Quantization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config_path = Path(f\"./gemma3-4b-vision-qnn-config.json\")\n", + "with open(config_path, \"r\") as file:\n", + " data = json.load(file)\n", + "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", + "\n", + "with open(config_path, \"w\") as file:\n", + " json.dump(data, file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-vision-qnn-config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3️⃣ Embedding Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-embedding-qnn-config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Keep output of the embedding model as uint16 instead of float" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = onnx.load(\"./models/gemma-3-4b-it-embed/model/model.onnx\")\n", + "graph = model.graph\n", + "\n", + "last_node = graph.node[-1]\n", + "graph.node.remove(last_node)\n", + "previous_node_output = graph.node[-1].output[0]\n", + "new_output = helper.make_tensor_value_info(\n", + " name=previous_node_output,\n", + " elem_type=TensorProto.UINT16,\n", + " shape=[\"batch_size\", \"seq_length\", 2560]\n", + ")\n", + "graph.output.remove(graph.output[0])\n", + "graph.output.extend([new_output])\n", + "onnx.save(model, \"./models/gemma-3-4b-it-embed/model/embeddings_with_image.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare final ORT GenAI folder for on-device inference " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cp ./models/gemma-3-4b-it-embed/model/embeddings_with_image.onnx ./models/gemma3_qnn/model/\n", + "!cp ./models/gemma-3-4b-it-vision/model/model_ctx.onnx ./models/gemma3_qnn/model/model_ctx_vision.onnx \n", + "!cp ./models/gemma-3-4b-it-vision/model/model_ctx_qnn.bin ./models/gemma3_qnn/model/model_ctx_qnn.bin \n", + "!cp ./genai/*.* ./models/gemma3_qnn/model/\n", + "\n", + "print(\"ORT GenAI inference setup: ./models/gemma3_qnn\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json index 8a70e359a..1c5b7f626 100644 --- a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json @@ -4,33 +4,75 @@ "model_script": "custom_gemma3_4b_embedding.py", "model_loader": "load_gemma3_embedding_model", "io_config": { - "input_names": [ "input_ids", "image_features" ], - "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], - "input_types": [ "int32", "float32" ], - "output_names": [ "/model/embed_tokens/Mul/output_0" ], - "output_shapes": [ [ 1, 64, 2560 ] ], + "input_names": [ + "input_ids", + "image_features" + ], + "input_shapes": [ + [ + 1, + 64 + ], + [ + 1, + 256, + 2560 + ] + ], + "input_types": [ + "int64", + "float32" + ], + "output_names": [ + "/model/embed_tokens/Mul/output_0" + ], + "output_shapes": [ + [ + 1, + 64, + 2560 + ] + ], "dynamic_axes": { - "input_ids": { "0": "batch_size", "1": "seq_length" }, - "image_features": { "0": "batch_size" } + "input_ids": { + "0": "batch_size", + "1": "seq_length" + }, + "image_features": { + "0": "batch_size", + "1": "image_tokens_length" + } } } }, "systems": { - "qnn_system": { - "type": "PythonEnvironment", - "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", - "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "cpu", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] } }, "data_configs": [ { "name": "gemma_embedding_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_embedding_input_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { + "type": "gemma_embedding_input_dataset", + "model_id": "google/gemma-3-4b-it" + } } ], "passes": { - "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "conversion": { + "type": "OnnxConversion", + "target_opset": 20 + }, "quantization": { "type": "OnnxStaticQuantization", "quant_preprocess": false, @@ -38,15 +80,20 @@ "activation_type": "uint16", "precision": "uint8", "calibrate_method": "MinMax", - "calibration_providers": [ "CUDAExecutionProvider" ], + "calibration_providers": [ + "CUDAExecutionProvider" + ], "per_channel": true, "weight_symmetric": true }, - "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } + "add_metadata": { + "type": "AddOliveMetadata", + "graph_name": "gemma-3-4b-it-embedding" + } }, - "target": "qnn_system", + "target": "local_system", "log_severity_level": 1, "output_dir": "models/gemma-3-4b-it-embed", "cache_dir": "cache-embd", "no_artifacts": true -} +} \ No newline at end of file diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 1cad472ab..672cd0263 100644 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -1,21 
+1,56 @@ { - "input_model": { "type": "HfModel", "model_path": "google/gemma-3-4b-it" }, + "input_model": { + "type": "HfModel", + "model_path": "google/gemma-3-4b-it", + "custom_task_class_name": "Gemma3ForCausalLM", + "custom_task_class_module": "transformers" + }, "systems": { "qnn_system": { "type": "PythonEnvironment", - "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", - "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] + "python_environment_path": "", + "accelerators": [ + { + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] } }, "data_configs": [ { - "name": "gemma_embedding_data_config", - "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_embedding_dataset", "model_id": "google/gemma-3-4b-it" } + "name": "wikitext2_train_joined", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "join", + "add_special_tokens": false, + "max_seq_len": 4096, + "max_samples": 256 + } + }, + { + "name": "wikitext2_train_act", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "line-by-line", + "add_special_tokens": true, + "max_samples": 256, + "max_seq_len": 2048 + } } ], "passes": { - "q": { "type": "QuaRot", "device": "cpu" }, "g": { "type": "GptqModel", "bits": 4, @@ -23,59 +58,93 @@ "group_size": -1, "lm_head": false, "device": "cuda", - "data_config": "gemma_embedding_data_config" + "data_config": "wikitext2_train_joined" + }, + "cs": { + "type": "CaptureSplitInfo", + "num_splits": 2, + "unique_embeds_lm_head_splits": true }, - "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true }, "mb": { "type": "ModelBuilder", "precision": "int4", "int4_block_size": 32, "int4_accuracy_level": 4, - "int4_op_types_to_quantize": [ "MatMul", "Gather" ] + "int4_op_types_to_quantize": [ + "MatMul", + "Gather" + ] }, "mq": { "type": "MatMulNBitsToQDQ", "use_int4": true, "add_zero_point": true, - "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ], + "nodes_to_exclude": [ + "/lm_head/MatMul_Q4" + ], "save_as_external_data": true }, "gs": { "type": "GraphSurgeries", "surgeries": [ - { "surgeon": "RemoveRopeMultiCache" }, - { "surgeon": "AttentionMaskToSequenceLengths" }, - { "surgeon": "SimplifiedLayerNormToL2Norm" } + { + "surgeon": "RemoveRopeMultiCache" + }, + { + "surgeon": "AttentionMaskToSequenceLengths" + }, + { + "surgeon": "SimplifiedLayerNormToL2Norm" + } ], "save_as_external_data": true }, "sq": { "type": "OnnxStaticQuantization", - "data_config": "gemma_embedding_data_config", + "data_config": "wikitext2_train_act", "activation_type": "uint16", "precision": "uint8", - "calibration_providers": [ "CUDAExecutionProvider" ], + "calibration_providers": [ + "CUDAExecutionProvider" + ], "quant_preprocess": true, - "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ], + "op_types_to_exclude": [ + "GatherBlockQuantized", + "GroupQueryAttention", + "MatMulNBits" + ], "save_as_external_data": true }, - "sp": { "type": "SplitModel" }, - "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 }, + "sp": { + "type": "SplitModel" + }, + "st": { + "type": "StaticLLM", + "batch_size": 1, + "context_length": 64 + }, "cb": { "type": "EPContextBinaryGenerator", 
"provider_options": { "htp_performance_mode": "burst", "htp_graph_finalization_optimization_mode": "3", + "vtcm_mb": "8", + "htp_arch": "v73", "soc_model": "60" }, - "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 }, + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1 + }, "weight_sharing": true }, - "cp": { "type": "ComposeOnnxModels" } + "cp": { + "type": "ComposeOnnxModels" + } }, "target": "qnn_system", - "log_severity_level": 1, - "output_dir": "models/gemma-3-4b-it-text", + "log_severity_level": 0, + "output_dir": "models/gemma3_qnn", "cache_dir": "cache", "no_artifacts": true -} +} \ No newline at end of file diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index fe5328c6e..e252381ab 100644 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -4,30 +4,68 @@ "model_script": "custom_gemma3_4b_vision.py", "model_loader": "load_gemma3_vision_model", "io_config": { - "input_names": [ "pixel_values" ], - "input_shapes": [ [ 1, 3, 896, 896 ] ], - "input_types": [ "float32" ], - "output_names": [ "image_features" ], - "output_shapes": [ [ 1, 256, 2560 ] ] + "input_names": [ + "pixel_values" + ], + "input_shapes": [ + [ + 1, + 3, + 896, + 896 + ] + ], + "input_types": [ + "float32" + ], + "output_names": [ + "image_features" + ], + "output_shapes": [ + [ + 1, + 256, + 2560 + ] + ] } }, "systems": { "qnn_system": { "type": "PythonEnvironment", - "python_environment_path": "/local/mnt2/workspace/kromero/olive/olive-venv/bin", - "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] + "python_environment_path": "", + "accelerators": [ + { + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] } }, "data_configs": [ { "name": "gemma_vision_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_image_dataset", "model_id": "google/gemma-3-4b-it" } + "load_dataset_config": { + "type": "gemma_image_dataset", + "model_id": "google/gemma-3-4b-it" + } } ], "passes": { - "conversion": { "type": "OnnxConversion", "target_opset": 20 }, - "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] }, + "conversion": { + "type": "OnnxConversion", + "target_opset": 20 + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "MatMulAddToGemm" + } + ] + }, "quantization": { "type": "OnnxStaticQuantization", "quant_preprocess": true, @@ -35,21 +73,30 @@ "activation_type": "uint16", "precision": "uint8", "calibrate_method": "MinMax", + "calibration_providers": [ + "CUDAExecutionProvider" + ], "per_channel": true, "weight_symmetric": true }, "cb": { "type": "EPContextBinaryGenerator", "provider_options": { + "htp_performance_mode": "burst", "htp_graph_finalization_optimization_mode": "3", - "offload_graph_io_quantization": "0" + "vtcm_mb": "8", + "htp_arch": "v73", + "soc_model": "60" } }, - "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } + "add_metadata": { + "type": "AddOliveMetadata", + "graph_name": "gemma-3-4b-it-vision" + } }, "target": "qnn_system", "log_severity_level": 1, "output_dir": "models/gemma-3-4b-it-vision", - "cache_dir": "cache", + "cache_dir": "cache-vision", "no_artifacts": true -} +} \ No newline at end of file diff --git a/examples/gemma3/qnn/app.py b/examples/gemma3/qnn/genai/app.py similarity index 100% rename from 
examples/gemma3/qnn/app.py rename to examples/gemma3/qnn/genai/app.py diff --git a/examples/gemma3/qnn/genai_config.json b/examples/gemma3/qnn/genai/genai_config.json old mode 100644 new mode 100755 similarity index 92% rename from examples/gemma3/qnn/genai_config.json rename to examples/gemma3/qnn/genai/genai_config.json index d1185aa08..a835fb863 --- a/examples/gemma3/qnn/genai_config.json +++ b/examples/gemma3/qnn/genai/genai_config.json @@ -6,19 +6,13 @@ "session_options": { "log_id": "onnxruntime-genai", "provider_options": [ - { - "qnn": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "soc_model": "60" - } - } ] }, "head_size": 256, "hidden_size": 2560, "inputs": { - "input_ids": "inputs_embeds", + "input_ids":"input_ids", + "inputs_embeds": "inputs_embeds", "attention_mask": "attention_mask", "past_key_names": "past_key_values.%d.key", "past_value_names": "past_key_values.%d.value", @@ -42,10 +36,20 @@ }, "pipeline": [ { + "embeddings": { + "filename": "embeddings.onnx", + "inputs": [ + "input_ids" + ], + "outputs": [ + "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" + ], + "run_on_prompt": false + }, "context_ctx": { "filename": "context_ctx.onnx", "inputs": [ - "inputs_embeds", + "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output", "past_key_values.0.key", "past_key_values.0.value", "past_seq_len", @@ -206,7 +210,7 @@ "iterator_ctx": { "filename": "iterator_ctx.onnx", "inputs": [ - "inputs_embeds", + "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output", "past_key_values.0.key", "past_key_values.0.value", "past_seq_len", @@ -369,22 +373,49 @@ "inputs": [ "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" ], - "outputs": [ "logits" ] + "outputs": [ + "logits" + ] } } ] }, "embedding": { - "filename": "embeddings_combined.onnx", - "inputs": { "input_ids": "input_ids", "image_features": "image_features" }, - "outputs": { "inputs_embeds": "inputs_embeds" } + "filename": "embeddings_with_image.onnx", + "inputs": { + "input_ids": "input_ids", + "image_features": "image_features" + }, + "outputs": { + "inputs_embeds": "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" + } }, "vision": { - "filename": "model_ctx.onnx", - "inputs": { "pixel_values": "pixel_values" }, - "outputs": { "image_features": "image_features" } + "filename": "model_ctx_vision.onnx", + "inputs": { + "pixel_values": "pixel_values" + }, + "outputs": { + "image_features": "image_features" + }, + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1, + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + } }, - "eos_token_id": [ 1, 106 ], + "eos_token_id": [ + 1, + 106 + ], "pad_token_id": 0, "type": "gemma3", "vocab_size": 262208 @@ -405,4 +436,4 @@ "top_k": 64, "top_p": 0.95 } -} +} \ No newline at end of file diff --git a/examples/gemma3/qnn/genai/processor_config.json b/examples/gemma3/qnn/genai/processor_config.json new file mode 100755 index 000000000..d1c66b6ce --- /dev/null +++ b/examples/gemma3/qnn/genai/processor_config.json @@ -0,0 +1,53 @@ +{ + "processor": { + "name": "gemma_3_image_processing", + "transforms": [ + { + "operation": { + "name": "decode_image", + "type": "DecodeImage", + "attrs": { + "color_space": "RGB" + } + } + }, + { + "operation": { + "name": "resize", + "type": "Resize", + "attrs": { + "interpolation": "CUBIC", + "width": 896, + "height": 896, + 
"keep_aspect_ratio": 0 + } + } + }, + { + "operation": { + "name": "re-scale", + "type": "Rescale" + } + }, + { + "operation": { + "name": "normalize", + "type": "Normalize", + "attrs": { + "mean": [0.5, 0.5, 0.5], + "std": [0.5, 0.5, 0.5] + } + } + }, + { + "operation": { + "name": "to_channel_first", + "type": "Permute3D", + "attrs": { + "dims": [2, 0, 1] + } + } + } + ] + } +} \ No newline at end of file diff --git a/examples/gemma3/qnn/olive_req.txt b/examples/gemma3/qnn/olive_req.txt deleted file mode 100755 index 8923fbfa7..000000000 --- a/examples/gemma3/qnn/olive_req.txt +++ /dev/null @@ -1,7 +0,0 @@ -transformers -datasets -optimum -onnxruntime-gpu==1.22.0 -onnxruntime-genai-cuda==0.9.0 -setuptools -tabulate \ No newline at end of file diff --git a/examples/gemma3/qnn/qnn_req.txt b/examples/gemma3/qnn/qnn_req.txt new file mode 100755 index 000000000..3cabc5919 --- /dev/null +++ b/examples/gemma3/qnn/qnn_req.txt @@ -0,0 +1,7 @@ +coloredlogs +flatbuffers +numpy >= 1.21.6 +packaging +protobuf +sympy + diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt index 0b56b7908..8923fbfa7 100644 --- a/examples/gemma3/requirements.txt +++ b/examples/gemma3/requirements.txt @@ -1,5 +1,7 @@ +transformers datasets -onnxruntime-genai-cuda==0.7.1 -onnxruntime-gpu==1.21.1 optimum -transformers +onnxruntime-gpu==1.22.0 +onnxruntime-genai-cuda==0.9.0 +setuptools +tabulate \ No newline at end of file diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py index 9a98ff0e3..752bf4990 100644 --- a/olive/common/hf/utils.py +++ b/olive/common/hf/utils.py @@ -11,6 +11,7 @@ from olive.common.hf.mappings import TASK_TO_PEFT_TASK_TYPE from olive.common.hf.mlflow import get_pretrained_name_or_path from olive.common.utils import hardlink_copy_file +import importlib if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -18,7 +19,7 @@ logger = logging.getLogger(__name__) -def load_model_from_task(task: str, model_name_or_path: str, **kwargs) -> "PreTrainedModel": +def load_model_from_task(task: str, model_name_or_path: str, custom_task_class_name:str = None, custom_task_class_module:str = None, **kwargs) -> "PreTrainedModel": """Load huggingface model from task and model_name_or_path.""" from transformers.pipelines import check_task @@ -55,7 +56,12 @@ def load_model_from_task(task: str, model_name_or_path: str, **kwargs) -> "PreTr AUTO_QUANTIZATION_CONFIG_MAPPING["olive"] = OliveHfQuantizationConfig AUTO_QUANTIZER_MAPPING["olive"] = OliveHfQuantizer - class_tuple = targeted_task["pt"] or (AutoModel,) + if (custom_task_class_module is not None and custom_task_class_name is not None): + module = importlib.import_module(custom_task_class_module) + class_tuple = (getattr(module, custom_task_class_name),) + else: + class_tuple = targeted_task["pt"] or (AutoModel,) + print("class_tuple", class_tuple) model = None for i, model_class in enumerate(class_tuple): try: diff --git a/olive/model/handler/hf.py b/olive/model/handler/hf.py index bf46d7417..396a36d2a 100644 --- a/olive/model/handler/hf.py +++ b/olive/model/handler/hf.py @@ -28,7 +28,7 @@ @model_handler_registry("HFModel") class HfModelHandler(PyTorchModelHandlerBase, MLFlowTransformersMixin, HfMixin): # pylint: disable=too-many-ancestors resource_keys: tuple[str, ...] = ("model_path", "adapter_path") - json_config_keys: tuple[str, ...] = ("task", "load_kwargs") + json_config_keys: tuple[str, ...] 
= ("task", "load_kwargs", "custom_task_class_name", "custom_task_class_module") def __init__( self, @@ -38,6 +38,8 @@ def __init__( io_config: Union[dict[str, Any], IoConfig, str] = None, adapter_path: OLIVE_RESOURCE_ANNOTATIONS = None, model_attributes: Optional[dict[str, Any]] = None, + custom_task_class_name: str = None, + custom_task_class_module: str = None ): super().__init__( model_file_format=None, @@ -47,6 +49,8 @@ def __init__( ) self.add_resources(locals()) self.task = task + self.custom_task_class_name = custom_task_class_name + self.custom_task_class_module = custom_task_class_module self.load_kwargs = validate_config(load_kwargs, HfLoadKwargs, warn_unused_keys=False) if load_kwargs else None self.model_attributes = {**self.get_hf_model_config().to_dict(), **(self.model_attributes or {})} @@ -72,7 +76,7 @@ def load_model(self, rank: int = None, cache_model: bool = True) -> "torch.nn.Mo if self.model: model = self.model else: - model = load_model_from_task(self.task, self.model_path, **self.get_load_kwargs()) + model = load_model_from_task(self.task, self.model_path, self.custom_task_class_name, self.custom_task_class_module, **self.get_load_kwargs()) # we only have peft adapters for now if self.adapter_path: From 14018ee2ec48f4d3bb4c7efc892386adc3f06952 Mon Sep 17 00:00:00 2001 From: Alahari Prudhvi Akhil Date: Fri, 5 Sep 2025 02:53:17 -0700 Subject: [PATCH 19/24] Update few python packages --- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 2 +- examples/gemma3/qnn/gemma-3-4b.ipynb | 33 +++++++++++++------ .../qnn/gemma3-4b-embedding-qnn-config.json | 0 .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 0 .../qnn/gemma3-4b-vision-qnn-config.json | 0 examples/gemma3/qnn/qnn_req.txt | 2 +- examples/gemma3/requirements.txt | 5 ++- 7 files changed, 29 insertions(+), 13 deletions(-) mode change 100644 => 100755 examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json mode change 100644 => 100755 examples/gemma3/qnn/gemma3-4b-text-qnn-config.json mode change 100644 => 100755 examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 987297cb8..03f77fea0 100644 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -495,7 +495,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Remove this when submitting for review -SHORTCUT_FIRST_N = 25 +SHORTCUT_FIRST_N = 200 @Registry.register_dataset() diff --git a/examples/gemma3/qnn/gemma-3-4b.ipynb b/examples/gemma3/qnn/gemma-3-4b.ipynb index 8203288a3..dbf127b44 100755 --- a/examples/gemma3/qnn/gemma-3-4b.ipynb +++ b/examples/gemma3/qnn/gemma-3-4b.ipynb @@ -37,6 +37,7 @@ "import urllib.request\n", "import onnx\n", "from onnx import helper, TensorProto\n", + "import glob\n", "\n", "current_dir = os.getcwd()\n", "MODEL=\"google/gemma-3-4b-it\"\n", @@ -103,7 +104,7 @@ "subprocess.check_call([str(qnn_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])\n", "subprocess.check_call([str(qnn_pip_path), \"install\", \"-U\", \"--pre\", \"--extra-index-url\",\n", " \"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple\",\n", - " \"onnxruntime-qnn==1.23.0.dev20250716009\", \"--no-deps\"])" + " \"onnxruntime-qnn==1.23.0.dev20250815002\", \"--no-deps\"])" ] }, { @@ -174,6 +175,27 @@ " d. 
Embedding Onnx static quantization: 3gb" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clean Context binary directories if they exist\n", + "def clean_directory(path):\n", + " if os.path.exists(path):\n", + " for file in glob.glob(os.path.join(path, '*')):\n", + " if os.path.isfile(file):\n", + " os.remove(file)\n", + "dirs_to_clean = [\n", + " './models/gemma3_qnn/model/',\n", + " './models/gemma-3-4b-it-vision/model/'\n", + "]\n", + "\n", + "for dir_path in dirs_to_clean:\n", + " clean_directory(dir_path)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -309,15 +331,6 @@ "\n", "print(\"ORT GenAI inference setup: ./models/gemma3_qnn\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n" - ] } ], "metadata": { diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json old mode 100644 new mode 100755 diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json old mode 100644 new mode 100755 diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json old mode 100644 new mode 100755 diff --git a/examples/gemma3/qnn/qnn_req.txt b/examples/gemma3/qnn/qnn_req.txt index 3cabc5919..05c845791 100755 --- a/examples/gemma3/qnn/qnn_req.txt +++ b/examples/gemma3/qnn/qnn_req.txt @@ -4,4 +4,4 @@ numpy >= 1.21.6 packaging protobuf sympy - +transformers==4.55.2 diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt index 8923fbfa7..3d365f2bb 100644 --- a/examples/gemma3/requirements.txt +++ b/examples/gemma3/requirements.txt @@ -4,4 +4,7 @@ optimum onnxruntime-gpu==1.22.0 onnxruntime-genai-cuda==0.9.0 setuptools -tabulate \ No newline at end of file +tabulate +onnx==1.18.0 +onnx-ir==0.1.4 +onnxscript==0.3.2 \ No newline at end of file From 1f892410860f53d5009606fb221cc22fe965e865 Mon Sep 17 00:00:00 2001 From: Alahari Prudhvi Akhil Date: Mon, 8 Sep 2025 09:22:22 -0700 Subject: [PATCH 20/24] Use the same llava dataset for text model as well This fixes the issue of text model repeating words in the output. 
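A quick way to spot-check the shared calibration data before launching the text-model run is a small sketch along these lines (a sketch only, not part of the recipe: it assumes custom_gemma3_4b_datasets.py is importable from examples/gemma3/qnn, that the Hugging Face login and the dataset/image downloads the script performs succeed, and the sample count and printed fields are purely illustrative):

    # Sketch: inspect a couple of entries of the LLaVA-style dataset that the
    # text config now points at for GPTQ and static-quantization calibration.
    from custom_gemma3_4b_datasets import GemmaMultimodalDataset

    # first_n=4 keeps the slice tiny for a quick inspection run.
    dataset = GemmaMultimodalDataset("google/gemma-3-4b-it", first_n=4).get_dataset()

    sample = dataset[0]  # triggers the on-the-fly chat-template transform
    for name, value in sample.items():
        shape = getattr(value, "shape", None)
        print(name, shape if shape is not None else type(value))

If the printed input_ids/attention_mask entries look reasonable, the same registered dataset can be reused by gemma3-4b-text-qnn-config.json as in the change below.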
--- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 6 ++-- examples/gemma3/qnn/gemma-3-4b.ipynb | 11 +++++- .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 34 ++++--------------- 3 files changed, 18 insertions(+), 33 deletions(-) mode change 100644 => 100755 examples/gemma3/qnn/custom_gemma3_4b_datasets.py diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py old mode 100644 new mode 100755 index 03f77fea0..71410af4f --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -257,8 +257,6 @@ def _process_dataset_entry(self, entry: dict[str, any]): inputs = self.processor.apply_chat_template( entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True ) - inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} - inputs["input_ids"] = inputs["input_ids"][0] return inputs @@ -495,13 +493,13 @@ def _process_dataset_entry(self, entry: dict[str, any]): # Remove this when submitting for review +TEXT_SHORTCUT_FIRST_N = 600 SHORTCUT_FIRST_N = 200 - @Registry.register_dataset() def gemma_dataset(model_id: str): """Full E2E Gemma 3 multi-modal dataset (image + text).""" - return GemmaMultimodalDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + return GemmaMultimodalDataset(model_id, first_n=TEXT_SHORTCUT_FIRST_N).get_dataset() @Registry.register_dataset() diff --git a/examples/gemma3/qnn/gemma-3-4b.ipynb b/examples/gemma3/qnn/gemma-3-4b.ipynb index dbf127b44..7b36c9cf5 100755 --- a/examples/gemma3/qnn/gemma-3-4b.ipynb +++ b/examples/gemma3/qnn/gemma-3-4b.ipynb @@ -189,7 +189,8 @@ " os.remove(file)\n", "dirs_to_clean = [\n", " './models/gemma3_qnn/model/',\n", - " './models/gemma-3-4b-it-vision/model/'\n", + " './models/gemma-3-4b-it-vision/model/',\n", + " './models/gemma-3-4b-it-embed/model/'\n", "]\n", "\n", "for dir_path in dirs_to_clean:\n", @@ -328,9 +329,17 @@ "!cp ./models/gemma-3-4b-it-vision/model/model_ctx.onnx ./models/gemma3_qnn/model/model_ctx_vision.onnx \n", "!cp ./models/gemma-3-4b-it-vision/model/model_ctx_qnn.bin ./models/gemma3_qnn/model/model_ctx_qnn.bin \n", "!cp ./genai/*.* ./models/gemma3_qnn/model/\n", + "!ls -al ./models/gemma3_qnn/model/\n", "\n", "print(\"ORT GenAI inference setup: ./models/gemma3_qnn\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 672cd0263..06eb58078 100755 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -20,33 +20,11 @@ }, "data_configs": [ { - "name": "wikitext2_train_joined", - "type": "HuggingfaceContainer", + "name": "gemma_data_config", + "user_script": "custom_gemma3_4b_datasets.py", "load_dataset_config": { - "data_name": "wikitext", - "subset": "wikitext-2-raw-v1", - "split": "train" - }, - "pre_process_data_config": { - "strategy": "join", - "add_special_tokens": false, - "max_seq_len": 4096, - "max_samples": 256 - } - }, - { - "name": "wikitext2_train_act", - "type": "HuggingfaceContainer", - "load_dataset_config": { - "data_name": "wikitext", - "subset": "wikitext-2-raw-v1", - "split": "train" - }, - "pre_process_data_config": { - "strategy": "line-by-line", - "add_special_tokens": true, - "max_samples": 256, - "max_seq_len": 2048 + "type": "gemma_dataset", + "model_id": "google/gemma-3-4b-it" } } ], @@ -58,7 +36,7 
@@ "group_size": -1, "lm_head": false, "device": "cuda", - "data_config": "wikitext2_train_joined" + "data_config": "gemma_data_config" }, "cs": { "type": "CaptureSplitInfo", @@ -101,7 +79,7 @@ }, "sq": { "type": "OnnxStaticQuantization", - "data_config": "wikitext2_train_act", + "data_config": "gemma_data_config", "activation_type": "uint16", "precision": "uint8", "calibration_providers": [ From 7d4ced80df73118194263079c92a04c072608c94 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Mon, 8 Sep 2025 17:19:11 -0700 Subject: [PATCH 21/24] Minor cleanup --- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 4 +- .../qnn/gemma3-4b-embedding-qnn-config.json | 74 ++++-------------- .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 71 ++++-------------- .../qnn/gemma3-4b-vision-qnn-config.json | 67 +++-------------- examples/gemma3/qnn/genai/app.py | 29 +++---- examples/gemma3/qnn/genai/genai_config.json | 44 +++-------- .../gemma3/qnn/genai/processor_config.json | 75 ++++++------------- examples/gemma3/requirements.txt | 12 +-- olive/common/hf/utils.py | 12 ++- olive/model/handler/hf.py | 10 ++- 10 files changed, 110 insertions(+), 288 deletions(-) diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py index 71410af4f..77751530b 100755 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py @@ -254,10 +254,9 @@ def _process_dataset_entry(self, entry: dict[str, any]): Tokenized inputs ready for model processing """ - inputs = self.processor.apply_chat_template( + return self.processor.apply_chat_template( entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True ) - return inputs class GemmaTextOnlyDataset(BaseGemmaDataset): @@ -496,6 +495,7 @@ def _process_dataset_entry(self, entry: dict[str, any]): TEXT_SHORTCUT_FIRST_N = 600 SHORTCUT_FIRST_N = 200 + @Registry.register_dataset() def gemma_dataset(model_id: str): """Full E2E Gemma 3 multi-modal dataset (image + text).""" diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json index 1c5b7f626..360f0e2bb 100755 --- a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json @@ -4,75 +4,32 @@ "model_script": "custom_gemma3_4b_embedding.py", "model_loader": "load_gemma3_embedding_model", "io_config": { - "input_names": [ - "input_ids", - "image_features" - ], - "input_shapes": [ - [ - 1, - 64 - ], - [ - 1, - 256, - 2560 - ] - ], - "input_types": [ - "int64", - "float32" - ], - "output_names": [ - "/model/embed_tokens/Mul/output_0" - ], - "output_shapes": [ - [ - 1, - 64, - 2560 - ] - ], + "input_names": [ "input_ids", "image_features" ], + "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], + "input_types": [ "int64", "float32" ], + "output_names": [ "/model/embed_tokens/Mul/output_0" ], + "output_shapes": [ [ 1, 64, 2560 ] ], "dynamic_axes": { - "input_ids": { - "0": "batch_size", - "1": "seq_length" - }, - "image_features": { - "0": "batch_size", - "1": "image_tokens_length" - } + "input_ids": { "0": "batch_size", "1": "seq_length" }, + "image_features": { "0": "batch_size", "1": "image_tokens_length" } } } }, "systems": { "local_system": { "type": "LocalSystem", - "accelerators": [ - { - "device": "cpu", - "execution_providers": [ - "CPUExecutionProvider" - ] - } - ] + "accelerators": [ { "device": "cpu", "execution_providers": [ "CPUExecutionProvider" ] } ] } }, 
"data_configs": [ { "name": "gemma_embedding_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { - "type": "gemma_embedding_input_dataset", - "model_id": "google/gemma-3-4b-it" - } + "load_dataset_config": { "type": "gemma_embedding_input_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { - "conversion": { - "type": "OnnxConversion", - "target_opset": 20 - }, + "conversion": { "type": "OnnxConversion", "target_opset": 20 }, "quantization": { "type": "OnnxStaticQuantization", "quant_preprocess": false, @@ -80,20 +37,15 @@ "activation_type": "uint16", "precision": "uint8", "calibrate_method": "MinMax", - "calibration_providers": [ - "CUDAExecutionProvider" - ], + "calibration_providers": [ "CUDAExecutionProvider" ], "per_channel": true, "weight_symmetric": true }, - "add_metadata": { - "type": "AddOliveMetadata", - "graph_name": "gemma-3-4b-it-embedding" - } + "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-embedding" } }, "target": "local_system", "log_severity_level": 1, "output_dir": "models/gemma-3-4b-it-embed", "cache_dir": "cache-embd", "no_artifacts": true -} \ No newline at end of file +} diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json index 06eb58078..12fc5c8dc 100755 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json @@ -9,23 +9,14 @@ "qnn_system": { "type": "PythonEnvironment", "python_environment_path": "", - "accelerators": [ - { - "execution_providers": [ - "QNNExecutionProvider" - ] - } - ] + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] } }, "data_configs": [ { "name": "gemma_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { - "type": "gemma_dataset", - "model_id": "google/gemma-3-4b-it" - } + "load_dataset_config": { "type": "gemma_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { @@ -38,42 +29,27 @@ "device": "cuda", "data_config": "gemma_data_config" }, - "cs": { - "type": "CaptureSplitInfo", - "num_splits": 2, - "unique_embeds_lm_head_splits": true - }, + "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true }, "mb": { "type": "ModelBuilder", "precision": "int4", "int4_block_size": 32, "int4_accuracy_level": 4, - "int4_op_types_to_quantize": [ - "MatMul", - "Gather" - ] + "int4_op_types_to_quantize": [ "MatMul", "Gather" ] }, "mq": { "type": "MatMulNBitsToQDQ", "use_int4": true, "add_zero_point": true, - "nodes_to_exclude": [ - "/lm_head/MatMul_Q4" - ], + "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ], "save_as_external_data": true }, "gs": { "type": "GraphSurgeries", "surgeries": [ - { - "surgeon": "RemoveRopeMultiCache" - }, - { - "surgeon": "AttentionMaskToSequenceLengths" - }, - { - "surgeon": "SimplifiedLayerNormToL2Norm" - } + { "surgeon": "RemoveRopeMultiCache" }, + { "surgeon": "AttentionMaskToSequenceLengths" }, + { "surgeon": "SimplifiedLayerNormToL2Norm" } ], "save_as_external_data": true }, @@ -82,25 +58,13 @@ "data_config": "gemma_data_config", "activation_type": "uint16", "precision": "uint8", - "calibration_providers": [ - "CUDAExecutionProvider" - ], + "calibration_providers": [ "CUDAExecutionProvider" ], "quant_preprocess": true, - "op_types_to_exclude": [ - "GatherBlockQuantized", - "GroupQueryAttention", - "MatMulNBits" - ], + "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ], 
"save_as_external_data": true }, - "sp": { - "type": "SplitModel" - }, - "st": { - "type": "StaticLLM", - "batch_size": 1, - "context_length": 64 - }, + "sp": { "type": "SplitModel" }, + "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 }, "cb": { "type": "EPContextBinaryGenerator", "provider_options": { @@ -110,19 +74,14 @@ "htp_arch": "v73", "soc_model": "60" }, - "session_options": { - "intra_op_num_threads": 2, - "inter_op_num_threads": 1 - }, + "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 }, "weight_sharing": true }, - "cp": { - "type": "ComposeOnnxModels" - } + "cp": { "type": "ComposeOnnxModels" } }, "target": "qnn_system", "log_severity_level": 0, "output_dir": "models/gemma3_qnn", "cache_dir": "cache", "no_artifacts": true -} \ No newline at end of file +} diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json index e252381ab..b15d6185f 100755 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json @@ -4,68 +4,30 @@ "model_script": "custom_gemma3_4b_vision.py", "model_loader": "load_gemma3_vision_model", "io_config": { - "input_names": [ - "pixel_values" - ], - "input_shapes": [ - [ - 1, - 3, - 896, - 896 - ] - ], - "input_types": [ - "float32" - ], - "output_names": [ - "image_features" - ], - "output_shapes": [ - [ - 1, - 256, - 2560 - ] - ] + "input_names": [ "pixel_values" ], + "input_shapes": [ [ 1, 3, 896, 896 ] ], + "input_types": [ "float32" ], + "output_names": [ "image_features" ], + "output_shapes": [ [ 1, 256, 2560 ] ] } }, "systems": { "qnn_system": { "type": "PythonEnvironment", "python_environment_path": "", - "accelerators": [ - { - "execution_providers": [ - "QNNExecutionProvider" - ] - } - ] + "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] } }, "data_configs": [ { "name": "gemma_vision_data_config", "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { - "type": "gemma_image_dataset", - "model_id": "google/gemma-3-4b-it" - } + "load_dataset_config": { "type": "gemma_image_dataset", "model_id": "google/gemma-3-4b-it" } } ], "passes": { - "conversion": { - "type": "OnnxConversion", - "target_opset": 20 - }, - "surgery": { - "type": "GraphSurgeries", - "surgeries": [ - { - "surgeon": "MatMulAddToGemm" - } - ] - }, + "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] }, "quantization": { "type": "OnnxStaticQuantization", "quant_preprocess": true, @@ -73,9 +35,7 @@ "activation_type": "uint16", "precision": "uint8", "calibrate_method": "MinMax", - "calibration_providers": [ - "CUDAExecutionProvider" - ], + "calibration_providers": [ "CUDAExecutionProvider" ], "per_channel": true, "weight_symmetric": true }, @@ -89,14 +49,11 @@ "soc_model": "60" } }, - "add_metadata": { - "type": "AddOliveMetadata", - "graph_name": "gemma-3-4b-it-vision" - } + "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } }, "target": "qnn_system", "log_severity_level": 1, "output_dir": "models/gemma-3-4b-it-vision", "cache_dir": "cache-vision", "no_artifacts": true -} \ No newline at end of file +} diff --git a/examples/gemma3/qnn/genai/app.py b/examples/gemma3/qnn/genai/app.py index e83d6420f..0b5da39c3 100644 --- a/examples/gemma3/qnn/genai/app.py +++ b/examples/gemma3/qnn/genai/app.py @@ -4,13 +4,14 @@ import argparse import glob import json 
+import logging import os import time from pathlib import Path import onnxruntime_genai as og -# og.set_log_options(enabled=True, model_input_values=True, model_output_values=True) +logger = logging.getLogger(__name__) def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): @@ -30,15 +31,15 @@ def _complete(text, state): def run(args: argparse.Namespace): - print("Loading model...") + logger.info("Loading model...") config = og.Config(args.model_path) if args.execution_provider != "follow_config": config.clear_providers() if args.execution_provider != "cpu": - print(f"Setting model to {args.execution_provider}...") + logger.info(f"Setting model to {args.execution_provider}...") config.append_provider(args.execution_provider) model = og.Model(config) - print("Model loaded") + logger.info("Model loaded") tokenizer = og.Tokenizer(model) processor = model.create_multimodal_processor() @@ -68,16 +69,15 @@ def run(args: argparse.Namespace): image_paths = [str(Path(__file__).parent / "images" / "dog.jpg")] image_paths = [image_path for image_path in image_paths if image_path] - print(image_paths) images = None if len(image_paths) == 0: - print("No image provided") + logger.info("No image provided") else: for i, image_path in enumerate(image_paths): if not os.path.exists(image_path): raise FileNotFoundError(f"Image file not found: {image_path}") - print(f"Using image: {image_path}") + logger.info(f"Using image: {image_path}") images = og.Images.open(*image_paths) @@ -103,18 +103,15 @@ def run(args: argparse.Namespace): # Apply the chat template using the tokenizer message_json = json.dumps(messages) - print(message_json) prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True) - print("Processing images and prompt...") + logger.info("Processing images and prompt...") inputs = processor(prompt, images=images) - print("Generating response...") + logger.info("Generating response...") params = og.GeneratorParams(model) params.set_search_options(max_length=1024) - print(inputs) - generator = og.Generator(model, params) generator.set_inputs(inputs) start_time = time.time() @@ -123,14 +120,10 @@ def run(args: argparse.Namespace): generator.generate_next_token() new_token = generator.get_next_tokens()[0] - print(stream.decode(new_token), end="", flush=True) + logger.info(stream.decode(new_token), end="", flush=True) - print() total_run_time = time.time() - start_time - print(f"Total Time : {total_run_time:.2f}") - - for _ in range(3): - print() + logger.info(f"Total Time : {total_run_time:.2f}") # Delete the generator to free the captured graph before creating another one del generator diff --git a/examples/gemma3/qnn/genai/genai_config.json b/examples/gemma3/qnn/genai/genai_config.json index a835fb863..754b33cd0 100755 --- a/examples/gemma3/qnn/genai/genai_config.json +++ b/examples/gemma3/qnn/genai/genai_config.json @@ -3,15 +3,11 @@ "bos_token_id": 2, "context_length": 131072, "decoder": { - "session_options": { - "log_id": "onnxruntime-genai", - "provider_options": [ - ] - }, + "session_options": { "log_id": "onnxruntime-genai", "provider_options": [ ] }, "head_size": 256, "hidden_size": 2560, "inputs": { - "input_ids":"input_ids", + "input_ids": "input_ids", "inputs_embeds": "inputs_embeds", "attention_mask": "attention_mask", "past_key_names": "past_key_values.%d.key", @@ -38,12 +34,8 @@ { "embeddings": { "filename": "embeddings.onnx", - "inputs": [ - "input_ids" - ], - "outputs": [ - "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" - ], + "inputs": [ 
"input_ids" ], + "outputs": [ "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" ], "run_on_prompt": false }, "context_ctx": { @@ -373,31 +365,20 @@ "inputs": [ "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" ], - "outputs": [ - "logits" - ] + "outputs": [ "logits" ] } } ] }, "embedding": { "filename": "embeddings_with_image.onnx", - "inputs": { - "input_ids": "input_ids", - "image_features": "image_features" - }, - "outputs": { - "inputs_embeds": "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" - } + "inputs": { "input_ids": "input_ids", "image_features": "image_features" }, + "outputs": { "inputs_embeds": "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" } }, "vision": { "filename": "model_ctx_vision.onnx", - "inputs": { - "pixel_values": "pixel_values" - }, - "outputs": { - "image_features": "image_features" - }, + "inputs": { "pixel_values": "pixel_values" }, + "outputs": { "image_features": "image_features" }, "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1, @@ -412,10 +393,7 @@ ] } }, - "eos_token_id": [ - 1, - 106 - ], + "eos_token_id": [ 1, 106 ], "pad_token_id": 0, "type": "gemma3", "vocab_size": 262208 @@ -436,4 +414,4 @@ "top_k": 64, "top_p": 0.95 } -} \ No newline at end of file +} diff --git a/examples/gemma3/qnn/genai/processor_config.json b/examples/gemma3/qnn/genai/processor_config.json index d1c66b6ce..b25059aa2 100755 --- a/examples/gemma3/qnn/genai/processor_config.json +++ b/examples/gemma3/qnn/genai/processor_config.json @@ -1,53 +1,24 @@ { - "processor": { - "name": "gemma_3_image_processing", - "transforms": [ - { - "operation": { - "name": "decode_image", - "type": "DecodeImage", - "attrs": { - "color_space": "RGB" - } - } - }, - { - "operation": { - "name": "resize", - "type": "Resize", - "attrs": { - "interpolation": "CUBIC", - "width": 896, - "height": 896, - "keep_aspect_ratio": 0 - } - } - }, - { - "operation": { - "name": "re-scale", - "type": "Rescale" - } - }, - { - "operation": { - "name": "normalize", - "type": "Normalize", - "attrs": { - "mean": [0.5, 0.5, 0.5], - "std": [0.5, 0.5, 0.5] - } - } - }, - { - "operation": { - "name": "to_channel_first", - "type": "Permute3D", - "attrs": { - "dims": [2, 0, 1] - } - } - } - ] - } -} \ No newline at end of file + "processor": { + "name": "gemma_3_image_processing", + "transforms": [ + { "operation": { "name": "decode_image", "type": "DecodeImage", "attrs": { "color_space": "RGB" } } }, + { + "operation": { + "name": "resize", + "type": "Resize", + "attrs": { "interpolation": "CUBIC", "width": 896, "height": 896, "keep_aspect_ratio": 0 } + } + }, + { "operation": { "name": "re-scale", "type": "Rescale" } }, + { + "operation": { + "name": "normalize", + "type": "Normalize", + "attrs": { "mean": [ 0.5, 0.5, 0.5 ], "std": [ 0.5, 0.5, 0.5 ] } + } + }, + { "operation": { "name": "to_channel_first", "type": "Permute3D", "attrs": { "dims": [ 2, 0, 1 ] } } } + ] + } +} diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt index 3d365f2bb..337d1987d 100644 --- a/examples/gemma3/requirements.txt +++ b/examples/gemma3/requirements.txt @@ -1,10 +1,10 @@ -transformers datasets -optimum -onnxruntime-gpu==1.22.0 +onnx==1.18.0 +onnx-ir==0.1.4 onnxruntime-genai-cuda==0.9.0 +onnxruntime-gpu==1.22.0 +onnxscript==0.3.2 +optimum setuptools tabulate -onnx==1.18.0 -onnx-ir==0.1.4 -onnxscript==0.3.2 \ No newline at end of file +transformers diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py index 
752bf4990..dee79e6e8 100644 --- a/olive/common/hf/utils.py +++ b/olive/common/hf/utils.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +import importlib import logging from pathlib import Path from typing import TYPE_CHECKING, Optional, Union @@ -11,7 +12,6 @@ from olive.common.hf.mappings import TASK_TO_PEFT_TASK_TYPE from olive.common.hf.mlflow import get_pretrained_name_or_path from olive.common.utils import hardlink_copy_file -import importlib if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -19,7 +19,13 @@ logger = logging.getLogger(__name__) -def load_model_from_task(task: str, model_name_or_path: str, custom_task_class_name:str = None, custom_task_class_module:str = None, **kwargs) -> "PreTrainedModel": +def load_model_from_task( + task: str, + model_name_or_path: str, + custom_task_class_name: str = None, + custom_task_class_module: str = None, + **kwargs, +) -> "PreTrainedModel": """Load huggingface model from task and model_name_or_path.""" from transformers.pipelines import check_task @@ -56,7 +62,7 @@ def load_model_from_task(task: str, model_name_or_path: str, custom_task_class_n AUTO_QUANTIZATION_CONFIG_MAPPING["olive"] = OliveHfQuantizationConfig AUTO_QUANTIZER_MAPPING["olive"] = OliveHfQuantizer - if (custom_task_class_module is not None and custom_task_class_name is not None): + if custom_task_class_module is not None and custom_task_class_name is not None: module = importlib.import_module(custom_task_class_module) class_tuple = (getattr(module, custom_task_class_name),) else: diff --git a/olive/model/handler/hf.py b/olive/model/handler/hf.py index 396a36d2a..343c84f77 100644 --- a/olive/model/handler/hf.py +++ b/olive/model/handler/hf.py @@ -39,7 +39,7 @@ def __init__( adapter_path: OLIVE_RESOURCE_ANNOTATIONS = None, model_attributes: Optional[dict[str, Any]] = None, custom_task_class_name: str = None, - custom_task_class_module: str = None + custom_task_class_module: str = None, ): super().__init__( model_file_format=None, @@ -76,7 +76,13 @@ def load_model(self, rank: int = None, cache_model: bool = True) -> "torch.nn.Mo if self.model: model = self.model else: - model = load_model_from_task(self.task, self.model_path, self.custom_task_class_name, self.custom_task_class_module, **self.get_load_kwargs()) + model = load_model_from_task( + self.task, + self.model_path, + self.custom_task_class_name, + self.custom_task_class_module, + **self.get_load_kwargs(), + ) # we only have peft adapters for now if self.adapter_path: From a0bd7031557ae1e7cdad4cbbfb8e3db0bb036e74 Mon Sep 17 00:00:00 2001 From: Alahari Prudhvi Akhil Date: Tue, 9 Sep 2025 04:39:27 -0700 Subject: [PATCH 22/24] Add system requirements --- examples/gemma3/qnn/gemma-3-4b.ipynb | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/gemma3/qnn/gemma-3-4b.ipynb b/examples/gemma3/qnn/gemma-3-4b.ipynb index 7b36c9cf5..42890860e 100755 --- a/examples/gemma3/qnn/gemma-3-4b.ipynb +++ b/examples/gemma3/qnn/gemma-3-4b.ipynb @@ -14,6 +14,19 @@ "- Convert Embedding layer with image to QNN format\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Platform requirements\n", + "This notebook is intended to run on a machine with:\n", + " * **Operating System**: Linux Ubuntu 22.04 (automated setup script is Linux-only)\n", + " * **Python**: 3.10\n", + " * NVIDIA driver 
version equivalent to 525.60.13\n", + " * NVIDIA A100 GPU\n", + " * **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically)" + ] + }, { "cell_type": "markdown", "metadata": {}, From f685073f15bd0b19fdc269803eabe8be2c58b554 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Thu, 18 Sep 2025 10:39:36 -0700 Subject: [PATCH 23/24] Remove examples --- examples/gemma3/qnn/README.md | 122 ---- .../gemma3/qnn/custom_gemma3_4b_datasets.py | 526 ------------------ .../gemma3/qnn/custom_gemma3_4b_embedding.py | 37 -- .../gemma3/qnn/custom_gemma3_4b_vision.py | 36 -- examples/gemma3/qnn/env_setup.sh | 28 - examples/gemma3/qnn/gemma-3-4b.ipynb | 379 ------------- .../qnn/gemma3-4b-embedding-qnn-config.json | 51 -- .../gemma3/qnn/gemma3-4b-text-qnn-config.json | 87 --- .../qnn/gemma3-4b-vision-qnn-config.json | 59 -- examples/gemma3/qnn/genai/app.py | 163 ------ examples/gemma3/qnn/genai/genai_config.json | 417 -------------- .../gemma3/qnn/genai/processor_config.json | 24 - examples/gemma3/qnn/qnn_req.txt | 7 - examples/gemma3/requirements.txt | 10 - 14 files changed, 1946 deletions(-) delete mode 100644 examples/gemma3/qnn/README.md delete mode 100755 examples/gemma3/qnn/custom_gemma3_4b_datasets.py delete mode 100644 examples/gemma3/qnn/custom_gemma3_4b_embedding.py delete mode 100644 examples/gemma3/qnn/custom_gemma3_4b_vision.py delete mode 100644 examples/gemma3/qnn/env_setup.sh delete mode 100755 examples/gemma3/qnn/gemma-3-4b.ipynb delete mode 100755 examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json delete mode 100755 examples/gemma3/qnn/gemma3-4b-text-qnn-config.json delete mode 100755 examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json delete mode 100644 examples/gemma3/qnn/genai/app.py delete mode 100755 examples/gemma3/qnn/genai/genai_config.json delete mode 100755 examples/gemma3/qnn/genai/processor_config.json delete mode 100755 examples/gemma3/qnn/qnn_req.txt delete mode 100644 examples/gemma3/requirements.txt diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md deleted file mode 100644 index 93c347fbe..000000000 --- a/examples/gemma3/qnn/README.md +++ /dev/null @@ -1,122 +0,0 @@ -# Gemma-3-4B Model Optimization - -This repository demonstrates the optimization of the [Google Gemma-3-4B](https://huggingface.co/google/gemma-3-4b-it) model using **post-training quantization (PTQ)** techniques for QNN (Qualcomm Neural Network) execution. 
The optimization process utilizes an environment based heavily upon the [PTQ tutorial for Phi-3.5](https://github.com/CodeLinaro/Olive/blob/main/examples/phi3_5/README.md) - -## File Overview - -This example contains the following key files: - -- **`env_setup.sh`** - Automated environment setup script (Linux only) -- **`gemma3-4b-text-qnn-config.json`** - Olive configuration for optimizing the text component -- **`gemma3-4b-vision-qnn-config.json`** - Olive configuration for optimizing the vision component -- **`user_script.py`** - Dataset handling and preprocessing utilities -- **`custom_gemma3_4b_it_vision.py`** - Vision model loader for the optimization pipeline - -## Prerequisites - -### System Requirements -- **Operating System**: Linux (automated setup script is Linux-only) -- **Python**: 3.10 -- **Package Manager**: [uv](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) -- **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically) - -### Dependencies Installed by Setup Script -The `env_setup.sh` script installs the following components: -- setuptools (for building Olive from source) -- Olive requirements and dependencies -- AutoGPTQ (from source) -- GPTQModel (specific commit: `558449bed3ef2653c36041650d30da6bbbca440d`) -- onnxruntime-qnn (pre-release version) - -## Setup Instructions - -### Automated Setup (Recommended) -```bash -source env_setup.sh -``` - -### Manual Setup (Alternative) -If you prefer to set up manually or need to troubleshoot: - -1. Install setuptools: - ```bash - uv pip install setuptools - ``` - -2. Install requirements: - ```bash - uv pip install -r ../requirements.txt - uv pip install -r ../../../requirements.txt - ``` - -3. Install AutoGPTQ from source: - ```bash - export BUILD_CUDA_EXT=0 - uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git - ``` - -4. Install GPTQModel with Gemma3 fix: - ```bash - uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d - ``` - -5. Install onnxruntime-qnn: - ```bash - uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt - uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps - ``` - -> **Important:** The setup uses a specific commit hash for GPTQModel (`558449bed3ef2653c36041650d30da6bbbca440d`) to address a [memory leak issue](https://github.com/ModelCloud/GPTQModel/commit/558449bed3ef2653c36041650d30da6bbbca440d) with Gemma3 models. - -## Optimization Process - -Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work together at the onnxruntime-genai stage. 
- -### Configuration Differences - -**Text Configuration (`gemma3-4b-text-qnn-config.json`)**: -- Uses HuggingFace model directly (`google/gemma-3-4b-it`) -- Applies comprehensive optimization pipeline: QuaRot → GptqModel → ModelBuilder → Quantization -- Outputs to: `models/gemma-3-4b-it-text/` - -**Vision Configuration (`gemma3-4b-vision-qnn-config.json`)**: -- Uses custom PyTorch model loader (`custom_gemma3_4b_it_vision.py`) -- Simpler pipeline: ONNX Conversion → Graph Surgery → Quantization -- Outputs to: `models/gemma-3-4b-it-vision/` - -### Running Optimization - -Execute the following commands to separately produce optimized binaries for each component: - -```bash -olive run --config gemma3-4b-text-qnn-config.json -``` - -```bash -olive run --config gemma3-4b-vision-qnn-config.json -``` - -## Expected Outputs - -After successful optimization, you will find: - -- **Text model outputs**: `models/gemma-3-4b-it-text/` -- **Vision model outputs**: `models/gemma-3-4b-it-vision/` -- **Cache directory**: `cache/` (intermediate files and downloaded datasets) -- **Dataset**: `.cache/train2017/` (COCO train2017 images, ~13GB) - -Both configurations use `"no_artifacts": true`, meaning only the final optimized models are retained. - -## Troubleshooting - -### Common Issues - -**Insufficient Storage**: The COCO train2017 dataset requires ~13GB of storage and is downloaded automatically to `.cache/train2017/`. - -**Memory Requirements**: The optimization process, particularly for the text model with its comprehensive pipeline, requires substantial memory. - -**QNN Provider**: Ensure the QNNExecutionProvider is properly installed and configured in your environment. - -**Platform Limitation**: The current setup script is designed for Linux only. Windows/macOS users will need to adapt the manual setup steps. - -**Dataset Download**: If the COCO dataset download fails, check your internet connection and available storage. The script uses `wget` which must be available on your system. diff --git a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py b/examples/gemma3/qnn/custom_gemma3_4b_datasets.py deleted file mode 100755 index 77751530b..000000000 --- a/examples/gemma3/qnn/custom_gemma3_4b_datasets.py +++ /dev/null @@ -1,526 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -import copy -import logging -import os -import subprocess -import zipfile -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Optional - -import torch -from datasets import load_dataset -from huggingface_hub import hf_hub_download -from PIL import Image as PILImage -from transformers import ( - AutoModel, - AutoProcessor, - AutoTokenizer, -) - -from olive.data.registry import Registry - -logger = logging.getLogger(__name__) - - -class BaseGemmaDataset(ABC): - """Abstract base class for Gemma dataset implementations.""" - - CACHE_DIR = os.getenv("CACHE_DIR", ".cache") - - def __init__(self, model_id: str, first_n: Optional[int] = None): - self.model_id = model_id - self.first_n = first_n - self.processor = AutoProcessor.from_pretrained(self.model_id) - - # Initialize attributes that will be set during dataset loading - self.image_data_path = None - self.raw_datasets = None - - # Initialize processor components based on subclass requirements - self._initialize_processor_components() - - self.setup_dataset() - - @abstractmethod - def _initialize_processor_components(self): - """Initialize processor components specific to the dataset type.""" - - @abstractmethod - def _process_dataset_entry(self, entry: dict[str, any]): - """Process a single dataset entry according to the dataset type.""" - - def _convert_single_llava_to_gemma_conversation( - self, conversation: list[dict[str, str]], strip_images: bool = False - ) -> dict[str, str | list[dict]]: - """Convert a single llava-style conversation entry to Gemma-style. - - Args: - conversation: The conversation entry to convert - strip_images: If True, remove tokens and create text-only content. - If False, preserve tokens and create multimodal content. 
- - Examples: - >>> conversation = {"from": "human", "value": "What are the colors of the bus in the image?"} - >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=False) - { - 'role': 'user', - 'content': [{'type': 'image'}, {'type': 'text', 'text': 'What are the colors of the bus in the image?'}] - } - >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=True) - { - 'role': 'user', - 'content': [{'type': 'text', 'text': 'What are the colors of the bus in the image?'}] - } - - """ - who = conversation.get("from") - match who: - case "human": - role = "user" - case "gpt": - role = "assistant" - case _: - raise ValueError(f"Unknown role: {who}") - - text = conversation.get("value") - - if strip_images: - # Text-only: remove image references completely - text = text.replace("", "").strip() - return { - "role": role, - "content": [{"type": "text", "text": text}], - } - else: - # Multimodal: preserve image references - if "" in text: - has_image = True - text = text.replace("", "") - else: - has_image = False - - return { - "role": role, - "content": ( - [{"type": "image"}, {"type": "text", "text": text}] - if has_image - else [{"type": "text", "text": text}] - ), - } - - def _convert_llava_to_gemma_conversation(self, entry: dict[str, any], strip_images: bool = False): - """Convert LlaVA-style conversations to Gemma-style.""" - entry["text"] = [ - self._convert_single_llava_to_gemma_conversation(conversation, strip_images=strip_images) - for conversation in entry["conversations"] - ] - del entry["conversations"] - return entry - - def _download_and_extract_images(self): - """Download the COCO train2017 image dataset and extract to the cache directory.""" - zip_filename = "train2017.zip" - zip_path = os.path.join(self.CACHE_DIR, zip_filename) - extract_path = os.path.join(self.CACHE_DIR, "train2017") - - # Create cache directory if it doesn't exist - os.makedirs(self.CACHE_DIR, exist_ok=True) - - # Check if images are already downloaded and extracted - extract_path_obj = Path(extract_path) - if extract_path_obj.exists() and any(extract_path_obj.iterdir()): - logger.info("Images already exist at %s", extract_path) - return extract_path - - # Download the dataset if zip doesn't exist - if not os.path.exists(zip_path): - logger.info("Downloading COCO train2017 dataset to %s", zip_path) - try: - subprocess.run( - [ - "wget", - "https://images.cocodataset.org/zips/train2017.zip", - "--no-check-certificate", - "-O", - zip_path, - ], - check=True, - ) - logger.info("Download completed successfully") - except subprocess.CalledProcessError: - logger.exception("Failed to download dataset") - raise - except FileNotFoundError: - logger.exception("wget command not found. 
Please install wget or use an alternative download method.") - raise - - # Extract the zip file - logger.info("Extracting %s to %s", zip_path, self.CACHE_DIR) - try: - with zipfile.ZipFile(zip_path, "r") as zip_ref: - zip_ref.extractall(self.CACHE_DIR) - logger.info("Extraction completed successfully") - except zipfile.BadZipFile: - logger.exception("Failed to extract zip file") - # Remove corrupted zip file so it can be re-downloaded - if os.path.exists(zip_path): - os.remove(zip_path) - raise - - return extract_path - - def _load_base_dataset(self): - """Load the base LlaVA dataset.""" - # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K - file_path = hf_hub_download( - repo_id="liuhaotian/LLaVA-Instruct-150K", - filename="llava_instruct_80k.json", - repo_type="dataset", - cache_dir=self.CACHE_DIR, - ) - - self.image_data_path = self._download_and_extract_images() - self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") - - # Limit data processing to the first_n rows - self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) - - def _extract_image_details(self, entry: dict[str, any]): - """Extract image details from the dataset example. - - Opens the image file and adds image mode information to the example. - """ - image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) - entry["image_mode"] = image.mode - return entry - - def setup_dataset(self): - """Set up the dataset with common preprocessing steps.""" - self._load_base_dataset() - - # Extract image details - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - - # Apply dataset-specific processing - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - - def get_dataset(self): - """Return the processed dataset.""" - return self.raw_datasets - - -class GemmaMultimodalDataset(BaseGemmaDataset): - """Dataset for full E2E Gemma 3 multi-modal model including both image and text.""" - - def _initialize_processor_components(self): - """Initialize tokenizer for multimodal processing.""" - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True - ) - - def setup_dataset(self): - """Set up the multimodal dataset with text conversation conversion.""" - self._load_base_dataset() - - # Convert the Llava-style conversation to Gemma-style conversation (preserve images) - self.raw_datasets = self.raw_datasets.map( - lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) - ) - - # Extract image details - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - - # Apply multimodal processing - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - - def _process_dataset_entry(self, entry: dict[str, any]): - """Load image and tokenize the conversation for model input. 
- - Args: - entry: Dataset entry containing text conversation and image path - - Returns: - Tokenized inputs ready for model processing - - """ - return self.processor.apply_chat_template( - entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True - ) - - -class GemmaTextOnlyDataset(BaseGemmaDataset): - """Dataset for only the text portion of the Gemma 3 model.""" - - def _initialize_processor_components(self): - """Initialize tokenizer for text-only processing.""" - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True - ) - - def setup_dataset(self): - """Set up the text-only dataset with conversation conversion.""" - self._load_base_dataset() - - # Convert the Llava-style conversation to Gemma-style conversation (strip images) - self.raw_datasets = self.raw_datasets.map( - lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=True) - ) - - # Extract image details (still needed for filtering) - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - - # Apply text-only processing - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - - def _process_dataset_entry(self, entry: dict[str, any]): - """Extract and tokenize only the text content. - - Args: - entry: Dataset entry containing text conversation - - Returns: - Tokenized text inputs ready for model processing - - """ - # Apply chat template without images, text-only - inputs = self.tokenizer.apply_chat_template( - entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True - ) - return {k: v.squeeze(0) for k, v in inputs.items()} # Remove batch dimension - - -class GemmaImageDataset(BaseGemmaDataset): - """Dataset for only the image processing of the Gemma 3 model.""" - - def _initialize_processor_components(self): - """No additional components needed for image-only processing.""" - - def _process_dataset_entry(self, entry: dict[str, any]): - """Load image and extract only pixel_values for image-only processing.""" - # Load and process the image - image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) - - # Process image to get pixel_values - inputs = self.processor(text="", images=image, return_tensors="pt") - - # Return only pixel_values - return {"pixel_values": inputs["pixel_values"]} - - -class GemmaEmbeddingInputDataset(BaseGemmaDataset): - """Dataset that is the input to the embedding layer.""" - - def __init__(self, model_id, first_n=None): - # Initialize lazy-loaded model components - self._vision_tower = None - self._multi_modal_projector = None - - super().__init__(model_id, first_n) - - def _initialize_processor_components(self): - """Initialize only standard processor components.""" - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True - ) - - def _get_vision_components(self): - """Lazy-load vision model components when first needed.""" - if self._vision_tower is None: - logger.info("Loading vision model components for cached embedding dataset") - full_model = AutoModel.from_pretrained(self.model_id) - - # Extract vision components (equivalent to Gemma3VisualEmbeddingGenerator) - self._vision_tower = full_model.vision_tower - self._multi_modal_projector = 
full_model.multi_modal_projector - - # Clean up full model to save memory - del full_model.language_model - - return self._vision_tower.cuda(), self._multi_modal_projector.cuda() - - def setup_dataset(self): - """Set up the multimodal dataset with text conversation conversion.""" - self._load_base_dataset() - - # Convert the Llava-style conversation to Gemma-style conversation (preserve images) - self.raw_datasets = self.raw_datasets.map( - lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) - ) - - # Extract image details - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - - # Apply multimodal processing - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - - def _process_dataset_entry(self, entry: dict[str, any]): - """Process entry to return input_ids and cached image features.""" - # Convert conversation and tokenize - inputs = self.processor.apply_chat_template( - entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True - ) - - # Load and process image - image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) - pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) - - # Get vision components and extract features - vision_tower, projector = self._get_vision_components() - pixel_values = pixel_values.to(device="cuda") - - with torch.no_grad(): - # Process through vision tower - image_outputs = vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_outputs.last_hidden_state - # Project to final embedding space - image_features = projector(selected_image_feature) - # Convert to numpy for caching - image_features = image_features.cpu().detach().numpy() - - return {"input_ids": inputs["input_ids"], "image_features": image_features} - - -class GemmaEmbeddingDataset(BaseGemmaDataset): - """Dataset that pre-merges text and image embeddings.""" - - def __init__(self, model_id, first_n=None): - # Initialize lazy-loaded model components - self._vision_tower = None - self._multi_modal_projector = None - self._embedding_layer = None - - super().__init__(model_id, first_n) - - def _initialize_processor_components(self): - """Initialize only standard processor components.""" - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True - ) - - def _get_model_components(self): - """Lazy-load all required model components when first needed.""" - if self._embedding_layer is None: - logger.info("Loading model components for merged embedding dataset") - full_model = AutoModel.from_pretrained(self.model_id) - - # Extract components - self._vision_tower = full_model.vision_tower.cuda() - self._multi_modal_projector = full_model.multi_modal_projector.cuda() - self._embedding_layer = copy.deepcopy(full_model.language_model.embed_tokens).cuda() - - # Clean up full model - del full_model.language_model - - return self._vision_tower, self._multi_modal_projector, self._embedding_layer - - def _merge_embeddings(self, input_ids: torch.Tensor, pixel_values: torch.Tensor): - """Merge text and image embeddings at special token positions.""" - vision_tower, projector, embedding_layer = self._get_model_components() - - # Get text embeddings - inputs_embeds = embedding_layer(input_ids.to(device="cuda")) - - # Process 
image - pixel_values = pixel_values.to(dtype=inputs_embeds.dtype, device="cuda") - with torch.no_grad(): - image_outputs = vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_outputs.last_hidden_state - image_features = projector(selected_image_feature) - - # Merge at special token positions (image_token_index = 262144) - image_token_index = 262144 - special_image_mask = (input_ids == image_token_index).unsqueeze(-1) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) - - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - return inputs_embeds.masked_scatter(special_image_mask, image_features) - - def setup_dataset(self): - """Set up the multimodal dataset with text conversation conversion.""" - self._load_base_dataset() - - # Convert the Llava-style conversation to Gemma-style conversation (preserve images) - self.raw_datasets = self.raw_datasets.map( - lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) - ) - - # Extract image details - self.raw_datasets = self.raw_datasets.map(self._extract_image_details) - - # Filter out any images that are not RGB - self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") - - # Apply multimodal processing - self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) - - def _process_dataset_entry(self, entry: dict[str, any]): - """Process entry to return merged embeddings.""" - # Convert conversation and tokenize - inputs = self.processor.apply_chat_template( - entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True - ) - - # Load and process image - image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) - pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) - - # Merge embeddings - inputs_embeds = self._merge_embeddings(inputs["input_ids"], pixel_values) - - return { - "input_ids": inputs["input_ids"], - "inputs_embeds": inputs_embeds, - "attention_mask": inputs["attention_mask"].squeeze(0), - } - - -# Remove this when submitting for review -TEXT_SHORTCUT_FIRST_N = 600 -SHORTCUT_FIRST_N = 200 - - -@Registry.register_dataset() -def gemma_dataset(model_id: str): - """Full E2E Gemma 3 multi-modal dataset (image + text).""" - return GemmaMultimodalDataset(model_id, first_n=TEXT_SHORTCUT_FIRST_N).get_dataset() - - -@Registry.register_dataset() -def gemma_text_dataset(model_id: str): - """Text-only Gemma 3 dataset.""" - return GemmaTextOnlyDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() - - -@Registry.register_dataset() -def gemma_image_dataset(model_id: str): - """Image-only Gemma 3 dataset.""" - return GemmaImageDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() - - -@Registry.register_dataset() -def gemma_embedding_input_dataset(model_id: str): - """Gemma 3 dataset with embedding layer input.""" - return GemmaEmbeddingInputDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() - - -@Registry.register_dataset() -def gemma_embedding_dataset(model_id: str): - """Gemma 3 dataset with pre-merged text and image embeddings.""" - return GemmaEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() diff --git a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py b/examples/gemma3/qnn/custom_gemma3_4b_embedding.py deleted file mode 100644 index 97c9cf2ea..000000000 --- a/examples/gemma3/qnn/custom_gemma3_4b_embedding.py +++ /dev/null @@ -1,37 +0,0 @@ -# 
------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - - -import logging - -import torch -from transformers import AutoModel - -logger = logging.getLogger(__name__) - - -class EmbeddingLayer(torch.nn.Module): - def __init__(self, full_model): - super().__init__() - self.embedding_layer = full_model.language_model.embed_tokens - - def forward(self, input_ids, image_features): - image_token_index = 262144 - inputs_embeds = self.embedding_layer(input_ids) - - special_image_mask = (input_ids == image_token_index).unsqueeze(-1) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - return inputs_embeds.masked_scatter(special_image_mask, image_features) - - -def load_gemma3_embedding_model(model_path): - full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") - logger.info("Loaded full model: %s", full_model) - - embedding_layer = EmbeddingLayer(full_model) - - logger.info("Created embedding-only model: %s", embedding_layer) - return embedding_layer diff --git a/examples/gemma3/qnn/custom_gemma3_4b_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_vision.py deleted file mode 100644 index 1eb7f8f33..000000000 --- a/examples/gemma3/qnn/custom_gemma3_4b_vision.py +++ /dev/null @@ -1,36 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - - -import logging - -import torch -from transformers import AutoModel - -logger = logging.getLogger(__name__) - - -class Gemma3VisualEmbeddingGenerator(torch.nn.Module): - def __init__(self, full_model): - super().__init__() - # Extract only the vision components - self.vision_tower = full_model.vision_tower - self.multi_modal_projector = full_model.multi_modal_projector - - def forward(self, pixel_values): - # Process images through vision tower - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_outputs.last_hidden_state - # Project to final embedding space - return self.multi_modal_projector(selected_image_feature) - - -def load_gemma3_vision_model(model_path): - full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") - logger.info("Loaded full model: %s", full_model) - - vision_model = Gemma3VisualEmbeddingGenerator(full_model) - logger.info("Created vision-only model: %s", vision_model) - return vision_model diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh deleted file mode 100644 index aa117afc0..000000000 --- a/examples/gemma3/qnn/env_setup.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- - -# Installing setuptools to build Olive from source -uv pip install setuptools - -# Requires installation of uv -uv pip install -r ../requirements.txt - -# Require installation of Olive dependencies -uv pip install -r ../../../requirements.txt - -# Disable CUDA extension build -export BUILD_CUDA_EXT=0 - -# Install AutoGPTQ from source -uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git - -# Install GptqModel from source -# Note: Commit hash corresponds to commit which fixes Gemma 3 memory leak issue. See README.md for additional details. -uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d - -# Install onnxruntime-qnn without installing onnxruntime -uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt -uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps diff --git a/examples/gemma3/qnn/gemma-3-4b.ipynb b/examples/gemma3/qnn/gemma-3-4b.ipynb deleted file mode 100755 index 42890860e..000000000 --- a/examples/gemma3/qnn/gemma-3-4b.ipynb +++ /dev/null @@ -1,379 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Gemma 3 4B QNN model conversion with Olive \n", - "### Task: Text + Vision Generation 📝\n", - "\n", - "In this notebook, you'll:\n", - "- Download the required datasets\n", - "- Convert LLM to QNN format\n", - "- Convert Vision to QNN format\n", - "- Convert Embedding layer with image to QNN format\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Platform requirements\n", - "This notebook is intended to run on a machine with:\n", - " * **Operating System**: Linux Ubuntu 22.04 (automated setup script is Linux-only)\n", - " * **Python**: 3.10\n", - " * NVIDIA driver version equivalent to 525.60.13\n", - " * NVIDIA A100 GPU\n", - " * **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🐍 Python Virtual environments\n", - "Creates Olive and QNN python virtual environments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import venv\n", - "from pathlib import Path\n", - "import subprocess\n", - "import json\n", - "import shutil\n", - "import urllib.request\n", - "import onnx\n", - "from onnx import helper, TensorProto\n", - "import glob\n", - "\n", - "current_dir = os.getcwd()\n", - "MODEL=\"google/gemma-3-4b-it\"\n", - "OLIVE_PYTHON_PATH = './olive_venv'\n", - "OLIVE_PYTHON_BIN = './olive_venv/bin/python'\n", - "olive_pip_path = Path(OLIVE_PYTHON_PATH) / \"bin\" / \"pip\"\n", - "OLIVE_REPO_PATH = Path(\"../../../\")\n", - "OLIVE_REQ = \"../requirements.txt\"\n", - "QNN_REQ = \"./qnn_req.txt\"\n", - "\n", - "QNN_PYTHON_PATH = './qnn_venv'\n", - "QNN_PYTHON_BIN_PATH = './qnn_venv/bin'\n", - "qnn_pip_path = Path(QNN_PYTHON_PATH) / \"bin\" / \"pip\"\n", - "QNN_PYTHON_BIN_FULL_PATH = f\"{current_dir}/{QNN_PYTHON_BIN_PATH}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare Olive Python Environment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "if not 
os.path.exists(OLIVE_PYTHON_PATH):\n", - " print(\"Creating Olive Venv\")\n", - " builder = venv.EnvBuilder(with_pip=True)\n", - " builder.create(Path(OLIVE_PYTHON_PATH))\n", - "my_env = os.environ.copy()\n", - "my_env[\"BUILD_CUDA_EXT\"] = \"0\"\n", - "GPTQ=\"git+https://github.com/ModelCloud/GPTQModel.git\"\n", - "subprocess.check_call([str(olive_pip_path), \"install\", \"-U\", \"-r\" , OLIVE_REQ], env=my_env)\n", - "subprocess.check_call([str(olive_pip_path), \"install\", \"--no-build-isolation\", GPTQ], env=my_env)\n", - "subprocess.check_call([str(olive_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare QNN Python Environment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "if not os.path.exists(QNN_PYTHON_PATH):\n", - " print(\"Creating QNN Venv\")\n", - " builder = venv.EnvBuilder(with_pip=True)\n", - " builder.create(Path(QNN_PYTHON_PATH))\n", - "subprocess.check_call([str(qnn_pip_path), \"install\", \"--no-build-isolation\", \"-r\" , QNN_REQ], env=my_env)\n", - "subprocess.check_call([str(qnn_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])\n", - "subprocess.check_call([str(qnn_pip_path), \"install\", \"-U\", \"--pre\", \"--extra-index-url\",\n", - " \"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple\",\n", - " \"onnxruntime-qnn==1.23.0.dev20250815002\", \"--no-deps\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 🤗 Login to Hugging Face\n", - "To access models, you'll need to log in to Hugging Face with a [user access token](https://huggingface.co/docs/hub/security-tokens). The following command will walk you through the steps to log in:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!huggingface-cli login --token <>" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Apply a few patches to ONNX Runtime\n", - "\n", - "These patches are needed to run the Olive recipes for this model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "base_url = \"https://raw.githubusercontent.com/CodeLinaro/onnxruntime/326d9d30129bbad698e0306d24dcea0ec5a19e60\"\n", - "urls = [\n", - " base_url + \"/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py\",\n", - " base_url + \"/onnxruntime/python/tools/quantization/quant_utils.py\"\n", - "]\n", - "\n", - "destinations = [\n", - " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/execution_providers/qnn/quant_config.py\",\n", - " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/quant_utils.py\"\n", - "]\n", - "\n", - "for url, dest in zip(urls, destinations):\n", - " urllib.request.urlretrieve(url, dest)\n", - " print(f\"Downloaded and replaced: {dest}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run Olive Recipes" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**GPU utilization observed during the run**\n", - "\n", - "\t\ta. Text GPTQModel quantization: 12gb\n", - "\t\tb. Text Onnx static quantization: 41gb\n", - "\t\tc. Vision Onnx static quantization: 68gb\n", - " d. 
Embedding Onnx static quantization: 3gb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Clean Context binary directories if they exist\n", - "def clean_directory(path):\n", - " if os.path.exists(path):\n", - " for file in glob.glob(os.path.join(path, '*')):\n", - " if os.path.isfile(file):\n", - " os.remove(file)\n", - "dirs_to_clean = [\n", - " './models/gemma3_qnn/model/',\n", - " './models/gemma-3-4b-it-vision/model/',\n", - " './models/gemma-3-4b-it-embed/model/'\n", - "]\n", - "\n", - "for dir_path in dirs_to_clean:\n", - " clean_directory(dir_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1️⃣ LLM model generation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config_path = Path(f\"./gemma3-4b-text-qnn-config.json\")\n", - "with open(config_path, \"r\") as file:\n", - " data = json.load(file)\n", - "\n", - "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", - "data[\"input_model\"][\"model_path\"] = MODEL\n", - "\n", - "with open(config_path, \"w\") as file:\n", - " json.dump(data, file, indent=4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!./olive_venv/bin/olive run --config ./gemma3-4b-text-qnn-config.json" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2️⃣ Vision model Quantization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config_path = Path(f\"./gemma3-4b-vision-qnn-config.json\")\n", - "with open(config_path, \"r\") as file:\n", - " data = json.load(file)\n", - "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", - "\n", - "with open(config_path, \"w\") as file:\n", - " json.dump(data, file, indent=4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!./olive_venv/bin/olive run --config ./gemma3-4b-vision-qnn-config.json" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3️⃣ Embedding Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!./olive_venv/bin/olive run --config ./gemma3-4b-embedding-qnn-config.json" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Keep output of the embedding model as uint16 instead of float" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = onnx.load(\"./models/gemma-3-4b-it-embed/model/model.onnx\")\n", - "graph = model.graph\n", - "\n", - "last_node = graph.node[-1]\n", - "graph.node.remove(last_node)\n", - "previous_node_output = graph.node[-1].output[0]\n", - "new_output = helper.make_tensor_value_info(\n", - " name=previous_node_output,\n", - " elem_type=TensorProto.UINT16,\n", - " shape=[\"batch_size\", \"seq_length\", 2560]\n", - ")\n", - "graph.output.remove(graph.output[0])\n", - "graph.output.extend([new_output])\n", - "onnx.save(model, \"./models/gemma-3-4b-it-embed/model/embeddings_with_image.onnx\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare final ORT GenAI folder for on-device inference " - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!cp ./models/gemma-3-4b-it-embed/model/embeddings_with_image.onnx ./models/gemma3_qnn/model/\n", - "!cp ./models/gemma-3-4b-it-vision/model/model_ctx.onnx ./models/gemma3_qnn/model/model_ctx_vision.onnx \n", - "!cp ./models/gemma-3-4b-it-vision/model/model_ctx_qnn.bin ./models/gemma3_qnn/model/model_ctx_qnn.bin \n", - "!cp ./genai/*.* ./models/gemma3_qnn/model/\n", - "!ls -al ./models/gemma3_qnn/model/\n", - "\n", - "print(\"ORT GenAI inference setup: ./models/gemma3_qnn\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json deleted file mode 100755 index 360f0e2bb..000000000 --- a/examples/gemma3/qnn/gemma3-4b-embedding-qnn-config.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "input_model": { - "type": "PyTorchModel", - "model_script": "custom_gemma3_4b_embedding.py", - "model_loader": "load_gemma3_embedding_model", - "io_config": { - "input_names": [ "input_ids", "image_features" ], - "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], - "input_types": [ "int64", "float32" ], - "output_names": [ "/model/embed_tokens/Mul/output_0" ], - "output_shapes": [ [ 1, 64, 2560 ] ], - "dynamic_axes": { - "input_ids": { "0": "batch_size", "1": "seq_length" }, - "image_features": { "0": "batch_size", "1": "image_tokens_length" } - } - } - }, - "systems": { - "local_system": { - "type": "LocalSystem", - "accelerators": [ { "device": "cpu", "execution_providers": [ "CPUExecutionProvider" ] } ] - } - }, - "data_configs": [ - { - "name": "gemma_embedding_data_config", - "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_embedding_input_dataset", "model_id": "google/gemma-3-4b-it" } - } - ], - "passes": { - "conversion": { "type": "OnnxConversion", "target_opset": 20 }, - "quantization": { - "type": "OnnxStaticQuantization", - "quant_preprocess": false, - "data_config": "gemma_embedding_data_config", - "activation_type": "uint16", - "precision": "uint8", - "calibrate_method": "MinMax", - "calibration_providers": [ "CUDAExecutionProvider" ], - "per_channel": true, - "weight_symmetric": true - }, - "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-embedding" } - }, - "target": "local_system", - "log_severity_level": 1, - "output_dir": "models/gemma-3-4b-it-embed", - "cache_dir": "cache-embd", - "no_artifacts": true -} diff --git a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json deleted file mode 100755 index 12fc5c8dc..000000000 --- a/examples/gemma3/qnn/gemma3-4b-text-qnn-config.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "input_model": { - "type": "HfModel", - "model_path": "google/gemma-3-4b-it", - "custom_task_class_name": "Gemma3ForCausalLM", - "custom_task_class_module": "transformers" - }, - "systems": { - "qnn_system": { - "type": "PythonEnvironment", - "python_environment_path": "", - "accelerators": [ 
{ "execution_providers": [ "QNNExecutionProvider" ] } ] - } - }, - "data_configs": [ - { - "name": "gemma_data_config", - "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_dataset", "model_id": "google/gemma-3-4b-it" } - } - ], - "passes": { - "g": { - "type": "GptqModel", - "bits": 4, - "sym": true, - "group_size": -1, - "lm_head": false, - "device": "cuda", - "data_config": "gemma_data_config" - }, - "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true }, - "mb": { - "type": "ModelBuilder", - "precision": "int4", - "int4_block_size": 32, - "int4_accuracy_level": 4, - "int4_op_types_to_quantize": [ "MatMul", "Gather" ] - }, - "mq": { - "type": "MatMulNBitsToQDQ", - "use_int4": true, - "add_zero_point": true, - "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ], - "save_as_external_data": true - }, - "gs": { - "type": "GraphSurgeries", - "surgeries": [ - { "surgeon": "RemoveRopeMultiCache" }, - { "surgeon": "AttentionMaskToSequenceLengths" }, - { "surgeon": "SimplifiedLayerNormToL2Norm" } - ], - "save_as_external_data": true - }, - "sq": { - "type": "OnnxStaticQuantization", - "data_config": "gemma_data_config", - "activation_type": "uint16", - "precision": "uint8", - "calibration_providers": [ "CUDAExecutionProvider" ], - "quant_preprocess": true, - "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ], - "save_as_external_data": true - }, - "sp": { "type": "SplitModel" }, - "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 }, - "cb": { - "type": "EPContextBinaryGenerator", - "provider_options": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "vtcm_mb": "8", - "htp_arch": "v73", - "soc_model": "60" - }, - "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 }, - "weight_sharing": true - }, - "cp": { "type": "ComposeOnnxModels" } - }, - "target": "qnn_system", - "log_severity_level": 0, - "output_dir": "models/gemma3_qnn", - "cache_dir": "cache", - "no_artifacts": true -} diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json deleted file mode 100755 index b15d6185f..000000000 --- a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "input_model": { - "type": "PyTorchModel", - "model_script": "custom_gemma3_4b_vision.py", - "model_loader": "load_gemma3_vision_model", - "io_config": { - "input_names": [ "pixel_values" ], - "input_shapes": [ [ 1, 3, 896, 896 ] ], - "input_types": [ "float32" ], - "output_names": [ "image_features" ], - "output_shapes": [ [ 1, 256, 2560 ] ] - } - }, - "systems": { - "qnn_system": { - "type": "PythonEnvironment", - "python_environment_path": "", - "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ] - } - }, - "data_configs": [ - { - "name": "gemma_vision_data_config", - "user_script": "custom_gemma3_4b_datasets.py", - "load_dataset_config": { "type": "gemma_image_dataset", "model_id": "google/gemma-3-4b-it" } - } - ], - "passes": { - "conversion": { "type": "OnnxConversion", "target_opset": 20 }, - "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] }, - "quantization": { - "type": "OnnxStaticQuantization", - "quant_preprocess": true, - "data_config": "gemma_vision_data_config", - "activation_type": "uint16", - "precision": "uint8", - "calibrate_method": "MinMax", - "calibration_providers": [ "CUDAExecutionProvider" ], 
- "per_channel": true, - "weight_symmetric": true - }, - "cb": { - "type": "EPContextBinaryGenerator", - "provider_options": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "vtcm_mb": "8", - "htp_arch": "v73", - "soc_model": "60" - } - }, - "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" } - }, - "target": "qnn_system", - "log_severity_level": 1, - "output_dir": "models/gemma-3-4b-it-vision", - "cache_dir": "cache-vision", - "no_artifacts": true -} diff --git a/examples/gemma3/qnn/genai/app.py b/examples/gemma3/qnn/genai/app.py deleted file mode 100644 index 0b5da39c3..000000000 --- a/examples/gemma3/qnn/genai/app.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License - -import argparse -import glob -import json -import logging -import os -import time -from pathlib import Path - -import onnxruntime_genai as og - -logger = logging.getLogger(__name__) - - -def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name): - curr_path = Path(current_dir).absolute() - target_dir = glob.glob(target_dir_name, root_dir=curr_path) - if target_dir: - return Path(curr_path / target_dir[0]).absolute() - else: - if curr_path.parent == curr_path: - # Root dir - return None - return _find_dir_contains_sub_dir(curr_path / "..", target_dir_name) - - -def _complete(text, state): - return (glob.glob(text + "*") + [None])[state] - - -def run(args: argparse.Namespace): - logger.info("Loading model...") - config = og.Config(args.model_path) - if args.execution_provider != "follow_config": - config.clear_providers() - if args.execution_provider != "cpu": - logger.info(f"Setting model to {args.execution_provider}...") - config.append_provider(args.execution_provider) - model = og.Model(config) - logger.info("Model loaded") - - tokenizer = og.Tokenizer(model) - processor = model.create_multimodal_processor() - stream = processor.create_stream() - - interactive = not args.non_interactive - - while True: - if interactive: - try: - import readline - - readline.set_completer_delims(" \t\n;") - readline.parse_and_bind("tab: complete") - readline.set_completer(_complete) - except ImportError: - # Not available on some platforms. Ignore it. - pass - image_paths = [ - image_path.strip() - for image_path in input("Image Path (comma separated; leave empty if no image): ").split(",") - ] - else: - if args.image_paths: - image_paths = args.image_paths - else: - image_paths = [str(Path(__file__).parent / "images" / "dog.jpg")] - - image_paths = [image_path for image_path in image_paths if image_path] - - images = None - if len(image_paths) == 0: - logger.info("No image provided") - else: - for i, image_path in enumerate(image_paths): - if not os.path.exists(image_path): - raise FileNotFoundError(f"Image file not found: {image_path}") - logger.info(f"Using image: {image_path}") - - images = og.Images.open(*image_paths) - - if interactive: - text = input("Prompt: ") - else: - if args.prompt: - text = args.prompt - else: - text = "What is shown in this image?" 
- - # Construct the "messages" argument passed to apply_chat_template - messages = [] - if model.type == "phi3v": - # Combine all image tags and text into one user message - content = "".join([f"<|image_{i + 1}|>\n" for i in range(len(image_paths))]) + text - messages.append({"role": "user", "content": content}) - else: - # Gemma3-style multimodal: structured content - content_list = [{"type": "image"} for _ in image_paths] - content_list.append({"type": "text", "text": text}) - messages.append({"role": "user", "content": content_list}) - - # Apply the chat template using the tokenizer - message_json = json.dumps(messages) - prompt = tokenizer.apply_chat_template(message_json, add_generation_prompt=True) - - logger.info("Processing images and prompt...") - inputs = processor(prompt, images=images) - - logger.info("Generating response...") - params = og.GeneratorParams(model) - params.set_search_options(max_length=1024) - - generator = og.Generator(model, params) - generator.set_inputs(inputs) - start_time = time.time() - - while not generator.is_done(): - generator.generate_next_token() - - new_token = generator.get_next_tokens()[0] - print(stream.decode(new_token), end="", flush=True) - - total_run_time = time.time() - start_time - logger.info(f"Total time: {total_run_time:.2f} seconds") - - # Delete the generator to free the captured graph before creating another one - del generator - - if not interactive: - break - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-m", "--model_path", type=str, default="", required=True, help="Path to the folder containing the model" - ) - parser.add_argument( - "-e", - "--execution_provider", - type=str, - required=False, - default="follow_config", - choices=["cpu", "cuda", "dml", "follow_config"], - help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.", - ) - parser.add_argument( - "--image_paths", nargs="*", type=str, required=False, help="Path to the images, mainly for CI usage" - ) - parser.add_argument( - "-pr", "--prompt", required=False, help="Input prompts to generate tokens from, mainly for CI usage" - ) - parser.add_argument( - "--non-interactive", - action=argparse.BooleanOptionalAction, - default=True, - required=False, - help="Non-interactive mode, mainly for CI usage", - ) - args = parser.parse_args() - run(args) diff --git a/examples/gemma3/qnn/genai/genai_config.json b/examples/gemma3/qnn/genai/genai_config.json deleted file mode 100755 index 754b33cd0..000000000 --- a/examples/gemma3/qnn/genai/genai_config.json +++ /dev/null @@ -1,417 +0,0 @@ -{ - "model": { - "bos_token_id": 2, - "context_length": 131072, - "decoder": { - "session_options": { "log_id": "onnxruntime-genai", "provider_options": [ ] }, - "head_size": 256, - "hidden_size": 2560, - "inputs": { - "input_ids": "input_ids", - "inputs_embeds": "inputs_embeds", - "attention_mask": "attention_mask", - "past_key_names": "past_key_values.%d.key", - "past_value_names": "past_key_values.%d.value", - "past_sequence_length": "past_seq_len", - "total_sequence_length": "total_seq_len" - }, - "outputs": { - "logits": "logits", - "present_key_names": "present.%d.key", - "present_value_names": "present.%d.value" - }, - "num_attention_heads": 8, - "num_hidden_layers": 34, - "num_key_value_heads": 4, - "sliding_window": { - "window_size": 64, - "slide_key_value_cache": false, - "slide_inputs": true, - "pad_value": 0, - "alignment": "left" - }, - "pipeline": [ - { - "embeddings": { - "filename": "embeddings.onnx", - "inputs": [ "input_ids" ], - "outputs": [ "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" ], - "run_on_prompt": false - }, - "context_ctx": { - "filename": "context_ctx.onnx", - "inputs": [ - "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output", - "past_key_values.0.key", - "past_key_values.0.value", - "past_seq_len", - "total_seq_len", - "past_key_values.1.key", - "past_key_values.1.value", - "past_key_values.2.key", - "past_key_values.2.value", - "past_key_values.3.key", - "past_key_values.3.value", - "past_key_values.4.key", - "past_key_values.4.value", - "past_key_values.5.key", - "past_key_values.5.value", - "past_key_values.6.key", - "past_key_values.6.value", - "past_key_values.7.key", - "past_key_values.7.value", - "past_key_values.8.key", - "past_key_values.8.value", - "past_key_values.9.key", - "past_key_values.9.value", - "past_key_values.10.key", - "past_key_values.10.value", - "past_key_values.11.key", - "past_key_values.11.value", - "past_key_values.12.key", - "past_key_values.12.value", - "past_key_values.13.key", - "past_key_values.13.value", - "past_key_values.14.key", - "past_key_values.14.value", - "past_key_values.15.key", - "past_key_values.15.value", - "past_key_values.16.key", - "past_key_values.16.value", - "past_key_values.17.key", - "past_key_values.17.value", - "past_key_values.18.key", - "past_key_values.18.value", - "past_key_values.19.key", - "past_key_values.19.value", - "past_key_values.20.key", - "past_key_values.20.value", - "past_key_values.21.key", - "past_key_values.21.value", - "past_key_values.22.key", - "past_key_values.22.value", - "past_key_values.23.key", - "past_key_values.23.value", - "past_key_values.24.key", - "past_key_values.24.value", - "past_key_values.25.key", - "past_key_values.25.value", - 
"past_key_values.26.key", - "past_key_values.26.value", - "past_key_values.27.key", - "past_key_values.27.value", - "past_key_values.28.key", - "past_key_values.28.value", - "past_key_values.29.key", - "past_key_values.29.value", - "past_key_values.30.key", - "past_key_values.30.value", - "past_key_values.31.key", - "past_key_values.31.value", - "past_key_values.32.key", - "past_key_values.32.value", - "past_key_values.33.key", - "past_key_values.33.value" - ], - "outputs": [ - "present.0.key", - "present.0.value", - "present.1.key", - "present.1.value", - "present.2.key", - "present.2.value", - "present.3.key", - "present.3.value", - "present.4.key", - "present.4.value", - "present.5.key", - "present.5.value", - "present.6.key", - "present.6.value", - "present.7.key", - "present.7.value", - "present.8.key", - "present.8.value", - "present.9.key", - "present.9.value", - "present.10.key", - "present.10.value", - "present.11.key", - "present.11.value", - "present.12.key", - "present.12.value", - "present.13.key", - "present.13.value", - "present.14.key", - "present.14.value", - "present.15.key", - "present.15.value", - "present.16.key", - "present.16.value", - "present.17.key", - "present.17.value", - "present.18.key", - "present.18.value", - "present.19.key", - "present.19.value", - "present.20.key", - "present.20.value", - "present.21.key", - "present.21.value", - "present.22.key", - "present.22.value", - "present.23.key", - "present.23.value", - "present.24.key", - "present.24.value", - "present.25.key", - "present.25.value", - "present.26.key", - "present.26.value", - "present.27.key", - "present.27.value", - "present.28.key", - "present.28.value", - "present.29.key", - "present.29.value", - "present.30.key", - "present.30.value", - "present.31.key", - "present.31.value", - "present.32.key", - "present.32.value", - "present.33.key", - "present.33.value", - "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" - ], - "session_options": { - "intra_op_num_threads": 2, - "inter_op_num_threads": 1, - "provider_options": [ - { - "qnn": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "soc_model": "60" - } - } - ] - }, - "run_on_token_gen": false - }, - "iterator_ctx": { - "filename": "iterator_ctx.onnx", - "inputs": [ - "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output", - "past_key_values.0.key", - "past_key_values.0.value", - "past_seq_len", - "total_seq_len", - "past_key_values.1.key", - "past_key_values.1.value", - "past_key_values.2.key", - "past_key_values.2.value", - "past_key_values.3.key", - "past_key_values.3.value", - "past_key_values.4.key", - "past_key_values.4.value", - "past_key_values.5.key", - "past_key_values.5.value", - "past_key_values.6.key", - "past_key_values.6.value", - "past_key_values.7.key", - "past_key_values.7.value", - "past_key_values.8.key", - "past_key_values.8.value", - "past_key_values.9.key", - "past_key_values.9.value", - "past_key_values.10.key", - "past_key_values.10.value", - "past_key_values.11.key", - "past_key_values.11.value", - "past_key_values.12.key", - "past_key_values.12.value", - "past_key_values.13.key", - "past_key_values.13.value", - "past_key_values.14.key", - "past_key_values.14.value", - "past_key_values.15.key", - "past_key_values.15.value", - "past_key_values.16.key", - "past_key_values.16.value", - "past_key_values.17.key", - "past_key_values.17.value", - "past_key_values.18.key", - "past_key_values.18.value", - "past_key_values.19.key", - 
"past_key_values.19.value", - "past_key_values.20.key", - "past_key_values.20.value", - "past_key_values.21.key", - "past_key_values.21.value", - "past_key_values.22.key", - "past_key_values.22.value", - "past_key_values.23.key", - "past_key_values.23.value", - "past_key_values.24.key", - "past_key_values.24.value", - "past_key_values.25.key", - "past_key_values.25.value", - "past_key_values.26.key", - "past_key_values.26.value", - "past_key_values.27.key", - "past_key_values.27.value", - "past_key_values.28.key", - "past_key_values.28.value", - "past_key_values.29.key", - "past_key_values.29.value", - "past_key_values.30.key", - "past_key_values.30.value", - "past_key_values.31.key", - "past_key_values.31.value", - "past_key_values.32.key", - "past_key_values.32.value", - "past_key_values.33.key", - "past_key_values.33.value" - ], - "outputs": [ - "present.0.key", - "present.0.value", - "present.1.key", - "present.1.value", - "present.2.key", - "present.2.value", - "present.3.key", - "present.3.value", - "present.4.key", - "present.4.value", - "present.5.key", - "present.5.value", - "present.6.key", - "present.6.value", - "present.7.key", - "present.7.value", - "present.8.key", - "present.8.value", - "present.9.key", - "present.9.value", - "present.10.key", - "present.10.value", - "present.11.key", - "present.11.value", - "present.12.key", - "present.12.value", - "present.13.key", - "present.13.value", - "present.14.key", - "present.14.value", - "present.15.key", - "present.15.value", - "present.16.key", - "present.16.value", - "present.17.key", - "present.17.value", - "present.18.key", - "present.18.value", - "present.19.key", - "present.19.value", - "present.20.key", - "present.20.value", - "present.21.key", - "present.21.value", - "present.22.key", - "present.22.value", - "present.23.key", - "present.23.value", - "present.24.key", - "present.24.value", - "present.25.key", - "present.25.value", - "present.26.key", - "present.26.value", - "present.27.key", - "present.27.value", - "present.28.key", - "present.28.value", - "present.29.key", - "present.29.value", - "present.30.key", - "present.30.value", - "present.31.key", - "present.31.value", - "present.32.key", - "present.32.value", - "present.33.key", - "present.33.value", - "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" - ], - "session_options": { - "intra_op_num_threads": 2, - "inter_op_num_threads": 1, - "provider_options": [ - { - "qnn": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "soc_model": "60" - } - } - ] - }, - "run_on_prompt": false - }, - "lm_head": { - "filename": "lm_head.onnx", - "inputs": [ - "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0_QuantizeLinear_Output" - ], - "outputs": [ "logits" ] - } - } - ] - }, - "embedding": { - "filename": "embeddings_with_image.onnx", - "inputs": { "input_ids": "input_ids", "image_features": "image_features" }, - "outputs": { "inputs_embeds": "/model/embed_tokens/Mul/output_0_QuantizeLinear_Output" } - }, - "vision": { - "filename": "model_ctx_vision.onnx", - "inputs": { "pixel_values": "pixel_values" }, - "outputs": { "image_features": "image_features" }, - "session_options": { - "intra_op_num_threads": 2, - "inter_op_num_threads": 1, - "provider_options": [ - { - "qnn": { - "htp_performance_mode": "burst", - "htp_graph_finalization_optimization_mode": "3", - "soc_model": "60" - } - } - ] - } - }, - "eos_token_id": [ 1, 106 ], - "pad_token_id": 0, - "type": "gemma3", - 
"vocab_size": 262208 - }, - "search": { - "diversity_penalty": 0.0, - "do_sample": true, - "early_stopping": true, - "length_penalty": 1.0, - "max_length": 131072, - "min_length": 0, - "no_repeat_ngram_size": 0, - "num_beams": 1, - "num_return_sequences": 1, - "past_present_share_buffer": true, - "repetition_penalty": 1.0, - "temperature": 1.0, - "top_k": 64, - "top_p": 0.95 - } -} diff --git a/examples/gemma3/qnn/genai/processor_config.json b/examples/gemma3/qnn/genai/processor_config.json deleted file mode 100755 index b25059aa2..000000000 --- a/examples/gemma3/qnn/genai/processor_config.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "processor": { - "name": "gemma_3_image_processing", - "transforms": [ - { "operation": { "name": "decode_image", "type": "DecodeImage", "attrs": { "color_space": "RGB" } } }, - { - "operation": { - "name": "resize", - "type": "Resize", - "attrs": { "interpolation": "CUBIC", "width": 896, "height": 896, "keep_aspect_ratio": 0 } - } - }, - { "operation": { "name": "re-scale", "type": "Rescale" } }, - { - "operation": { - "name": "normalize", - "type": "Normalize", - "attrs": { "mean": [ 0.5, 0.5, 0.5 ], "std": [ 0.5, 0.5, 0.5 ] } - } - }, - { "operation": { "name": "to_channel_first", "type": "Permute3D", "attrs": { "dims": [ 2, 0, 1 ] } } } - ] - } -} diff --git a/examples/gemma3/qnn/qnn_req.txt b/examples/gemma3/qnn/qnn_req.txt deleted file mode 100755 index 05c845791..000000000 --- a/examples/gemma3/qnn/qnn_req.txt +++ /dev/null @@ -1,7 +0,0 @@ -coloredlogs -flatbuffers -numpy >= 1.21.6 -packaging -protobuf -sympy -transformers==4.55.2 diff --git a/examples/gemma3/requirements.txt b/examples/gemma3/requirements.txt deleted file mode 100644 index 337d1987d..000000000 --- a/examples/gemma3/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -datasets -onnx==1.18.0 -onnx-ir==0.1.4 -onnxruntime-genai-cuda==0.9.0 -onnxruntime-gpu==1.22.0 -onnxscript==0.3.2 -optimum -setuptools -tabulate -transformers From 5dff1552e310329862ae450c35886984ac1ab2f4 Mon Sep 17 00:00:00 2001 From: Kyle Romero Date: Thu, 18 Sep 2025 10:44:34 -0700 Subject: [PATCH 24/24] Fix review comments --- olive/common/hf/utils.py | 2 +- olive/model/handler/hf.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/olive/common/hf/utils.py b/olive/common/hf/utils.py index dee79e6e8..8359396b5 100644 --- a/olive/common/hf/utils.py +++ b/olive/common/hf/utils.py @@ -67,7 +67,7 @@ def load_model_from_task( class_tuple = (getattr(module, custom_task_class_name),) else: class_tuple = targeted_task["pt"] or (AutoModel,) - print("class_tuple", class_tuple) + model = None for i, model_class in enumerate(class_tuple): try: diff --git a/olive/model/handler/hf.py b/olive/model/handler/hf.py index 343c84f77..4e4bb917d 100644 --- a/olive/model/handler/hf.py +++ b/olive/model/handler/hf.py @@ -91,9 +91,6 @@ def load_model(self, rank: int = None, cache_model: bool = True) -> "torch.nn.Mo model = PeftModel.from_pretrained(model, self.adapter_path) self.model = model if cache_model else None - - logger.error(self.model) - return model @property