diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json
index 4b219bcd..c80ae155 100644
--- a/.aitk/configs/checks.json
+++ b/.aitk/configs/checks.json
@@ -1,16 +1,16 @@
 {
-    "configCheck": 139,
+    "configCheck": 140,
     "copyCheck": 179,
     "extensionCheck": 1,
-    "gitignoreCheck": 38,
+    "gitignoreCheck": 39,
     "inferenceModelCheck": 25,
-    "ipynbCheck": 38,
-    "licenseCheck": 37,
-    "modelProjectCheck": 39,
+    "ipynbCheck": 39,
+    "licenseCheck": 38,
+    "modelProjectCheck": 40,
     "oliveCheck": 45,
-    "oliveJsonCheck": 139,
-    "pathCheck": 1153,
+    "oliveJsonCheck": 140,
+    "pathCheck": 1170,
     "requirementsCheck": 37,
     "templateCheck": 1,
-    "venvRequirementsCheck": 13
+    "venvRequirementsCheck": 14
 }
diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json
index 0ef10cf3..a6d85da0 100644
--- a/.aitk/configs/model_list.json
+++ b/.aitk/configs/model_list.json
@@ -647,6 +647,19 @@
             "status": "Ready",
             "relativePath": "Qwen-Qwen2.5-Coder-14B-Instruct/aitk",
             "version": 3
+        },
+        {
+            "displayName": "stable-diffusion-v1-5/stable-diffusion-v1-5",
+            "icon": "HuggingFace",
+            "modelLink": "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5",
+            "id": "huggingface/stable-diffusion-v1-5/stable-diffusion-v1-5",
+            "runtimes": [
+                "QNN"
+            ],
+            "architecture": "Transformer",
+            "status": "Hide",
+            "relativePath": "sd-legacy-stable-diffusion-v1-5/aitk",
+            "version": 1
         }
     ],
     "template_models": [
@@ -692,6 +705,7 @@
         "google-research-datasets/conceptual_captions": "https://huggingface.co/datasets/google-research-datasets/conceptual_captions",
         "AIMClab-RUC/COCO-CN": "https://huggingface.co/datasets/AIMClab-RUC/COCO-CN",
         "librispeech_asr": "https://huggingface.co/datasets/openslr/librispeech_asr",
+        "phiyodr/coco2017": "https://huggingface.co/datasets/phiyodr/coco2017",
         "pileval_for_awq_benchmark": "https://huggingface.co/datasets/mit-han-lab/pile-val-backup"
     },
     "LoginRequiredDatasets": [
diff --git a/.aitk/requirements/requirements-WCR-SD.txt b/.aitk/requirements/requirements-WCR-SD.txt
new file mode 100644
index 00000000..74b5790f
--- /dev/null
+++ b/.aitk/requirements/requirements-WCR-SD.txt
@@ -0,0 +1,3 @@
+accelerate==1.12.0
+diffusers==0.35.0
+torch-fidelity==0.3.0
diff --git a/.aitk/scripts/project_processor.py b/.aitk/scripts/project_processor.py
index d36c3f82..fb04a69f 100644
--- a/.aitk/scripts/project_processor.py
+++ b/.aitk/scripts/project_processor.py
@@ -28,6 +28,7 @@
     "mistralai": IconEnum.mistralai,
     # TODO add
     "OFA-Sys": IconEnum.HuggingFace,
+    "stable-diffusion-v1-5": IconEnum.HuggingFace,
 }
 
 
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/.gitignore b/sd-legacy-stable-diffusion-v1-5/aitk/.gitignore
new file mode 100644
index 00000000..ccd91a69
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/.gitignore
@@ -0,0 +1,8 @@
+__pycache__
+/cache
+/history/*/*
+!/history/*/history.config
+!/history/*/olive_config.json
+/footprints
+/result_*.png
+/*data*/
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/README.md b/sd-legacy-stable-diffusion-v1-5/aitk/README.md
new file mode 100644
index 00000000..1a0a7045
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/README.md
@@ -0,0 +1,31 @@
+## Stable Diffusion Optimization with ONNX Runtime QNN EP
+
+### Generate data for static quantization
+
+To get better results, we need to generate real data from the original model instead of using random data for static quantization.
+
+First, generate the unoptimized ONNX model:
+
+`python stable_diffusion.py --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --script_dir .\ --provider cpu --format qdq --optimize --only_conversion`
+
+Then generate data:
+
+`python .\evaluation.py --save_data --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --num_inference_steps 25 --seed 0 --num_data 100 --guidance_scale 7.5`
+
+### Optimize
+
+Optimize the ONNX models for performance improvements. vae_decoder and unet are per-channel quantized and text_encoder runs in fp16 precision.
+
+`python stable_diffusion.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --provider qnn --format qdq --optimize`
+
+### Test and evaluate
+
+`python .\evaluation.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --num_inference_steps 25 --seed 0 --num_data 100 --guidance_scale 7.5 --provider QNNExecutionProvider --model_dir optimized-qnn_qdq`
+
+To generate one image:
+
+`python stable_diffusion.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --provider qnn --format qdq --guidance_scale 7.5 --seed 0 --num_inference_steps 25 --prompt "A baby is laying down with a teddy bear"`
+
+### References
+
+[stable-diffusion-v1-4](https://github.com/microsoft/olive-recipes/tree/main/compvis-stable-diffusion-v1-4/olive#readme)
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/assets/dog.png b/sd-legacy-stable-diffusion-v1-5/aitk/assets/dog.png
new file mode 100644
index 00000000..a8bef282
Binary files /dev/null and b/sd-legacy-stable-diffusion-v1-5/aitk/assets/dog.png differ
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/config_safety_checker.json b/sd-legacy-stable-diffusion-v1-5/aitk/config_safety_checker.json
new file mode 100644
index 00000000..8b2d063c
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/config_safety_checker.json
@@ -0,0 +1,98 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        "model_loader": "safety_checker_load",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "clip_input", "images" ],
+            "output_names": [ "out_images", "has_nsfw_concepts" ],
+            "dynamic_axes": {
+                "clip_input": { "0": "batch", "1": "channels", "2": "height", "3": "width" },
+                "images": { "0": "batch", "1": "height", "2": "width", "3": "channels" }
+            }
+        },
+        "dummy_inputs_func": "safety_checker_conversion_inputs"
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ]
+        }
+    },
+    "data_configs": [
+        {
+            "name": "latency_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "safety_checker_data_loader", "batch_size": 1 }
+        }
+    ],
+    "evaluators": {
+        "common_evaluator": {
+            "metrics": [
+                {
+                    "name": "latency",
+                    "type": "latency",
+                    "data_config": "latency_data_config",
+                    "sub_types": [ { "name": "avg" } ]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "convert": { "type": "OnnxConversion", "target_opset": 14 },
+        "ov_convert": {
+            "type": "OpenVINOConversion",
+            "user_script": "user_script.py",
+            "example_input_func": "safety_checker_conversion_inputs",
+            "output_model": "safety_checker"
+        },
+        "optimize": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "unet",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": true,
+                "use_multi_head_attention": true,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": true,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": true,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": true,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": true,
+                "enable_packed_kv": true,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            },
+            "force_fp32_ops": [ "RandomNormalLike" ],
+            "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] }
+        },
+        "optimize_cuda": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "unet",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false
+        }
+    },
+    "log_severity_level": 0,
+    "evaluator": "common_evaluator",
+    "evaluate_input_model": false,
+    "host": "local_system",
+    "target": "local_system",
+    "cache_dir": "cache",
+    "output_dir": "footprints/safety_checker"
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/config_text_encoder.json b/sd-legacy-stable-diffusion-v1-5/aitk/config_text_encoder.json
new file mode 100644
index 00000000..ce2559d2
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/config_text_encoder.json
@@ -0,0 +1,147 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        "model_loader": "text_encoder_load",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "input_ids" ],
+            "output_names": [ "last_hidden_state", "pooler_output" ],
+            "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } }
+        },
+        "dummy_inputs_func": "text_encoder_conversion_inputs"
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ]
+        }
+    },
+    "data_configs": [
+        {
+            "name": "latency_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "text_encoder_data_loader", "batch_size": 1 }
+        },
+        {
+            "name": "quantize_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "text_encoder_quantize_data_loader", "data_num": 100 }
+        }
+    ],
+    "evaluators": {
+        "common_evaluator": {
+            "metrics": [
+                {
+                    "name": "latency",
+                    "type": "latency",
+                    "data_config": "latency_data_config",
+                    "sub_types": [ { "name": "avg" } ]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "convert": { "type": "OnnxConversion", "target_opset": 17 },
+        "ov_convert": {
+            "type": "OpenVINOConversion",
+            "user_script": "user_script.py",
+            "example_input_func": "text_encoder_conversion_inputs",
+            "output_model": "text_encoder"
+        },
+        "optimize": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "clip",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": true,
+                "use_multi_head_attention": true,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": true,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": true,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": true,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": true,
+                "enable_packed_kv": true,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            },
+            "force_fp32_ops": [ "RandomNormalLike" ],
+            "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] }
+        },
+        "optimize_cuda": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "clip",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [ "batch", "sequence" ],
+            "dim_value": [ 1, 77 ]
+        },
+        "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "ReplaceAttentionMaskValue", "replacement": -200.0 } ] },
+        "optimize_qdq": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "clip",
+            "opt_level": 0,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": false,
+                "use_multi_head_attention": false,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": false,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": false,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": false,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": false,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": false,
+                "enable_packed_kv": false,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            }
+        },
+        "quantization": {
+            "type": "OnnxStaticQuantization",
+            "data_config": "quantize_data_config",
+            "activation_type": "uint16",
+            "precision": "uint8",
+            "calibrate_method": "MinMax",
+            "quant_preprocess": true
+        },
+        "cb": {
+            "type": "EPContextBinaryGenerator",
+            "provider_options": {
+                "htp_graph_finalization_optimization_mode": "3"
+            }
+        }
+    },
+    "log_severity_level": 0,
+    "evaluator": "common_evaluator",
+    "evaluate_input_model": false,
+    "host": "local_system",
+    "target": "local_system",
+    "cache_dir": "cache",
+    "output_dir": "footprints/text_encoder"
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/config_unet.json b/sd-legacy-stable-diffusion-v1-5/aitk/config_unet.json
new file mode 100644
index 00000000..1f631b3f
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/config_unet.json
@@ -0,0 +1,170 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        "model_loader": "unet_load",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "sample", "timestep", "encoder_hidden_states", "return_dict" ],
+            "output_names": [ "out_sample" ],
+            "dynamic_axes": {
+                "sample": {
+                    "0": "unet_sample_batch",
+                    "1": "unet_sample_channels",
+                    "2": "unet_sample_height",
+                    "3": "unet_sample_width"
+                },
+                "timestep": { "0": "unet_time_batch" },
+                "encoder_hidden_states": { "0": "unet_hidden_batch", "1": "unet_hidden_sequence" }
+            }
+        },
+        "dummy_inputs_func": "unet_conversion_inputs"
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ]
+        }
+    },
+    "data_configs": [
+        {
+            "name": "latency_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "unet_data_loader", "batch_size": 1 }
+        },
+        {
+            "name": "quantize_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "unet_quantize_data_loader", "data_num": 200 }
+        }
+    ],
+    "evaluators": {
+        "common_evaluator": {
+            "metrics": [
+                {
+                    "name": "latency",
+                    "type": "latency",
+                    "data_config": "latency_data_config",
+                    "sub_types": [ { "name": "avg" } ]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "convert": {
+            "type": "OnnxConversion",
+            "target_opset": 17,
+            "save_as_external_data": true,
+            "all_tensors_to_one_file": true,
+            "external_data_name": "weights.pb"
+        },
+        "ov_convert": {
+            "type": "OpenVINOConversion",
+            "user_script": "user_script.py",
+            "example_input_func": "get_unet_ov_example_input",
+            "output_model": "unet"
+        },
+        "optimize": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "unet",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": true,
+                "use_multi_head_attention": true,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": true,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": true,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": true,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": true,
+                "enable_packed_kv": true,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            },
+            "force_fp32_ops": [ "RandomNormalLike" ],
+            "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] }
+        },
+        "optimize_cuda": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "unet",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [
+                "unet_sample_batch",
+                "unet_sample_channels",
+                "unet_sample_height",
+                "unet_sample_width",
+                "unet_time_batch",
+                "unet_hidden_batch",
+                "unet_hidden_sequence"
+            ],
+            "dim_value": [ 1, 4, 64, 64, 1, 1, 77 ],
+            "save_as_external_data": true
+        },
+        "optimize_qdq": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "unet",
+            "opt_level": 0,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": false,
+                "use_multi_head_attention": false,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": false,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": false,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": false,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": false,
+                "enable_packed_kv": false,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            }
+        },
+        "quantization": {
+            "type": "OnnxStaticQuantization",
+            "data_config": "quantize_data_config",
+            "activation_type": "uint16",
+            "precision": "uint8",
+            "calibrate_method": "MinMax",
+            "quant_preprocess": true
+        },
+        "cb": {
+            "type": "EPContextBinaryGenerator",
+            "provider_options": {
+                "htp_graph_finalization_optimization_mode": "3"
+            }
+        }
+    },
+    "log_severity_level": 0,
+    "evaluator": "common_evaluator",
+    "evaluate_input_model": false,
+    "host": "local_system",
+    "target": "local_system",
+    "cache_dir": "cache",
+    "output_dir": "footprints/unet"
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_decoder.json b/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_decoder.json
new file mode 100644
index 00000000..ab59bdd1
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_decoder.json
@@ -0,0 +1,127 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        "model_loader": "vae_decoder_load",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "latent_sample", "return_dict" ],
+            "output_names": [ "sample" ],
+            "dynamic_axes": {
+                "latent_sample": {
+                    "0": "decoder_batch",
+                    "1": "decoder_channels",
+                    "2": "decoder_height",
+                    "3": "decoder_width"
+                }
+            }
+        },
+        "dummy_inputs_func": "vae_decoder_conversion_inputs"
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ]
+        }
+    },
+    "data_configs": [
+        {
+            "name": "latency_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "vae_decoder_data_loader", "batch_size": 1 }
+        },
+        {
+            "name": "quantize_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "vae_decoder_quantize_data_loader", "data_num": 100 }
+        }
+    ],
+    "evaluators": {
+        "common_evaluator": {
+            "metrics": [
+                {
+                    "name": "latency",
+                    "type": "latency",
+                    "data_config": "latency_data_config",
+                    "sub_types": [ { "name": "avg" } ]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "convert": { "type": "OnnxConversion", "target_opset": 17 },
+        "ov_convert": {
+            "type": "OpenVINOConversion",
+            "user_script": "user_script.py",
+            "example_input_func": "vae_decoder_conversion_inputs",
+            "output_model": "vae_decoder"
+        },
+        "optimize": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "vae",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": true,
+                "use_multi_head_attention": true,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": true,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": true,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": true,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": true,
+                "enable_packed_kv": true,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            },
+            "force_fp32_ops": [ "RandomNormalLike" ],
+            "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] }
+        },
+        "optimize_cuda": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "vae",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [ "decoder_batch", "decoder_channels", "decoder_height", "decoder_width" ],
+            "dim_value": [ 1, 4, 64, 64 ]
+        },
+        "quantization": {
+            "type": "OnnxStaticQuantization",
+            "data_config": "quantize_data_config",
+            "activation_type": "uint16",
+            "precision": "uint8",
+            "calibrate_method": "MinMax",
+            "quant_preprocess": true
+        },
+        "cb": {
+            "type": "EPContextBinaryGenerator",
+            "provider_options": {
+                "htp_graph_finalization_optimization_mode": "3"
+            }
+        }
+    },
+    "log_severity_level": 0,
+    "evaluator": "common_evaluator",
+    "evaluate_input_model": false,
+    "host": "local_system",
+    "target": "local_system",
+    "cache_dir": "cache",
+    "output_dir": "footprints/vae_decoder"
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_encoder.json b/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_encoder.json
new file mode 100644
index 00000000..a1b09a1c
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_encoder.json
@@ -0,0 +1,125 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        "model_loader": "vae_encoder_load",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "sample", "return_dict" ],
+            "output_names": [ "latent_sample" ],
+            "dynamic_axes": {
+                "sample": { "0": "encoder_batch", "1": "encoder_channels", "2": "encoder_height", "3": "encoder_width" }
+            }
+        },
+        "dummy_inputs_func": "vae_encoder_conversion_inputs"
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ]
+        }
+    },
+    "data_configs": [
+        {
+            "name": "latency_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "vae_encoder_data_loader", "batch_size": 1 }
+        },
+        {
+            "name": "quantize_data_config",
+            "user_script": "user_script.py",
+            "load_dataset_config": { "type": "local_dataset" },
+            "dataloader_config": { "type": "vae_encoder_quantize_data_loader", "data_num": 100 }
+        }
+    ],
+    "evaluators": {
+        "common_evaluator": {
+            "metrics": [
+                {
+                    "name": "latency",
+                    "type": "latency",
+                    "data_config": "latency_data_config",
+                    "sub_types": [ { "name": "avg" } ]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "convert": { "type": "OnnxConversion", "target_opset": 17 },
+        "ov_convert": {
+            "type": "OpenVINOConversion",
+            "user_script": "user_script.py",
+            "example_input_func": "vae_encoder_conversion_inputs",
+            "output_model": "vae_encoder"
+        },
+        "optimize": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "vae",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": true,
+                "use_multi_head_attention": true,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": true,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": true,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": true,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": true,
+                "enable_packed_kv": true,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            },
+            "force_fp32_ops": [ "RandomNormalLike" ],
+            "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] }
+        },
+        "optimize_cuda": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "vae",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [
+                "encoder_batch",
+                "encoder_channels",
+                "encoder_height",
+                "encoder_width",
+                "Addlatent_sample_dim_0",
+                "Addlatent_sample_dim_1",
+                "Addlatent_sample_dim_2",
+                "Addlatent_sample_dim_3"
+            ],
+            "dim_value": [ 1, 3, 512, 512, 1, 4, 64, 64 ]
+        },
+        "quantization": {
+            "type": "OnnxStaticQuantization",
+            "data_config": "quantize_data_config",
+            "activation_type": "uint16",
+            "precision": "uint8",
+            "calibrate_method": "MinMax",
+            "quant_preprocess": true
+        }
+    },
+    "log_severity_level": 0,
+    "evaluator": "common_evaluator",
+    "evaluate_input_model": false,
+    "host": "local_system",
+    "target": "local_system",
+    "cache_dir": "cache",
+    "output_dir": "footprints/vae_encoder"
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/evaluation.py b/sd-legacy-stable-diffusion-v1-5/aitk/evaluation.py
new file mode 100644
index 00000000..c224ddca
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/evaluation.py
@@ -0,0 +1,304 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import logging
+import math
+import os
+import re
+import sys
+from functools import partial
+from pathlib import Path
+from typing import Callable
+
+import numpy as np
+import requests
+import torch
+from datasets import load_dataset
+from PIL import Image
+from sd_utils.qdq import OnnxStableDiffusionPipelineWithSave
+from torchmetrics.functional.multimodal import clip_score
+from torchmetrics.image.fid import FrechetInceptionDistance
+from torchvision.transforms import functional as F
+
+# Module-level logger: records at INFO and above are accepted and written to
+# stdout through a dedicated stream handler attached at import time.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+
+# Clip score
+
+
+def get_clip_score(prompt: str, path: Path, clip_score_fn):
+    """Score the image stored for ``prompt`` against the prompt text.
+
+    Args:
+        prompt: Text prompt; also the stem of the image file name.
+        path: Directory holding ``<prompt>.png``.
+        clip_score_fn: Callable invoked as ``clip_score_fn(image, prompt)``.
+
+    Returns:
+        The detached result of ``clip_score_fn``.
+    """
+    image_file = path / f"{prompt}.png"
+    with torch.no_grad():
+        pixels = np.array(Image.open(image_file))
+        # Move the last (channel) axis to the front before scoring.
+        channels_first = torch.tensor(pixels, dtype=torch.uint8).permute(2, 0, 1)
+        return clip_score_fn(channels_first, prompt).detach()
+
+
+def get_clip_scores(prompts: list[str], path: Path, clip_score_fn):
+    """Write a per-prompt CLIP score table to ``<path>/clip_scores.txt``.
+
+    Each prompt is scored with ``get_clip_score``; the table ends with the
+    average of all scores, which is also logged.
+
+    Args:
+        prompts: Prompts whose images live in ``path``.
+        path: Directory containing the images and receiving the report.
+        clip_score_fn: Scoring callable forwarded to ``get_clip_score``.
+    """
+    collected = []
+    with open(path / "clip_scores.txt", "w") as report:
+        report.write("| Prompt | Score |\n")
+        for prompt in prompts:
+            score = get_clip_score(prompt, path, clip_score_fn)
+            collected.append(score)
+            report.write(f"| {prompt} | {score} |\n")
+
+        average = np.mean(np.array(collected))
+        logger.info("CLIP Scores avg: %s", average)
+        report.write(f"| Avg | {average} |\n")
+
+
+# MSE score
+
+
+def calc_error(image1, image2):
+    """Return the mean squared difference between the pixels of two image files.
+
+    Args:
+        image1: Path (or file object) of the first image.
+        image2: Path (or file object) of the second image.
+
+    Returns:
+        The mean of the squared element-wise difference, computed in float32.
+    """
+    first, second = Image.open(image1), Image.open(image2)
+    delta = np.array(first, dtype=np.float32) - np.array(second, dtype=np.float32)
+    return np.mean(delta**2)
+
+
+def get_mse_scores(prompts: list[str], unoptimized_path: Path, optimized_path: Path, train_num: int):
+    """Write the per-prompt MSE between unoptimized and optimized images.
+
+    The report goes to ``<optimized_path>/mse_scores.txt``. The first
+    ``train_num`` prompts are averaged as the "train" error and the rest as
+    the "test" error; both averages are logged and appended to the report.
+
+    Args:
+        prompts: Prompts whose ``<prompt>.png`` exists in both directories.
+        unoptimized_path: Directory with the reference images.
+        optimized_path: Directory with the images under test.
+        train_num: Number of leading prompts counted as train samples.
+    """
+    errors = {"train": [], "test": []}
+    with open(optimized_path / "mse_scores.txt", "w") as report:
+        report.write("| Prompt | Error |\n")
+        for index, prompt in enumerate(prompts):
+            image_name = f"{prompt}.png"
+            error = calc_error(unoptimized_path / image_name, optimized_path / image_name)
+            report.write(f"| {prompt} | {error} |\n")
+            split = "train" if index < train_num else "test"
+            errors[split].append(error)
+
+        train_error = np.mean(np.array(errors["train"]))
+        test_error = np.mean(np.array(errors["test"]))
+        logger.info("Average train error %f", train_error)
+        logger.info("Average test error %f", test_error)
+        report.write(f"| Avg Train | {train_error} |\n")
+        report.write(f"| Avg Test | {test_error} |\n")
+
+
+# FID score
+
+
+def get_fid_scores(prompts: list[str], path: Path, real_images):
+    """Compute the FID of the generated images against ``real_images``.
+
+    The images ``<path>/<prompt>.png`` are loaded into one uint8 batch, fed to
+    ``FrechetInceptionDistance`` as the fake set, and the resulting score is
+    logged and written to ``<path>/fid_scores.txt``.
+
+    Args:
+        prompts: Prompts naming the generated images.
+        path: Directory containing the images and receiving the report.
+        real_images: Batch passed to ``fid.update(..., real=True)``.
+    """
+    with torch.no_grad(), open(path / "fid_scores.txt", "w") as report:
+        report.write("| Prompt | Score |\n")
+        # One (1, C, H, W) tensor per prompt, concatenated along the batch axis.
+        generated = torch.cat(
+            [
+                torch.tensor(np.array(Image.open(path / f"{prompt}.png")), dtype=torch.uint8)
+                .permute(2, 0, 1)
+                .unsqueeze(0)
+                for prompt in prompts
+            ]
+        )
+
+        metric = FrechetInceptionDistance()
+        metric.update(real_images, real=True)
+        metric.update(generated, real=False)
+
+        score = metric.compute()
+        logger.info("FID: %f", score)
+        report.write(f"| FID | {score} |\n")
+
+
+# hpsv2 score
+
+
+def get_hpsv2_scores(path: Path, generate_image: Callable[[str, str], None], prompt_style: str):
+    """Generate the hpsv2 benchmark images under ``path`` and evaluate them.
+
+    Does nothing when ``prompt_style`` is ``None``; ``hpsv2`` is imported only
+    after that check.
+
+    Args:
+        path: Root directory; one sub-directory per style is created in it.
+        generate_image: Called as ``generate_image(prompt, output_file)``.
+        prompt_style: Benchmark style (all, anime, concept-art, paintings, photo).
+    """
+    if prompt_style is None:
+        return
+    import hpsv2
+
+    # Get benchmark prompts (<style> = all, anime, concept-art, paintings, photo)
+    benchmark = hpsv2.benchmark_prompts(prompt_style)
+    prompts_by_style = benchmark if prompt_style == "all" else {prompt_style: benchmark}
+    for style, prompts in prompts_by_style.items():
+        style_dir = path / style
+        os.makedirs(style_dir, exist_ok=True)
+        total = len(prompts)
+        for idx, prompt in enumerate(prompts):
+            logger.info("Generating %s for %s [%d/%d]", prompt, style, idx + 1, total)
+            generate_image(prompt, style_dir / f"{idx:05d}.jpg")
+    hpsv2.evaluate(path.as_posix(), hps_version="v2.1")
+
+
+# prepare data
+
+
+def sanitize_path(input_string):
+    """Trim ``input_string`` and remove every character that is not a word
+    character, hyphen, comma or space.
+    """
+    trimmed = input_string.strip()
+    return re.sub(r"[^\w\-, ]", "", trimmed)
+
+
+def download_file(url, save_path):
+    """Download ``url`` to ``save_path``; failures are logged, never raised.
+
+    The body is streamed into a sibling ``<name>.part`` file that is moved onto
+    ``save_path`` only after the whole response has been written. An
+    interrupted download therefore never leaves a truncated file at
+    ``save_path`` that a later existence check would mistake for a finished
+    one.
+
+    Args:
+        url: Address to fetch with a 10 second timeout.
+        save_path: Destination file (``str`` or ``Path``); missing parent
+            directories are created.
+    """
+    partial_path = None
+    try:
+        save_path = Path(save_path)
+        partial_path = save_path.with_name(save_path.name + ".part")
+
+        # The context manager releases the streamed connection even when
+        # writing the file fails part-way.
+        with requests.get(url, stream=True, timeout=10) as response:
+            response.raise_for_status()  # Raise an error for bad status codes
+
+            # Path.parent is "." for a bare file name, so this also works
+            # when save_path has no directory component.
+            save_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(partial_path, "wb") as file:
+                for chunk in response.iter_content(chunk_size=8192):
+                    file.write(chunk)
+
+        # Publish the finished download in one step.
+        os.replace(partial_path, save_path)
+        logger.info("File successfully downloaded and saved to %s", save_path)
+    except requests.exceptions.MissingSchema:
+        logger.error("Error: Invalid URL. Please provide a valid URL.")
+    except requests.exceptions.ConnectionError:
+        logger.error("Error: Network issue. Please check your internet connection.")
+    except requests.exceptions.HTTPError as http_err:
+        logger.error("HTTP error occurred: %s", http_err)
+    except Exception as e:
+        # Deliberately broad: the download is best-effort and must not raise.
+        logger.error("An error occurred: %s", e)
+    finally:
+        # After a successful os.replace the .part file no longer exists, so
+        # this only removes leftovers of a failed attempt.
+        if partial_path is not None:
+            try:
+                partial_path.unlink(missing_ok=True)
+            except OSError as cleanup_err:
+                logger.error("Could not remove partial download %s: %s", partial_path, cleanup_err)
+
+
+def get_real_images(train_data, num_data):
+    """Load the first ``num_data`` dataset images as one uint8 tensor batch.
+
+    Images are looked up in ``train_data/`` next to this script by their
+    ``file_name`` and fetched from ``coco_url`` via ``download_file`` when
+    absent. Each image is converted to RGB and center-cropped to 256x256.
+
+    Args:
+        train_data: Iterable of examples with ``file_name`` and ``coco_url``.
+        num_data: Maximum number of examples to use.
+
+    Returns:
+        The concatenation of the per-image ``(1, 3, 256, 256)`` tensors.
+    """
+    data_dir = Path(__file__).resolve().parent / "train_data"
+    batches = []
+    with torch.no_grad():
+        for index, example in enumerate(train_data):
+            if index >= num_data:
+                break
+            image_path = data_dir / example["file_name"]
+            if not image_path.exists():
+                download_file(example["coco_url"], image_path)
+            rgb = np.array(Image.open(image_path).convert("RGB"))
+            batch = torch.tensor(rgb, dtype=torch.uint8).permute(2, 0, 1).unsqueeze(0)
+            # TODO(anyone): use resize?
+            batches.append(F.center_crop(batch, (256, 256)))
+        return torch.cat(batches)
+
+
+def run_inference(pipeline, args, prompt: str, output_path: Path, pathIsFile: bool = False):
+    """Generate one image for ``prompt`` and save it, unless it already exists.
+
+    Args:
+        pipeline: Callable pipeline whose result exposes ``.images``.
+        args: Namespace providing ``seed``, ``num_inference_steps``,
+            ``image_size`` and ``guidance_scale``.
+        prompt: Text prompt to render.
+        output_path: Target file when ``pathIsFile`` is true, otherwise the
+            directory in which ``<prompt>.png`` is written.
+        pathIsFile: Whether ``output_path`` already names the output file.
+    """
+    target = output_path if pathIsFile else output_path / f"{prompt}.png"
+    if target.exists():
+        # Already generated on an earlier run; skip the inference entirely.
+        return
+
+    if args.seed is None:
+        generator = None
+    else:
+        generator = np.random.RandomState(seed=args.seed)
+
+    images = pipeline(
+        [prompt],
+        num_inference_steps=args.num_inference_steps,
+        height=args.image_size,
+        width=args.image_size,
+        guidance_scale=args.guidance_scale,
+        generator=generator,
+    ).images
+    images[0].save(target)
+
+
+def parse_args(raw_args):
+    """Parse the command-line options of the evaluation script.
+
+    Args:
+        raw_args: Argument strings to parse; ``None`` lets argparse fall back
+            to the process arguments.
+
+    Returns:
+        The populated ``argparse.Namespace``.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser("Common arguments")
+    add = parser.add_argument
+
+    add("--script_dir", required=True, type=str)
+    add("--save_data", action="store_true")
+    add("--model_dir", default="optimized", type=str, help="model_dir path")
+    add("--model_id", default="stable-diffusion-v1-5/stable-diffusion-v1-5", type=str)
+    add(
+        "--guidance_scale",
+        default=7.5,
+        type=float,
+        help="Guidance scale as defined in Classifier-Free Diffusion Guidance",
+    )
+    add("--num_inference_steps", default=50, type=int, help="Number of steps in diffusion process")
+    add(
+        "--seed",
+        default=None,
+        type=int,
+        help="The seed to give to the generator to generate deterministic results.",
+    )
+    add("--data_dir", default="quantize_data", type=str)
+    add("--provider", default="CPUExecutionProvider", type=str)
+    add("--sub_dir", default="optimized", type=str, help="Sub directory to save the data for optimized model test")
+    add("--dataset_name", default="phiyodr/coco2017", type=str, help="(Optional) dataset to download")
+    add("--dataset_split", default="train", type=str, help="(Optional) dataset split to download")
+    add("--num_data", default=10, type=int, help="Number of data samples to use for quantization")
+    add("--train_ratio", default=0.5, type=float)
+    add("--image_size", default=512, type=int, help="Width and height of the images to generate")
+    add("--hpsv2_style", default=None, type=str, help="Style for hpsv2 benchmark")
+    add("--xl", action="store_true")
+
+    return parser.parse_args(raw_args)
+
+
+def main(raw_args=None):
+    """Generate images for dataset captions and score them.
+
+    The first ``--num_data`` captions of the streaming dataset are used as
+    prompts. With ``--save_data`` the unoptimized model renders them into
+    ``<data_dir>/unoptimized`` (setting ``pipeline.save_data_dir`` for the
+    leading ``train_ratio`` share of prompts) and CLIP, FID and optional hpsv2
+    scores are produced. Without it the optimized model renders into
+    ``<data_dir>/<sub_dir>`` and an MSE comparison against the unoptimized
+    images is written as well.
+
+    Args:
+        raw_args: Argument strings forwarded to ``parse_args``; ``None`` uses
+            the process arguments.
+    """
+    # Function-scope import: only the prompt selection below needs it.
+    from itertools import islice
+
+    args = parse_args(raw_args)
+    dataset = load_dataset(args.dataset_name, streaming=True)
+    train_data = dataset[args.dataset_split]
+    # islice stops pulling examples after num_data items. Filtering an
+    # enumerate() with `if i < num_data` kept iterating the whole streaming
+    # split just to discard every later example. max(..., 0) keeps a negative
+    # num_data meaning "no prompts" instead of an islice ValueError.
+    prompts = [sanitize_path(example["captions"][0]) for example in islice(train_data, max(args.num_data, 0))]
+    real_images = get_real_images(train_data, args.num_data)
+
+    train_num = math.floor(len(prompts) * args.train_ratio)
+    clip_score_fn = partial(clip_score, model_name_or_path="openai/clip-vit-base-patch16")
+
+    script_dir = Path(args.script_dir)
+    unoptimized_path: Path = script_dir / args.data_dir / "unoptimized"
+    optimized_path: Path = script_dir / args.data_dir / args.sub_dir
+
+    unoptimized_model_dir = script_dir / "models" / "unoptimized" / args.model_id
+    optimized_dir_name = args.model_dir
+    optimized_model_dir = script_dir / "models" / optimized_dir_name / args.model_id
+
+    # --save_data runs the reference (unoptimized) model, otherwise the optimized one.
+    model_dir = unoptimized_model_dir if args.save_data else optimized_model_dir
+    if args.xl:
+        from sd_utils.qdq_xl import ORTStableDiffusionXLPipelineWithSave
+
+        pipeline = ORTStableDiffusionXLPipelineWithSave.from_pretrained(model_dir, provider=args.provider)
+    else:
+        pipeline = OnnxStableDiffusionPipelineWithSave.from_pretrained(model_dir, provider=args.provider)
+    pipeline.save_data_dir = None
+
+    if args.save_data:
+        os.makedirs(unoptimized_path, exist_ok=True)
+        for i, prompt in enumerate(prompts):
+            logger.info(prompt)
+            if i < train_num:
+                # Only the train share of the prompts gets a save_data_dir.
+                pipeline.save_data_dir = script_dir / args.data_dir / "data" / prompt
+                os.makedirs(pipeline.save_data_dir, exist_ok=True)
+            else:
+                pipeline.save_data_dir = None
+            run_inference(pipeline, args, prompt, unoptimized_path)
+        get_clip_scores(prompts, unoptimized_path, clip_score_fn)
+        get_fid_scores(prompts, unoptimized_path, real_images)
+        get_hpsv2_scores(
+            unoptimized_path / "hpsv2", partial(run_inference, pipeline, args, pathIsFile=True), args.hpsv2_style
+        )
+
+    else:
+        os.makedirs(optimized_path, exist_ok=True)
+        for prompt in prompts:
+            logger.info(prompt)
+            run_inference(pipeline, args, prompt, optimized_path)
+
+        get_clip_scores(prompts, optimized_path, clip_score_fn)
+        get_fid_scores(prompts, optimized_path, real_images)
+        get_mse_scores(prompts, unoptimized_path, optimized_path, train_num)
+        get_hpsv2_scores(
+            optimized_path / "hpsv2", partial(run_inference, pipeline, args, pathIsFile=True), args.hpsv2_style
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/inference_sample.ipynb b/sd-legacy-stable-diffusion-v1-5/aitk/inference_sample.ipynb
new file mode 100644
index 00000000..0f14e814
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/inference_sample.ipynb
@@ -0,0 +1,101 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "547a25de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_dir = \"./models/optimized/stable-diffusion-v1-5/stable-diffusion-v1-5\"\n",
+    "\n",
+    "ExecutionProvider=\"QNNExecutionProvider\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eed9c231",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reference: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing\n",
+    "import subprocess\n",
+    "import json\n",
+    "import sys\n",
+    "import os\n",
+    "import onnxruntime as ort\n",
+    "\n",
+    "def register_execution_providers():\n",
+    "    worker_script = os.path.abspath('winml.py')\n",
+    "    print(worker_script)\n",
+    "    result = subprocess.check_output([sys.executable, worker_script], text=True)\n",
+    "    paths = json.loads(result)\n",
+    "    for item in paths.items():\n",
+    "        try:\n",
+    "            ort.register_execution_provider_library(item[0], item[1])\n",
+    "        except Exception as e:\n",
+    "            print(f\"Failed to register execution provider {item[0]}: {e}\")\n",
+    "\n",
+    "register_execution_providers()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffcd22ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "NOTEBOOK_DIR = Path(__file__).parent if \"__file__\" in globals() else Path.cwd()\n",
+    "PROJECT_ROOT = NOTEBOOK_DIR.parents[1]\n",
+    "sys.path.insert(0, str(PROJECT_ROOT))\n",
+    "\n",
+    "import numpy as np\n",
+    "from sd_utils.qdq import OnnxStableDiffusionPipelineWithSave\n",
+    "\n",
+    "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n",
+    "    ep_devices = ort.get_ep_devices()\n",
+    "    for ep_device in ep_devices:\n",
+    "        if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n",
+    "            print(f\"Adding {ep_name} for {device_type}\")\n",
+    "            session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n",
+    "            break\n",
+    "\n",
+    "\n",
+    "sess_options = ort.SessionOptions()\n",
+    "provider_options = [{}]\n",
+    "\n",
+    "add_ep_for_device(sess_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n",
+    "\n",
+    "pipeline = OnnxStableDiffusionPipelineWithSave.from_pretrained(\n",
+    "    model_dir, provider=ExecutionProvider, sess_options=sess_options, provider_options=provider_options\n",
+    ")\n",
+    "pipeline.save_data_dir = None\n",
+    "\n",
+    "prompt = \"A beautiful landscape painting of mountains during sunrise\"\n",
+    "result = pipeline(\n",
+    "    [prompt],\n",
+    "    num_inference_steps=25,\n",
+    "    height=512,\n",
+    "    width=512,\n",
+    "    guidance_scale=7.5,\n",
+    "    generator=np.random.RandomState(seed=0)\n",
+    ")\n",
+    "\n",
+    "from IPython.display import display\n",
+    "display(result.images[0])\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/info.yml b/sd-legacy-stable-diffusion-v1-5/aitk/info.yml
new file mode 100644
index 00000000..2c096503
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/info.yml
@@ -0,0 +1,12 @@
+keywords:
+  aitk
+arch: sd
+recipes:
+  - file: "sd_qnn_workflow.json"
+    device: npu
+    ep: QNNExecutionProvider
+aitk:
+  modelInfo:
+    id: "huggingface/stable-diffusion-v1-5/stable-diffusion-v1-5"
+    version: 1
+    status: Hide
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/model_adaptations.py b/sd-legacy-stable-diffusion-v1-5/aitk/model_adaptations.py
new file mode 100644
index 00000000..945237ad
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/model_adaptations.py
@@ -0,0 +1,605 @@
+# ---------------------------------------------------------------------
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+# ---------------------------------------------------------------------
+import math
+import types
+from typing import Callable, Optional
+
+import diffusers.models.attention_processor as attention_processor
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers import UNet2DConditionModel
+from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
+from diffusers.models.attention import BasicTransformerBlock, FeedForward
+from diffusers.models.transformers.transformer_2d import Transformer2DModel
+
+
+class Conv2dLinear(torch.nn.Module):
+    """Drop-in 1x1 Conv2D equivalent of a Linear layer.
+
+    The wrapped convolution carries the Linear layer's weights (and bias, if
+    any), so it applies the same transformation over the channel dimension at
+    every spatial location of the input.
+
+    Args:
+        linear (nn.Linear): The linear layer whose parameters are copied.
+
+    """
+
+    def __init__(self, linear: torch.nn.Linear):
+        super().__init__()
+        self.in_features = linear.in_features
+        self.out_features = linear.out_features
+        has_bias = linear.bias is not None
+
+        # A 1x1 kernel makes the convolution a per-pixel linear map.
+        self.conv = torch.nn.Conv2d(
+            in_channels=self.in_features,
+            out_channels=self.out_features,
+            kernel_size=1,
+            bias=has_bias,
+        )
+
+        # (out, in) Linear weight -> (out, in, 1, 1) convolution kernel.
+        kernel = linear.weight.data.view(self.out_features, self.in_features, 1, 1)
+        self.conv.weight.data.copy_(kernel)
+        if has_bias:
+            self.conv.bias.data.copy_(linear.bias.data)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the 1x1 convolution.
+
+        Args:
+            x (torch.Tensor): The input tensor in NCHW format.
+
+        Returns:
+            torch.Tensor: The result of the convolution.
+
+        """
+        return self.conv(x)
+
+
+class SHAAttention(nn.Module):
+    """Split-Head Attention with per-head Conv2D projections and a single output
+    projection.  The shared query/key/value Linear projections of the original
+    Attention module are split into one 1x1 Conv2D per head; the heads are
+    processed separately, concatenated, and passed through the original
+    ``to_out`` layers.  Works on spatial inputs (H, W) instead of a sequence
+    length.
+    """
+
+    def __init__(self, orig_attn: attention_processor.Attention):
+        """Initialize SHAAttention by copying weights from an existing Attention module.
+
+        ``to_q``, ``to_k`` and ``to_v`` are deleted from ``orig_attn`` once
+        their weights have been copied.
+
+        Args:
+            orig_attn (attention_processor.Attention): The original Attention module to be replaced.
+
+        Raises:
+            NotImplementedError: If ``orig_attn`` uses any of the unsupported norm layers.
+            ValueError: If the projection sizes are inconsistent with the head counts.
+
+        """
+        super().__init__()
+
+        for f in ["group_norm", "spatial_norm", "norm_q", "norm_k", "norm_cross"]:
+            if getattr(orig_attn, f) is not None:
+                raise NotImplementedError(f"{f} is not supported")
+
+        # Copy configuration from the original Attention module
+        self.heads = orig_attn.heads
+        self.kv_heads = int(orig_attn.inner_kv_dim / orig_attn.inner_dim * self.heads)
+
+        # Infer dim_head from to_q.out_features and heads
+        if orig_attn.to_q.out_features % self.heads != 0:
+            raise ValueError("to_q.out_features is not divisible by heads. Cannot infer dim_head.")
+        self.dim_head = orig_attn.to_q.out_features // self.heads
+        self.scale = 1 / math.sqrt(self.dim_head)
+        self.rescale_output_factor_inv = 1 / orig_attn.rescale_output_factor
+
+        self.residual_connection = orig_attn.residual_connection
+
+        # Verify to_k and to_v dimensions
+        expected_kv_out = self.kv_heads * self.dim_head
+        if orig_attn.to_k.out_features != expected_kv_out:
+            raise ValueError(
+                f"to_k.out_features ({orig_attn.to_k.out_features}) does not match expected {expected_kv_out}."
+            )
+        if orig_attn.to_v.out_features != expected_kv_out:
+            raise ValueError(
+                f"to_v.out_features ({orig_attn.to_v.out_features}) does not match expected {expected_kv_out}."
+            )
+
+        # One Conv2D per head, initialized from the matching rows of the shared Linear layers
+        self.q_proj_sha = self._split_projection(orig_attn.to_q, self.heads, self.dim_head)
+        self.k_proj_sha = self._split_projection(orig_attn.to_k, self.kv_heads, self.dim_head)
+        self.v_proj_sha = self._split_projection(orig_attn.to_v, self.kv_heads, self.dim_head)
+
+        self.to_out = orig_attn.to_out
+
+        # The shared projections are no longer needed once their weights are copied
+        del orig_attn.to_q
+        del orig_attn.to_k
+        del orig_attn.to_v
+
+    @staticmethod
+    def _split_projection(linear: nn.Linear, num_heads: int, dim_head: int) -> nn.ModuleList:
+        """Split a shared Linear projection into one 1x1 Conv2D per head."""
+        has_bias = linear.bias is not None
+        convs = nn.ModuleList(
+            [nn.Conv2d(linear.in_features, dim_head, kernel_size=1, bias=has_bias) for _ in range(num_heads)]
+        )
+        for head, conv in enumerate(convs):
+            rows = slice(head * dim_head, (head + 1) * dim_head)
+            # (dim_head, in_features) -> (dim_head, in_features, 1, 1)
+            conv.weight.data.copy_(linear.weight.data[rows, :].unsqueeze(-1).unsqueeze(-1))
+            if has_bias:
+                conv.bias.data.copy_(linear.bias.data[rows])
+        return convs
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Forward pass for Split-Head Attention.
+        Processes each head separately using head-specific Conv2D projection layers.
+
+        Args:
+            hidden_states (torch.Tensor): The hidden states (batch_size, hidden_size, H, W).
+            attention_mask (Optional[torch.Tensor]): Mask added to the attention scores.
+            encoder_hidden_states (Optional[torch.Tensor]): The encoder hidden states
+                (N, seq_len, inner_dim) for cross-attention; ``hidden_states`` is used when omitted.
+            **kwargs: ``past_key_value`` and ``output_attentions`` are recognized but unsupported.
+
+        Returns:
+            torch.Tensor: The attention output (batch_size, out_features, H, W).
+
+        Raises:
+            NotImplementedError: If a kv cache or attention weights are requested.
+
+        """
+        # Reject unsupported options before doing any work
+        if kwargs.get("past_key_value") is not None:
+            raise NotImplementedError("SHAAttention does not support kv cache yet")
+        if kwargs.get("output_attentions", False):
+            raise NotImplementedError("output_attentions=True is not supported")
+
+        bsz, _, _, _ = hidden_states.size()
+        residual = hidden_states
+
+        if encoder_hidden_states is not None:
+            # (N, seq_len, inner_dim) to (N, inner_dim, 1, seq_len)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 2, 1).unsqueeze(2)
+        else:
+            encoder_hidden_states = hidden_states
+
+        query_states = [q_proj(hidden_states) for q_proj in self.q_proj_sha]
+        key_states = [k_proj(encoder_hidden_states) for k_proj in self.k_proj_sha]
+        value_states = [v_proj(encoder_hidden_states) for v_proj in self.v_proj_sha]
+        # query_states, key_states, value_states: List of (bsz, dim_head, H, W)
+
+        # zip pairs query head i with key/value head i.
+        # NOTE(review): this assumes kv_heads == heads - confirm for grouped-query models.
+        attn_outputs = []
+        for q, k, v in zip(query_states, key_states, value_states):
+            q_flat = q.permute(0, 2, 3, 1)  # (bsz, H, W, dim_head)
+            k_flat = k.view(bsz, 1, self.dim_head, -1)  # (bsz, 1, dim_head, H_enc*W_enc)
+            v_flat = v.view(bsz, 1, self.dim_head, -1)  # (bsz, 1, dim_head, H_enc*W_enc)
+
+            attn_scores = torch.matmul(q_flat, k_flat) * self.scale
+            # attn_scores: (bsz, H, W, H_enc*W_enc)
+
+            if attention_mask is not None:
+                attn_scores = attn_scores + attention_mask
+
+            attn_probs = torch.nn.functional.softmax(attn_scores, dim=-1)
+            # attn_probs: (bsz, H, W, H_enc*W_enc)
+
+            v_perm = v_flat.permute(0, 1, 3, 2)  # (bsz, 1, H_enc*W_enc, dim_head)
+            attn_outputs.append(torch.matmul(attn_probs, v_perm))  # (bsz, H, W, dim_head)
+
+        # Concatenate all heads' outputs along the channel dimension
+        attn_output = torch.cat(attn_outputs, dim=-1)  # (bsz, H, W, heads * dim_head)
+
+        attn_output = self.to_out[0](attn_output)  # (bsz, H, W, out_features)
+        attn_output = self.to_out[1](attn_output)  # (bsz, H, W, out_features)
+        attn_output = attn_output.permute(0, 3, 1, 2)  # (bsz, out_features, H, W)
+
+        # The residual and the rescale belong on the attention output that is
+        # returned. Applying them to the input tensor discarded both and, for
+        # the in-place rescale, modified the caller's tensor.
+        if self.residual_connection:
+            attn_output = attn_output + residual
+
+        if self.rescale_output_factor_inv != 1:
+            attn_output = attn_output * self.rescale_output_factor_inv
+
+        return attn_output
+
+
+class PermuteLayerNorm(nn.Module):
+    """Wrap a norm so that it sees (N, H, W, C) input while callers pass and
+    receive (N, C, H, W) tensors.
+    """
+
+    def __init__(self, original_norm):
+        super().__init__()
+        self.original_norm = original_norm
+
+    def forward(self, *args, **kwargs):
+        """Permute the first positional tensor, run the wrapped norm, and
+        permute the (first) resulting tensor back.
+
+        Outputs that are neither a tuple nor a tensor are returned unchanged.
+        """
+        # (N, C, H, W) -> (N, H, W, C) for the leading tensor argument only
+        if args and isinstance(args[0], torch.Tensor):
+            args = (args[0].permute(0, 2, 3, 1), *args[1:])
+
+        result = self.original_norm(*args, **kwargs)
+
+        # Some norms return a tuple; only its first element is permuted back
+        if isinstance(result, tuple):
+            return (result[0].permute(0, 3, 1, 2), *result[1:])
+        if isinstance(result, torch.Tensor):
+            return result.permute(0, 3, 1, 2)
+        return result
+
+
+def traverse_and_replace(
+    model: nn.Module,
+    target_type: type[torch.nn.Module],
+    replacement_fn: Callable[[torch.nn.Module], torch.nn.Module],
+):
+    """Walk the module tree and swap every module of ``target_type`` in place.
+
+    Direct children and entries of ``nn.ModuleList`` containers are replaced
+    by ``replacement_fn(module)``; everything else is searched recursively.
+    A replaced module is not descended into.
+
+    Args:
+        model (nn.Module): The model to traverse.
+        target_type (type): The type of modules to replace (e.g., Attention, GELU).
+        replacement_fn (callable): A function that takes a module instance and returns the replacement module.
+
+    """
+    for child_name, child in model.named_children():
+        if isinstance(child, target_type):
+            setattr(model, child_name, replacement_fn(child))
+            continue
+
+        if not isinstance(child, nn.ModuleList):
+            traverse_and_replace(child, target_type, replacement_fn)
+            continue
+
+        # ModuleList entries are replaced by index assignment
+        for position in range(len(child)):
+            entry = child[position]
+            if isinstance(entry, target_type):
+                child[position] = replacement_fn(entry)
+            else:
+                traverse_and_replace(entry, target_type, replacement_fn)
+
+
+def replace_attention_modules(model: nn.Module):
+    """Swap every Attention module found in ``model`` for an SHAAttention built
+    from it, including modules held in ModuleList containers.
+
+    Args:
+        model (nn.Module): The model in which to replace Attention modules.
+
+    """
+    # The class itself serves as the replacement factory: SHAAttention(orig_attn)
+    traverse_and_replace(model, attention_processor.Attention, SHAAttention)
+
+
+def replace_gelu_and_approx_gelu_with_conv2d(activation_module: nn.Module) -> nn.Module:
+    """Replaces the projection layer in GELU and ApproximateGELU activation modules from Linear to Conv2D.
+
+    The module is modified in place: its ``proj`` Linear layer is swapped for a
+    1x1 Conv2D carrying the same weights (and bias, if present).
+
+    Args:
+        activation_module (nn.Module): The activation module to replace.
+
+    Returns:
+        nn.Module: The same activation module, now with a Conv2D projection.
+
+    Raises:
+        TypeError: If ``activation_module`` is neither GELU nor ApproximateGELU.
+
+    """
+    # Raise instead of assert so the check survives `python -O`
+    if not isinstance(activation_module, (GELU, ApproximateGELU)):
+        raise TypeError(f"Unsupported activation module type for GELU replacement: {type(activation_module)}")
+
+    linear = activation_module.proj
+    dim_in = linear.in_features
+    dim_out = linear.out_features
+    bias = linear.bias is not None
+
+    # Define Conv2d projection
+    conv = nn.Conv2d(in_channels=dim_in, out_channels=dim_out, kernel_size=1, bias=bias)
+
+    # Copy weights from Linear to Conv2d: (dim_out, dim_in) -> (dim_out, dim_in, 1, 1)
+    with torch.no_grad():
+        conv.weight.copy_(linear.weight.view(dim_out, dim_in, 1, 1))
+        if bias:
+            conv.bias.copy_(linear.bias)
+
+    # Replace the Linear layer with Conv2d
+    activation_module.proj = conv
+    return activation_module
+
+
+class QcGEGLU(nn.Module):
+    r"""GEGLU activation rebuilt from two 1x1 Conv2D layers.
+
+    The single Linear projection of the original GEGLU is split row-wise into
+    a hidden projection (first half) and a gate projection (second half), so
+    no chunk operation is needed at run time.
+
+    Parameters:
+        original_geglu (GEGLU): The original GEGLU module to be replaced.
+
+    """
+
+    def __init__(self, original_geglu: GEGLU):
+        super().__init__()
+        linear = original_geglu.proj
+        dim_in = linear.in_features
+        dim_out = linear.out_features // 2  # half for the hidden part, half for the gate
+        has_bias = linear.bias is not None
+
+        self.hidden_proj = nn.Conv2d(in_channels=dim_in, out_channels=dim_out, kernel_size=1, bias=has_bias)
+        self.gate_proj = nn.Conv2d(in_channels=dim_in, out_channels=dim_out, kernel_size=1, bias=has_bias)
+
+        # Rows [0, dim_out) feed the hidden projection, rows [dim_out, end) the gate
+        hidden_rows, gate_rows = slice(None, dim_out), slice(dim_out, None)
+        with torch.no_grad():
+            weight = linear.weight.data  # Shape: [dim_out*2, dim_in]
+            self.hidden_proj.weight.copy_(weight[hidden_rows, :].view(dim_out, dim_in, 1, 1))
+            self.gate_proj.weight.copy_(weight[gate_rows, :].view(dim_out, dim_in, 1, 1))
+            if has_bias:
+                bias = linear.bias.data  # Shape: [dim_out*2]
+                self.hidden_proj.bias.copy_(bias[hidden_rows])
+                self.gate_proj.bias.copy_(bias[gate_rows])
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        return F.gelu(gate)
+
+    def forward(self, hidden_states, *args, **kwargs):
+        """Return ``hidden_proj(x) * gelu(gate_proj(x))``; extra arguments are ignored."""
+        projected = self.hidden_proj(hidden_states)  # (N, dim_out, H, W)
+        gate = self.gelu(self.gate_proj(hidden_states))  # (N, dim_out, H, W)
+        return projected * gate
+
+
+def replace_geglu_with_conv2d(activation_module: nn.Module) -> nn.Module:
+    """Replaces the original GEGLU activation module with the QcGEGLU module,
+    which uses two Conv2D layers and eliminates the chunk operation.
+
+    Args:
+        activation_module (nn.Module): The GEGLU activation module to replace.
+
+    Returns:
+        nn.Module: The QcGEGLU activation module with two Conv2D projections.
+
+    """
+    if isinstance(activation_module, GEGLU):
+        # Instantiate QcGEGLU with the original GEGLU module
+        qc_geglu = QcGEGLU(activation_module)
+        return qc_geglu
+    else:
+        raise TypeError(f"Unsupported activation module type for GEGLU replacement: {type(activation_module)}")
+
+
+def replace_activations_with_conv2d(model: nn.Module):
+    """Recursively traverses the model to find and replace GELU, GEGLU, and ApproximateGELU activation projections
+    from Linear layers to Conv2D layers, ensuring compatibility with NCHW input shapes.
+    Also handles activations nested within ModuleList containers.
+
+    Args:
+        model (nn.Module): The model in which to perform the replacement.
+
+    """
+    # Replace GELU and ApproximateGELU
+    traverse_and_replace(model, GELU, replace_gelu_and_approx_gelu_with_conv2d)
+    traverse_and_replace(model, ApproximateGELU, replace_gelu_and_approx_gelu_with_conv2d)
+
+    # Replace GEGLU
+    traverse_and_replace(model, GEGLU, replace_geglu_with_conv2d)
+
+
+def replace_feedforward_with_conv2d(feedforward_module: nn.Module) -> nn.Module:
+    """Replaces the nn.Linear layer in the FeedForward module with a Conv2D layer
+    to handle hidden_states of shape (N, C, H, W).
+
+    Args:
+        feedforward_module (nn.Module): The FeedForward module to replace.
+
+    Returns:
+        nn.Module: The FeedForward module with Conv2D layers instead of Linear layers.
+
+    """
+    if isinstance(feedforward_module, FeedForward):
+        # Create a new ModuleList to hold the modified layers
+        new_net = nn.ModuleList()
+        for module in feedforward_module.net:
+            if isinstance(module, nn.Linear):
+                # Define Conv2d projection
+                conv = nn.Conv2d(
+                    in_channels=module.in_features,
+                    out_channels=module.out_features,
+                    kernel_size=1,
+                    bias=module.bias is not None,
+                )
+                # Copy weights from Linear to Conv2d
+                with torch.no_grad():
+                    conv.weight.copy_(module.weight.data.view(module.out_features, module.in_features, 1, 1))
+                    if module.bias is not None:
+                        conv.bias.copy_(module.bias.data)
+                # Append the Conv2d layer instead of Linear
+                new_net.append(conv)
+            else:
+                # Append other modules (e.g., activation functions, Dropout) unchanged
+                new_net.append(module)
+        # Replace the original ModuleList with the new one containing Conv2d layers
+        feedforward_module.net = new_net
+        return feedforward_module
+    else:
+        raise TypeError(f"Unsupported module type for FeedForward replacement: {type(feedforward_module)}")
+
+
+def replace_feedforward_modules(model: nn.Module):
+    # Replace FeedForward modules' Linear layers with Conv2D
+    traverse_and_replace(model, FeedForward, replace_feedforward_with_conv2d)
+
+
+def replace_layer_norm_modules(model: nn.Module):
+    """Recursively traverses the model to find and replace all instances of
+    LayerNorm within BasicTransformerBlock with PermuteLayerNorm to be
+    compatible with optimized_operate_on_continuous_inputs.
+
+    Args:
+        model (nn.Module): The model in which to replace LayerNorm modules.
+
+    """
+
+    def replace_layer_norm(block: BasicTransformerBlock) -> BasicTransformerBlock:
+        """Replaces norm1, norm2, and norm3 within a BasicTransformerBlock.
+
+        Args:
+            block (BasicTransformerBlock): The transformer block to modify.
+
+        Returns:
+            BasicTransformerBlock: The modified transformer block.
+
+        """
+        block.norm1 = PermuteLayerNorm(block.norm1)
+        block.norm2 = PermuteLayerNorm(block.norm2)
+        if hasattr(block, "norm3"):
+            block.norm3 = PermuteLayerNorm(block.norm3)
+        return block
+
+    traverse_and_replace(model, BasicTransformerBlock, replace_layer_norm)
+
+
+def replace_transformer2d_modules(model: nn.Module):
+    """Recursively traverses the model to find and replace all instances of
+    Transformer2DModel: every instance gets the optimized continuous-input
+    methods bound onto it, and proj_in / proj_out are swapped for
+    Conv2dLinear whenever they are nn.Linear layers.
+    """
+
+    def _patch_instance(m: Transformer2DModel):
+        # bind the two optimized routines onto this instance only
+        m._operate_on_continuous_inputs = types.MethodType(  # type: ignore
+            optimized_operate_on_continuous_inputs, m
+        )
+        m._get_output_for_continuous_inputs = types.MethodType(  # type: ignore
+            optimized_get_output_for_continuous_inputs, m
+        )
+
+    def _replace_transformer2d(m: Transformer2DModel) -> Transformer2DModel:
+        # 1) patch the instance methods
+        _patch_instance(m)
+        # 2) swap out the Linear-based proj_in / proj_out for Conv2dLinear
+        if isinstance(m.proj_in, nn.Linear):
+            m.proj_in = Conv2dLinear(m.proj_in)
+        if isinstance(m.proj_out, nn.Linear):
+            m.proj_out = Conv2dLinear(m.proj_out)
+        return m
+
+    traverse_and_replace(model, Transformer2DModel, _replace_transformer2d)
+
+
+def optimized_operate_on_continuous_inputs(self, hidden_states):
+    """By using 4D NCHW hidden states, we can skip permutation and reshape
+    required in the HF implementation.
+    """
+    hidden_states = self.norm(hidden_states)
+    if not self.use_linear_projection:
+        hidden_states = self.proj_in(hidden_states)
+        inner_dim = hidden_states.shape[1]
+    else:
+        inner_dim = hidden_states.shape[1]
+        hidden_states = self.proj_in(hidden_states)
+    return hidden_states, inner_dim
+
+
+def optimized_get_output_for_continuous_inputs(self, hidden_states, residual, batch_size, height, width, inner_dim):
+    """Apply proj_out and add the residual; batch_size, height, width and inner_dim are not used here."""
+    hidden_states = self.proj_out(hidden_states)
+    return hidden_states + residual
+
+
+def get_timestep_embedding(sample: torch.Tensor, timestep: torch.Tensor):
+    """Adapted from diffusers.models.get_timestep_embedding.
+    Removes parameters unused by our implementation and supports batching.
+    """
+    embedding_dim = 320  # TODO: Extract from last unet layers
+    MAX_PERIOD = 10000
+    half_dim = embedding_dim // 2
+    exponent = -math.log(MAX_PERIOD) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timestep.device)
+    exponent = exponent / half_dim
+
+    emb = torch.exp(exponent)
+    emb = timestep.float() * emb
+
+    # concat sine and cosine embeddings
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+
+    # flip sine and cosine embeddings
+    emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
+
+    # zero pad
+    if embedding_dim % 2 == 1:
+        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
+
+    return emb
+
+
+def monkey_patch_model(model: UNet2DConditionModel):
+    """1. Apply monkey patches
+    2. Apply module replacements for targeted modules whenever monkeypatch
+    code is too long
+
+    Note:
+    - This monkeypatch is verified against diffusers==0.31.0 on stable
+    diffusion 1.5
+
+    """
+    print("Monkeypatching Unet (replacing MHA with SHA attention etc)")
+    replace_attention_modules(model)
+    replace_activations_with_conv2d(model)
+    replace_layer_norm_modules(model)
+    replace_feedforward_modules(model)
+    replace_transformer2d_modules(model)
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/model_project.config b/sd-legacy-stable-diffusion-v1-5/aitk/model_project.config
new file mode 100644
index 00000000..516e503b
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/model_project.config
@@ -0,0 +1,12 @@
+{
+    "workflows": [
+        {
+            "file": "sd_qnn_workflow.json",
+            "templateName": "sd_qnn_workflow"
+        }
+    ],
+    "modelInfo": {
+        "id": "huggingface/stable-diffusion-v1-5/stable-diffusion-v1-5",
+        "version": 1
+    }
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_evaluation.py b/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_evaluation.py
new file mode 100644
index 00000000..5e4c0418
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_evaluation.py
@@ -0,0 +1,145 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import json
+import logging
+import os
+from pathlib import Path
+
+import numpy as np
+import onnxruntime as ort
+
+from sd_utils.qdq import OnnxStableDiffusionPipelineWithSave
+
+logger = logging.getLogger(os.path.basename(__file__))
+logging.basicConfig(level=logging.INFO)
+
+def parse_args(raw_args):
+    import argparse
+
+    parser = argparse.ArgumentParser("Common arguments")
+
+    parser.add_argument("--script_dir", required=True, type=str)
+    parser.add_argument("--model_dir", default="optimized", type=str, help="model_dir path")
+    parser.add_argument("--model_id", default="stable-diffusion-v1-5/stable-diffusion-v1-5", type=str)
+    parser.add_argument(
+        "--guidance_scale",
+        default=7.5,
+        type=float,
+        help="Guidance scale as defined in Classifier-Free Diffusion Guidance",
+    )
+    parser.add_argument("--num_inference_steps", default=25, type=int, help="Number of steps in diffusion process")
+    parser.add_argument(
+        "--seed",
+        default=None,
+        type=int,
+        help="The seed to give to the generator to generate deterministic results.",
+    )
+    parser.add_argument("--image_size", default=512, type=int, help="Width and height of the images to generate")
+    parser.add_argument(
+        "--execution_provider",
+        type=str,
+        default="CPUExecutionProvider",
+        help="ORT Execution provider",
+    )
+    parser.add_argument(
+        "--device_str",
+        type=str,
+        default="cpu",
+    )
+    parser.add_argument(
+        "--output_file",
+        type=str,
+        required=True,
+    )
+    return parser.parse_args(raw_args)
+
+def get_device_type(device_str):
+    if device_str.lower() == "gpu":
+        return ort.OrtHardwareDeviceType.GPU
+    elif device_str.lower() == "npu":
+        return ort.OrtHardwareDeviceType.NPU
+    else:
+        return ort.OrtHardwareDeviceType.CPU
+
+def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):
+    ep_devices = ort.get_ep_devices()
+    for ep_device in ep_devices:
+        if ep_device.ep_name == ep_name and ep_device.device.type == device_type:
+            print(f"Adding {ep_name} for {device_type}")
+            session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)
+            break
+
+def register_execution_providers():
+    import subprocess
+    import sys
+
+    worker_script = os.path.abspath('winml.py')
+    result = subprocess.check_output([sys.executable, worker_script], text=True)
+    paths = json.loads(result)
+    for item in paths.items():
+        try:
+            ort.register_execution_provider_library(item[0], item[1])
+        except Exception as e:
+            print(f"Failed to register execution provider {item[0]}: {e}")
+
+
+def main(raw_args=None):
+    args = parse_args(raw_args)
+
+    prompts = ["A baby is laying down with a teddy bear"]
+    model_dir = Path(args.script_dir) / "models" / args.model_dir / args.model_id
+
+    register_execution_providers()
+
+    sess_options = ort.SessionOptions()
+    provider_options = [{}]
+
+    add_ep_for_device(sess_options, args.execution_provider, get_device_type(args.device_str))
+
+    pipeline = OnnxStableDiffusionPipelineWithSave.from_pretrained(
+        model_dir, provider=args.execution_provider, sess_options=sess_options, provider_options=provider_options
+    )
+    pipeline.save_data_dir = None
+
+    text_encoder_latencies = []
+    unet_latencies = []
+    vae_decoder_latencies = []
+
+    generator = None if args.seed is None else np.random.RandomState(seed=args.seed)
+
+    # we know that PatchedOnnxRuntimeModel records latencies for each call
+    for _ in range(10):
+        pipeline.text_encoder.latencies = []
+        pipeline.unet.latencies = []
+        pipeline.vae_decoder.latencies = []
+        pipeline(
+            prompts,
+            num_inference_steps=args.num_inference_steps,
+            height=args.image_size,
+            width=args.image_size,
+            guidance_scale=args.guidance_scale,
+            generator=generator,
+        )
+        text_encoder_latencies.extend(pipeline.text_encoder.latencies)
+        unet_latencies.extend(pipeline.unet.latencies)
+        vae_decoder_latencies.extend(pipeline.vae_decoder.latencies)
+
+    text_encoder_latency_avg = round(sum(text_encoder_latencies) / len(text_encoder_latencies) * 1000, 5)
+    unet_latency_avg = round(sum(unet_latencies) / len(unet_latencies) * 1000, 5)
+    vae_decoder_latency_avg = round(sum(vae_decoder_latencies) / len(vae_decoder_latencies) * 1000, 5)
+
+    metrics = {
+        "text-encoder-latency-avg": text_encoder_latency_avg,
+        "unet-latency-avg": unet_latency_avg,
+        "vae-decoder-latency-avg": vae_decoder_latency_avg
+    }
+    resultStr = json.dumps(metrics, indent=4)
+    with open(args.output_file, 'w') as file:
+        file.write(resultStr)
+    logger.info("Model lab succeeded for evaluation.\n%s", resultStr)
+
+if __name__ == "__main__":
+    main()
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.json b/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.json
new file mode 100644
index 00000000..258dfee9
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.json
@@ -0,0 +1,39 @@
+{
+    "input_model": {
+        "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5"
+    },
+    "systems": {
+        "target_system": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "npu",
+                    "execution_providers": [
+                        "QNNExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "evaluators": {
+        "common_evaluator": {
+        }
+    },
+    "passes": {
+        "aitkpython": {
+            "type": "AitkPython",
+            "activation_type": "uint16",
+            "precision": "uint8",
+            "dataset_name": "phiyodr/coco2017",
+            "split": "train",
+            "length": 100,
+            "user_script": "sd_qnn_workflow.py"
+        }
+    },
+    "evaluator": "common_evaluator",
+    "evaluate_input_model": false,
+    "target": "target_system",
+    "clean_cache": false,
+    "output_dir": "model/sd_qnn",
+    "cache_dir": "cache"
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.json.config b/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.json.config
new file mode 100644
index 00000000..0d7d2cf3
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.json.config
@@ -0,0 +1,230 @@
+{
+    "name": "Convert to Qualcomm NPU",
+    "evalMetrics": {
+        "text-encoder-latency-avg": "text-encoder-latency-avg",
+        "unet-latency-avg": "unet-latency-avg",
+        "vae-decoder-latency-avg": "vae-decoder-latency-avg"
+    },
+    "evalNoDataConfig": true,
+    "memoryGbSuggested": 30,
+    "executeRuntimeFeatures": [
+        "SD"
+    ],
+    "evaluationRuntimeFeatures": [
+        "SD"
+    ],
+    "addCpu": true,
+    "runtime": {
+        "autoGenerated": true,
+        "name": "Evaluate on",
+        "type": "enum",
+        "displayNames": [
+            "Qualcomm NPU",
+            "CPU"
+        ],
+        "path": "systems.target_system.accelerators.0.execution_providers.0",
+        "values": [
+            "QNNExecutionProvider",
+            "CPUExecutionProvider"
+        ],
+        "actions": [
+            [
+                {
+                    "type": "update",
+                    "path": "systems.target_system.accelerators.0.execution_providers.0",
+                    "value": "QNNExecutionProvider"
+                },
+                {
+                    "type": "update",
+                    "path": "systems.target_system.accelerators.0.device",
+                    "value": "npu"
+                }
+            ],
+            [
+                {
+                    "type": "update",
+                    "path": "systems.target_system.accelerators.0.execution_providers.0",
+                    "value": "CPUExecutionProvider"
+                },
+                {
+                    "type": "update",
+                    "path": "systems.target_system.accelerators.0.device",
+                    "value": "cpu"
+                }
+            ]
+        ],
+        "readOnly": false
+    },
+    "optimizationPaths": [
+        {
+            "path": "passes.aitkpython.precision",
+            "name": "WeightType"
+        },
+        {
+            "path": "passes.aitkpython.activation_type",
+            "name": "ActivationType"
+        }
+    ],
+    "optimizationDefault": "w8a16",
+    "aitkPython": "sd_qnn_workflow.py",
+    "sections": [
+        {
+            "autoGenerated": true,
+            "name": "Convert",
+            "phase": "Conversion",
+            "parameters": [],
+            "toggle": {
+                "autoGenerated": true,
+                "name": "Convert to ONNX format",
+                "type": "bool",
+                "path": "passes.aitkpython",
+                "actions": [
+                    [],
+                    []
+                ],
+                "readOnly": true
+            }
+        },
+        {
+            "name": "Quantize",
+            "phase": "Quantization",
+            "parameters": [
+                {
+                    "name": "Activation Type",
+                    "tags": [
+                        "ActivationType"
+                    ],
+                    "description": "Quantization data type of activation. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.",
+                    "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html",
+                    "type": "enum",
+                    "displayNames": [
+                        "Int8",
+                        "UInt8",
+                        "Int16",
+                        "UInt16"
+                    ],
+                    "displayType": "RadioGroup",
+                    "path": "passes.aitkpython.activation_type",
+                    "values": [
+                        "int8",
+                        "uint8",
+                        "int16",
+                        "uint16"
+                    ],
+                    "template": {
+                        "path": "passes.aitkpython.activation_type",
+                        "template": "ActivationType"
+                    }
+                },
+                {
+                    "name": "Weight Type",
+                    "tags": [
+                        "WeightType"
+                    ],
+                    "description": "Data type for quantizing weights. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.",
+                    "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html",
+                    "type": "enum",
+                    "displayNames": [
+                        "Int8",
+                        "UInt8",
+                        "Int16",
+                        "UInt16"
+                    ],
+                    "displayType": "RadioGroup",
+                    "path": "passes.aitkpython.precision",
+                    "values": [
+                        "int8",
+                        "uint8",
+                        "int16",
+                        "uint16"
+                    ],
+                    "template": {
+                        "path": "passes.aitkpython.precision",
+                        "template": "WeightType"
+                    }
+                },
+                {
+                    "name": "Quantization Dataset",
+                    "tags": [
+                        "QuantizationDataset"
+                    ],
+                    "type": "enum",
+                    "path": "passes.aitkpython.dataset_name",
+                    "values": [
+                        "phiyodr/coco2017"
+                    ],
+                    "template": {
+                        "path": "passes.aitkpython.dataset_name",
+                        "values": [
+                            "phiyodr/coco2017"
+                        ],
+                        "template": "QuantizationDataset"
+                    }
+                },
+                {
+                    "name": "Quantization Dataset Split",
+                    "tags": [
+                        "QuantizationDatasetSplit",
+                        "DependsOnDataset"
+                    ],
+                    "type": "enum",
+                    "path": "passes.aitkpython.split",
+                    "values": [
+                        "train",
+                        "validation"
+                    ],
+                    "template": {
+                        "path": "passes.aitkpython.split",
+                        "values": [
+                            "train",
+                            "validation"
+                        ],
+                        "template": "QuantizationDatasetSplit"
+                    }
+                },
+                {
+                    "name": "Quantization Dataset Size",
+                    "description": "ATTENTION! Using 100 samples of the coco2017 dataset to generate the required real data requires a large amount of disk space.",
+                    "type": "int",
+                    "path": "passes.aitkpython.length",
+                    "template": {
+                        "description": "ATTENTION! Using 100 samples of the coco2017 dataset to generate the required real data requires a large amount of disk space.",
+                        "path": "passes.aitkpython.length",
+                        "template": "QuantizationDatasetSize"
+                    }
+                }
+            ],
+            "disableToggleGeneration": true,
+            "toggle": {
+                "name": "Quantize model",
+                "type": "bool",
+                "path": "passes.aitkpython",
+                "actions": [
+                    [],
+                    []
+                ],
+                "readOnly": true
+            }
+        },
+        {
+            "name": "Evaluate",
+            "phase": "Evaluation",
+            "parameters": [],
+            "toggle": {
+                "autoGenerated": true,
+                "name": "Evaluate model performance",
+                "type": "bool",
+                "path": "evaluator",
+                "actions": [
+                    [],
+                    [
+                        {
+                            "type": "delete",
+                            "path": "evaluator"
+                        }
+                    ]
+                ]
+            }
+        }
+    ]
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.py b/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.py
new file mode 100644
index 00000000..abe175ec
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.py
@@ -0,0 +1,132 @@
+import argparse
+import json
+import os
+import subprocess
+import sys
+import logging
+
+logger = logging.getLogger(os.path.basename(__file__))
+logging.basicConfig(level=logging.INFO)
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", required=True, help="path to input config file")
+    parser.add_argument("--model_config", help="path to input model config file")
+    parser.add_argument("--runtime", required=True, help="runtime")
+    return parser.parse_args()
+
+def load_update_config(
+        config_path: str,
+        cache_dir: str,
+        output_dir: str,
+        activation_type: str | None = None,
+        precision: str | None = None) -> dict:
+    with open(config_path, 'r', encoding='utf-8') as file:
+        oliveJson = json.load(file)
+
+    oliveJson["cache_dir"] = cache_dir
+    oliveJson["output_dir"] = os.path.join(os.path.dirname(output_dir), oliveJson["output_dir"])
+
+    if "quantization" in oliveJson["passes"]:
+        if activation_type is not None:
+            oliveJson["passes"]["quantization"]["activation_type"] = activation_type
+        if precision is not None:
+            oliveJson["passes"]["quantization"]["precision"] = precision
+
+    return oliveJson
+
+def copy_olive_config(
+        history_folder: str,
+        config_path: str,
+        cache_dir: str,
+        output_dir: str,
+        activation_type: str | None = None,
+        precision: str | None = None):
+    logger.info(f"Copying {config_path} to {history_folder}...")
+    oliveJson = load_update_config(config_path, cache_dir, output_dir, activation_type, precision)
+    # save updated config for record
+    config_name = os.path.basename(config_path)
+    os.makedirs(history_folder, exist_ok=True)
+    with open(os.path.join(history_folder, config_name), 'w', encoding='utf-8') as file:
+        json.dump(oliveJson, file, indent=4)
+
+def main():
+    args = parse_arguments()
+
+    with open(args.config, 'r', encoding='utf-8') as file:
+        oliveJson = json.load(file)
+
+    if args.model_config:
+        model_path: str = os.path.dirname(args.model_config)
+        execution_provider: str = oliveJson["systems"]["target_system"]["accelerators"][0]["execution_providers"][0]
+        device_str: str = oliveJson["systems"]["target_system"]["accelerators"][0]["device"]
+        output_file = os.path.join(os.path.dirname(args.config), "metrics.json")
+
+        # Run evaluator
+        subprocess.run([sys.executable, "sd_qnn_evaluation.py",
+                        "--script_dir", os.path.dirname(model_path),
+                        "--model_dir", "optimized",
+                        "--model_id", "stable-diffusion-v1-5/stable-diffusion-v1-5",
+                        "--execution_provider", execution_provider,
+                        "--device_str", device_str,
+                        "--output_file", output_file],
+                        check=True)
+        return
+
+
+    # Get arguments
+    output_dir: str = oliveJson["output_dir"]
+    cache_dir: str = oliveJson["cache_dir"]
+    config_pass = oliveJson["passes"]["aitkpython"]
+    activation_type: str = config_pass["activation_type"]
+    precision: str = config_pass["precision"]
+
+    dataset_name: str = config_pass["dataset_name"]
+    dataset_split: str = config_pass["split"]
+    num_data: int = config_pass["length"]
+
+    history_folder = os.path.dirname(args.config)
+
+    logger.info(f"history dir: {history_folder}")
+    os.makedirs(os.path.join(history_folder, "model"), exist_ok=True)
+
+    submodel_names = ["vae_encoder", "vae_decoder", "unet", "text_encoder", "safety_checker"]
+
+    for submodel_name in submodel_names:
+        config_name = f"config_{submodel_name}.json"
+        copy_olive_config(history_folder, config_name, cache_dir, output_dir, activation_type, precision)
+
+    # run stable_diffusion.py to generate onnx unoptimized model
+    subprocess.run([sys.executable, "stable_diffusion.py",
+                    "--script_dir", history_folder,
+                    "--model_id", "stable-diffusion-v1-5/stable-diffusion-v1-5",
+                    "--provider", "cpu",
+                    "--format", "qdq",
+                    "--optimize",
+                    "--only_conversion"],
+                   check=True)
+
+    # run evaluation.py to generate data
+    subprocess.run([sys.executable, "evaluation.py",
+                    "--script_dir", history_folder,
+                    "--save_data",
+                    "--model_id", "stable-diffusion-v1-5/stable-diffusion-v1-5",
+                    "--num_inference_steps", "25",
+                    "--seed", "0",
+                    "--dataset_name", dataset_name,
+                    "--dataset_split", dataset_split,
+                    "--num_data", str(num_data),
+                    "--guidance_scale", "7.5"],
+                   check=True)
+
+    # run stable_diffusion.py to generate onnx quantized model
+    subprocess.run([sys.executable, "stable_diffusion.py",
+                    "--script_dir", history_folder,
+                    "--model_id", "stable-diffusion-v1-5/stable-diffusion-v1-5",
+                    "--provider", "cpu",
+                    "--format", "qdq",
+                    "--optimize"],
+                   check=True)
+
+if __name__ == "__main__":
+    main()
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/config.py b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/config.py
new file mode 100644
index 00000000..b4113cd8
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/config.py
@@ -0,0 +1,10 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+vae_sample_size = 512
+unet_sample_size = 64
+cross_attention_dim = 768
+only_conversion = False
+data_dir = "quantize_data"
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/onnx_patch.py b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/onnx_patch.py
new file mode 100644
index 00000000..9855a31d
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/onnx_patch.py
@@ -0,0 +1,97 @@
+# --------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import os
+import time
+import shutil
+from pathlib import Path
+from typing import Optional, Union
+
+from diffusers import OnnxRuntimeModel
+from diffusers.utils import ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME
+
+import numpy as np
+import onnxruntime as ort
+
+
+class PatchedOnnxRuntimeModel(OnnxRuntimeModel):
+    """OnnxRuntimeModel variant used by the Stable Diffusion helpers.
+
+    Records the latency of every call and saves by copying files from the directory the model was loaded from.
+    """
+
+    def __init__(self, model=None, **kwargs):
+        # The parent initializer is not invoked here; only these three attributes are set.
+        self.model = model
+        self.model_save_dir = kwargs.get("model_save_dir")
+        self.latencies = []
+
+    def __call__(self, **kwargs):
+        """Run the session on the keyword inputs (converted to NumPy arrays) and record the elapsed seconds."""
+        feeds = {name: np.array(value) for name, value in kwargs.items()}
+        began = time.perf_counter()
+        results = self.model.run(None, feeds)
+        self.latencies.append(time.perf_counter() - began)
+        return results
+
+    @staticmethod
+    def load_model(path: Union[str, Path], provider=None, sess_options=None, provider_options=None):
+        """Create an InferenceSession for ``path``; ``provider`` and ``provider_options`` are accepted but unused."""
+        return ort.InferenceSession(path, sess_options=sess_options)
+
+    def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional[str] = None, **kwargs):
+        """Copy the ONNX weights, plus the external weights file when it exists, into ``save_directory``.
+
+        ``file_name`` and ``kwargs`` are ignored. Copying a file onto itself is silently skipped.
+        """
+        for required, name in ((True, ONNX_WEIGHTS_NAME), (False, ONNX_EXTERNAL_WEIGHTS_NAME)):
+            source = self.model_save_dir.joinpath(name)
+            # Only the external weights file (for models >2GB) is optional.
+            if not required and not source.exists():
+                continue
+            try:
+                shutil.copyfile(source, Path(save_directory).joinpath(name))
+            except shutil.SameFileError:
+                pass
+
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        **kwargs,
+    ):
+        """Create ``save_directory`` if needed and copy the model files into it; a file path is rejected."""
+        if os.path.isfile(save_directory):
+            print(f"Provided path ({save_directory}) should be a directory, not a file")
+            return
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        self._save_pretrained(save_directory, **kwargs)
+
+    @classmethod
+    def _from_pretrained(
+        cls,
+        model_id: Union[str, Path],
+        provider: Optional[str] = None,
+        sess_options: Optional["ort.SessionOptions"] = None,
+        **kwargs,
+    ):
+        """Load ``<model_id>/<ONNX_WEIGHTS_NAME>`` and wrap it, remembering ``model_id`` as the save directory."""
+        session = PatchedOnnxRuntimeModel.load_model(
+            Path(model_id, ONNX_WEIGHTS_NAME).as_posix(),
+            provider=provider,
+            sess_options=sess_options,
+        )
+        kwargs["model_save_dir"] = Path(model_id)
+        return cls(model=session, **kwargs)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_id: Union[str, Path],
+        **model_kwargs,
+    ):
+        """Forward ``model_id`` and all keyword arguments to ``_from_pretrained``."""
+        return cls._from_pretrained(model_id=model_id, **model_kwargs)
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/ort.py b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/ort.py
new file mode 100644
index 00000000..3f9c8e9b
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/ort.py
@@ -0,0 +1,184 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import json
+import shutil
+import sys
+from pathlib import Path
+
+import onnxruntime as ort
+
+from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion import OnnxStableDiffusionPipeline
+from olive.model import ONNXModelHandler
+from onnxruntime import __version__ as OrtVersion
+from packaging import version
+from sd_utils import config as sd_config
+
+# ruff: noqa: TID252, T201
+
+
+def update_cuda_config(config_cuda: dict):
+    """Restrict ``config_cuda`` in place to the CUDA convert/optimize passes and return it."""
+    passes = config_cuda["passes"]
+    if version.parse(OrtVersion) < version.parse("1.17.0"):
+        # skip_group_norm fusion is turned off: a shape inference bug there leads to invalid models
+        passes["optimize_cuda"]["optimization_options"] = {"enable_skip_group_norm": False}
+    for unused_pass in set(passes) - {"convert", "optimize_cuda"}:
+        passes.pop(unused_pass, None)
+    config_cuda["systems"]["local_system"]["accelerators"][0]["execution_providers"] = ["CUDAExecutionProvider"]
+    return config_cuda
+
+
+def validate_args(args, provider):
+    """Set ORT's default logger severity to 4, warn if --static_dims was given, and validate the ORT version."""
+    ort.set_default_logger_severity(4)
+    if args.static_dims:
+        print(
+            "WARNING: the --static_dims option is deprecated, and static shape optimization is enabled by default. "
+            "Use --dynamic_dims to disable static shape optimization."
+        )
+    validate_ort_version(provider)
+
+
+def validate_ort_version(provider: str):  # the version gate applies to the "cuda" provider only
+    if provider == "cuda" and version.parse(OrtVersion) < version.parse("1.17.0"):
+        if version.parse(OrtVersion) < version.parse("1.16.2"):
+            print("This script requires onnxruntime-gpu 1.16.2 or newer")
+            sys.exit(1)  # versions below 1.16.2 are rejected: terminate with exit status 1
+        print(  # reached for 1.16.2 <= version < 1.17.0: continue, but warn
+            f"WARNING: onnxruntime {OrtVersion} has known issues with shape inference for SkipGroupNorm. Will disable"
+            " skip_group_norm fusion. onnxruntime-gpu 1.17.0 or newer is strongly recommended!"
+        )
+
+
+def save_optimized_onnx_submodel(script_dir, submodel_name, provider, model_info):
+    """Store the unoptimized/optimized ONNX paths of ``submodel_name`` in ``model_info`` from its footprint.json.
+
+    The last OnnxConversion footprint and the last optimization-pass footprint are used; ``provider`` is unused.
+    """
+    optimizer_passes = {
+        pass_name.lower()
+        for pass_name in (
+            "OrtTransformersOptimization",
+            "OnnxStaticQuantization",
+            "EPContextBinaryGenerator",
+            "DynamicToFixedShape",
+        )
+    }
+    footprints_file_path = Path(script_dir).resolve() / "footprints" / submodel_name / "footprint.json"
+    with footprints_file_path.open("r") as footprint_file:
+        footprints = json.load(footprint_file)
+
+    conversion_footprint = optimizer_footprint = None
+    for footprint in footprints.values():
+        from_pass = (footprint["from_pass"] or "").lower()
+        if from_pass == "OnnxConversion".lower():
+            conversion_footprint = footprint
+            if sd_config.only_conversion:  # conversion-only runs reuse the converted model as the optimized one
+                optimizer_footprint = footprint
+        elif from_pass in optimizer_passes:
+            optimizer_footprint = footprint
+
+    assert conversion_footprint
+    assert optimizer_footprint
+
+    unoptimized_olive_model = ONNXModelHandler(**conversion_footprint["model_config"]["config"])
+    optimized_olive_model = ONNXModelHandler(**optimizer_footprint["model_config"]["config"])
+    unoptimized_path = Path(unoptimized_olive_model.model_path)
+    optimized_path = Path(optimized_olive_model.model_path)
+    model_info[submodel_name] = {"unoptimized": {"path": unoptimized_path}, "optimized": {"path": optimized_path}}
+    print(f"Unoptimized Model : {unoptimized_path}")
+    print(f"Optimized Model   : {optimized_path}")
+
+
+def save_onnx_pipeline(
+    has_safety_checker, model_info, optimized_model_dir, unoptimized_model_dir, pipeline, submodel_names
+):
+    """Save a diffusers-loadable unoptimized ONNX pipeline, then a copy with the optimized submodels swapped in.
+
+    This step is optional. ``optimized_model_dir`` must be a not-yet-existing ``Path`` (``shutil.copytree`` creates it).
+    """
+    print("\nCreating ONNX pipeline...")
+    from sd_utils.onnx_patch import PatchedOnnxRuntimeModel
+
+    if has_safety_checker:
+        safety_checker = PatchedOnnxRuntimeModel.from_pretrained(model_info["safety_checker"]["unoptimized"]["path"].parent)
+    else:
+        safety_checker = None
+
+    onnx_pipeline = OnnxStableDiffusionPipeline(
+        vae_encoder=PatchedOnnxRuntimeModel.from_pretrained(model_info["vae_encoder"]["unoptimized"]["path"].parent),
+        vae_decoder=PatchedOnnxRuntimeModel.from_pretrained(model_info["vae_decoder"]["unoptimized"]["path"].parent),
+        text_encoder=PatchedOnnxRuntimeModel.from_pretrained(model_info["text_encoder"]["unoptimized"]["path"].parent),
+        tokenizer=pipeline.tokenizer,
+        unet=PatchedOnnxRuntimeModel.from_pretrained(model_info["unet"]["unoptimized"]["path"].parent, provider_options=[]),
+        scheduler=pipeline.scheduler,
+        safety_checker=safety_checker,
+        feature_extractor=pipeline.feature_extractor,
+        requires_safety_checker=True,
+    )
+
+    print("Saving unoptimized models...")
+    onnx_pipeline.save_pretrained(unoptimized_model_dir)
+
+    # Create a copy of the unoptimized model directory, then overwrite with optimized models from the olive cache.
+    print("Copying optimized models...")
+    shutil.copytree(unoptimized_model_dir, optimized_model_dir, ignore=shutil.ignore_patterns("weights.pb"))
+    for submodel_name in submodel_names:
+        src_path = Path(model_info[submodel_name]["optimized"]["path"])
+        dst_path = optimized_model_dir / submodel_name / "model.onnx"
+        shutil.copyfile(src_path, dst_path)
+        # Copy the QNN context bin if present; derive its name from the file name only, never from the directories.
+        src_ctx_path = src_path.with_name(f"{src_path.stem}_qnn.bin")
+        if src_ctx_path.exists():
+            dst_ctx_path = optimized_model_dir / submodel_name / "model_ctx_qnn.bin"
+            shutil.copyfile(src_ctx_path, dst_ctx_path)
+
+    print(f"The optimized pipeline is located here: {optimized_model_dir}")
+
+
+def get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale):
+    """Load an OnnxStableDiffusionPipeline from ``model_dir``, pinning free dimensions unless dynamic dims are on.
+
+    Only the "cuda" provider is mapped to an execution provider; any other value fails the assertion.
+    """
+    ort.set_default_logger_severity(3)
+    print("Loading models into ORT session...")
+    sess_options = ort.SessionOptions()
+    sess_options.enable_mem_pattern = False
+
+    static_dims = not ort_args.dynamic_dims
+    batch_size = common_args.batch_size
+    image_size = common_args.image_size
+    provider = common_args.provider
+    vae_sample_size, unet_sample_size = sd_config.vae_sample_size, sd_config.unet_sample_size
+    if static_dims:
+        # batch_size is doubled for sample & hidden state because of classifier free guidance:
+        # https://github.com/huggingface/diffusers/blob/46c52f9b9607e6ecb29c782c052aea313e6487b7/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L672
+        hidden_batch_size = batch_size if (guidance_scale <= 1.0) else batch_size * 2
+        overrides = {
+            "unet_sample_batch": hidden_batch_size,
+            "unet_sample_channels": 4,
+            "unet_sample_height": image_size // 8,
+            "unet_sample_width": image_size // 8,
+            "unet_time_batch": 1,
+            "unet_hidden_batch": hidden_batch_size,
+            "unet_hidden_sequence": 77,
+            "decoder_batch": batch_size,
+            "decoder_channels": 4,
+            "decoder_height": unet_sample_size,
+            "decoder_width": unet_sample_size,
+            "encoder_batch": batch_size,
+            "encoder_channels": 3,
+            "encoder_height": vae_sample_size,
+            "encoder_width": vae_sample_size,
+        }
+        for dim_name, dim_value in overrides.items():
+            sess_options.add_free_dimension_override_by_name(dim_name, dim_value)
+
+    provider_map = {"cuda": "CUDAExecutionProvider"}
+    assert provider in provider_map, f"Unsupported provider: {provider}"
+    ep_name = provider_map[provider]
+    return OnnxStableDiffusionPipeline.from_pretrained(model_dir, provider=ep_name, sess_options=sess_options)
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/ov.py b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/ov.py
new file mode 100644
index 00000000..aa784dc7
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/ov.py
@@ -0,0 +1,531 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# Example modified from: https://docs.openvino.ai/2023.3/notebooks/225-stable-diffusion-text-to-image-with-output.html
+# --------------------------------------------------------------------------
+import inspect
+import json
+import shutil
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, Optional, Union
+
+import cv2
+import numpy as np
+import openvino as ov
+import PIL
+import torch
+from diffusers import StableDiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from openvino.runtime import Model
+from transformers import CLIPTokenizer
+
+OV_OPTIMIZED_MODEL_INFO = "ov_optimized_model_info.json"
+
+# ruff: noqa: T201
+
+
+def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int):
+    """Compute the size an image should be resized to so that it fits a destination window.
+
+    The image is scaled uniformly (aspect ratio preserved) by the smaller of the two window/image ratios,
+    and both results are truncated to integers.
+
+    Args:
+      dst_width (int): destination window width
+      dst_height (int): destination window height
+      image_width (int): source image width
+      image_height (int): source image height
+
+    Returns:
+      tuple[int, int]: ``(result_width, result_height)`` to use for the resize
+    """
+    height_ratio, width_ratio = dst_height / image_height, dst_width / image_width
+    ratio = min(height_ratio, width_ratio)
+    return int(ratio * image_width), int(ratio * image_height)
+
+
+def preprocess(image: PIL.Image.Image):
+    """Turn a PIL image into the normalized NCHW tensor for the 512x512 model input window.
+
+    Steps:
+      1. Resize with LANCZOS resampling so the image fits a 512x512 window while keeping its aspect ratio.
+      2. Convert to np.ndarray, add a leading batch axis and zero-pad on the right/bottom up to 512x512.
+      3. Cast to float32 and map the value range from [0, 255] to [-1, 1].
+      4. Reorder the data layout from NHWC to NCHW.
+
+    The returned metadata carries the padding and the source size, which can be used in postprocessing.
+
+    Args:
+      image (PIL.Image.Image): input image
+    Returns:
+       image (np.ndarray): preprocessed image tensor
+       meta (Dict): dictionary with the "padding", "src_width" and "src_height" entries
+
+    """
+    src_width, src_height = image.size
+    fit_width, fit_height = scale_fit_to_window(512, 512, src_width, src_height)
+    resized = image.resize((fit_width, fit_height), resample=PIL.Image.Resampling.LANCZOS)
+    # Pad only after the last row/column, so the image content stays anchored at the top-left corner.
+    pad = ((0, 0), (0, 512 - fit_height), (0, 512 - fit_width), (0, 0))
+    tensor = np.pad(np.array(resized)[None, :], pad, mode="constant")
+    # [0, 255] -> [0, 1] -> [-1, 1]
+    tensor = 2.0 * (tensor.astype(np.float32) / 255.0) - 1.0
+    meta = {"padding": pad, "src_width": src_width, "src_height": src_height}
+    return tensor.transpose(0, 3, 1, 2), meta
+
+
+@dataclass
+class OvStableDiffusionPipelineOutput:
+    """Output class for Stable Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            Denoised images: a list of PIL images of length `batch_size`, or a NumPy array of shape
+            `(batch_size, height, width, num_channels)`.
+        nsfw_content_detected (`List[bool]`, *optional*): defaults to None; not set by the pipeline in this module.
+    """
+
+    images: Union[list[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[list[bool]] = None
+
+
+class OVStableDiffusionPipeline(DiffusionPipeline):
+    def __init__(
+        self,
+        vae_decoder: Model,
+        text_encoder: Model,
+        tokenizer: CLIPTokenizer,
+        unet: Model,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        vae_encoder: Model = None,
+    ):
+        """Pipeline for text-to-image generation using Stable Diffusion.
+
+        Args:
+            vae_decoder (Model):
+                Variational Auto-Encoder (VAE) Model to decode images to and from latent representations.
+            text_encoder (Model):
+                Frozen text-encoder. Stable Diffusion uses the text portion of
+                [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+                the clip-vit-large-patch14(https://huggingface.co/openai/clip-vit-large-patch14) variant.
+            tokenizer (CLIPTokenizer):
+                Tokenizer of class CLIPTokenizer
+                (https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+            unet (Model): Conditional U-Net architecture to denoise the encoded image latents.
+            scheduler (SchedulerMixin):
+                A scheduler to be used in combination with unet to denoise the encoded image latents. Can be one of
+                DDIMScheduler, LMSDiscreteScheduler, or PNDMScheduler.
+            vae_encoder (Model):
+                Variational Auto-Encoder (VAE) Model to encode images into latents; only used for image inputs.
+
+        """
+        super().__init__()
+        self.scheduler = scheduler
+        self.vae_decoder = vae_decoder
+        self.vae_encoder = vae_encoder
+        self.text_encoder = text_encoder
+        self.unet = unet
+        self._text_encoder_output = text_encoder.output(0)
+        self._unet_output = unet.output(0)
+        self._vae_d_output = vae_decoder.output(0)
+        self._vae_e_output = vae_encoder.output(0) if vae_encoder is not None else None
+        self.height = 512
+        self.width = 512
+        self.tokenizer = tokenizer
+
+    def __call__(
+        self,
+        prompts: Union[str, list[str]],
+        image: PIL.Image.Image = None,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        negative_prompt: Union[str, list[str]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        guidance_scale: Optional[float] = 7.5,
+        eta: Optional[float] = 0.0,
+        output_type: Optional[str] = "pil",
+        seed: Optional[int] = None,
+        strength: float = 0.5,
+        gif: Optional[bool] = False,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: int = 1,
+        **kwargs,
+    ):
+        """Invoke when calling the pipeline for generation.
+
+        Args:
+            prompts (str or List[str]):
+                The prompts to guide the image generation.
+            image (PIL.Image.Image, *optional*, None):
+                 Initial image for generation.
+            height (int, *optional*, defaults to 512):
+                Height of the generated image.
+            width (int, *optional*, defaults to 512):
+                Width of the generated image.
+            num_inference_steps (int, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            negative_prompt (str or List[str]):
+                The negative prompt or prompts to guide the image generation.
+            num_images_per_prompt (int, *optional*, defaults to 1):
+                Number of images to generate per prompt.
+            guidance_scale (float, *optional*, defaults to 7.5):
+                Guidance scale as defined in Classifier-Free Diffusion Guidance(https://arxiv.org/abs/2207.12598).
+                guidance_scale is defined as `w` of equation 2.
+                Higher guidance scale encourages to generate images that are closely linked to the text prompt,
+                usually at the expense of lower image quality.
+            eta (float, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [DDIMScheduler], will be ignored for others.
+            output_type (`str`, *optional*, defaults to "pil"):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array.
+            seed (int, *optional*, None):
+                Seed for random generator state initialization.
+            strength (float):
+                Controls the amount of noise that is added to the input image.
+            gif (bool, *optional*, False):
+                Flag for storing all steps results or not.
+            callback (Callable[[int, int, np.ndarray], None], *optional*, None):
+                Callback function for each step of generation.
+            callback_steps (int, *optional*, 1):
+                Number of steps between callback calls.
+            kwargs: Additional keyword arguments.
+
+        Returns:
+            OvStableDiffusionPipelineOutput whose ``images`` holds the final postprocessed images
+            (a list of PIL.Image.Image for output_type="pil", NumPy data otherwise).
+            NOTE(review): with gif=True the per-step images are collected in a local buffer but are
+            not part of the returned object.
+
+        """
+        if seed is not None:
+            np.random.seed(seed)
+
+        batch_size = len(prompts) if isinstance(prompts, list) else 1
+
+        print(f"Start inference with prompt: {prompts}")
+        img_buffer = []
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # get prompt text embeddings
+        text_embeddings = self._encode_prompt(
+            prompts,
+            num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+        )
+
+        # set timesteps
+        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
+        latent_timestep = timesteps[:1]
+
+        # get the initial random noise unless the user supplied it
+        latents, meta = self.prepare_latents(batch_size, num_images_per_prompt, height, width, image, latent_timestep)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if you are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet([latent_model_input, t, text_embeddings])[self._unet_output]
+            # perform guidance (NOTE(review): indexing rows 0/1 presumably assumes a single latent in the batch)
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            )["prev_sample"].numpy()
+            if gif:
+                image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output]
+                image = self.postprocess_image(image, meta, output_type)
+                img_buffer.extend(image)
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        # scale and decode the image latents with vae
+        image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output]
+        image = self.postprocess_image(image, meta, output_type)
+
+        return OvStableDiffusionPipelineOutput(images=image)
+
+    def _encode_prompt(
+        self,
+        prompt: str,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Union[str, list[str]] = None,
+    ):
+        """Encode the prompt into text encoder hidden states.
+
+        Args:
+            prompt (str or list(str)): prompt or prompts to be encoded
+            num_images_per_prompt (int): number of images to generate per prompt
+            do_classifier_free_guidance (bool): whether to use classifier free guidance or not
+            negative_prompt (str or list(str)): negative prompt to be encoded
+        Returns:
+            text_embeddings (np.ndarray): text encoder hidden states
+
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # tokenize input prompts
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="np",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        text_embeddings = self.text_encoder(text_input_ids)[self._text_encoder_output]
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: list[str]
+            max_length = text_input_ids.shape[-1]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            else:
+                uncond_tokens = negative_prompt
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids)[self._text_encoder_output]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1))
+            uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1))
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_images_per_prompt,
+        height,
+        width,
+        image: PIL.Image.Image = None,
+        latent_timestep: torch.Tensor = None,
+    ):
+        """Get initial latents for starting generation.
+
+        Args:
+            batch_size (int):
+                Batch size for generation
+            num_images_per_prompt (int):
+                Number of images to generate per prompt
+            height (int):
+                Height of generated image
+            width (int):
+                Width of generated image
+            image (PIL.Image.Image, *optional*, None):
+                Input image for generation, if not provided random noise will be used as starting point
+            latent_timestep (torch.Tensor, *optional*, None):
+                Predicted by scheduler initial step for image generation, required for latent image mixing with noise
+        Returns:
+            latents (np.ndarray): initial latents (plain noise, or the noised latent encoding of ``image``)
+            meta (dict): preprocessing metadata of ``image``; empty when no image is given
+
+        """
+        latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
+        noise = np.random.randn(*latents_shape).astype(np.float32)
+        if image is None:
+            # No initial image to encode: every scheduler starts from noise (previously non-LMS crashed here).
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                noise = noise * self.scheduler.sigmas[0].numpy()  # LMSDiscreteScheduler: multiply by sigmas
+            return noise, {}
+        input_image, meta = preprocess(image)
+        latents = self.vae_encoder(input_image)[self._vae_e_output] * 0.18215
+        latents = self.scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy()
+        return latents, meta
+
+    def postprocess_image(self, image: np.ndarray, meta: dict, output_type: str = "pil"):
+        """Postprocessing for decoded image.
+
+        Takes generated image decoded by VAE decoder, unpad it to initial image size (if required),
+        normalize and convert to [0, 255] pixels range. Optionally, converts it from np.ndarray to PIL.Image format
+
+        Args:
+            image (np.ndarray):
+                Generated image
+            meta (Dict):
+                Metadata obtained on latents preparing step, can be empty
+            output_type (str, *optional*, pil):
+                Output format for result, can be pil or numpy
+        Returns:
+            image (List of np.ndarray or PIL.Image.Image):
+                Postprocessed images
+
+        """
+        if "padding" in meta:
+            pad = meta["padding"]
+            (_, end_h), (_, end_w) = pad[1:3]
+            h, w = image.shape[2:]
+            unpad_h = h - end_h
+            unpad_w = w - end_w
+            image = image[:, :, :unpad_h, :unpad_w]
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = np.transpose(image, (0, 2, 3, 1))
+        # Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+            if "src_height" in meta:
+                orig_height, orig_width = meta["src_height"], meta["src_width"]
+                image = [img.resize((orig_width, orig_height), PIL.Image.Resampling.LANCZOS) for img in image]
+        else:  # NumPy output: cv2.resize takes dsize as (width, height)
+            if "src_height" in meta:
+                orig_height, orig_width = meta["src_height"], meta["src_width"]
+                image = [cv2.resize(img, (orig_width, orig_height)) for img in image]
+        return image
+
+    def get_timesteps(self, num_inference_steps: int, strength: float):
+        """Get scheduler timesteps for generation.
+
+        In case of image-to-image generation, it updates number of steps according to strength
+
+        Args:
+           num_inference_steps (int):
+              number of inference steps for generation
+           strength (float):
+               value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
+               Values that approach 1.0 enable lots of variations
+               but will also produce images that are not semantically consistent with the input.
+        Returns the scheduler timesteps from the computed start index on, and how many steps that leaves.
+        """
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps, num_inference_steps - t_start
+
+
+def update_ov_config(config: dict):
+    """Reduce config in place to the ov_convert pass on CPU, search_strategy off, evaluators removed; return it."""
+    config.update(passes={"ov_convert": config["passes"]["ov_convert"]}, search_strategy=False)
+    accelerator = config["systems"]["local_system"]["accelerators"][0]
+    accelerator["execution_providers"] = ["CPUExecutionProvider"]
+    del config["evaluators"], config["evaluator"]
+    return config
+
+
+def save_optimized_ov_submodel(workflow_output, submodel, optimized_model_dir, optimized_model_path_map):
+    """Copy the best candidate's model directory to optimized_model_dir/submodel and record its .xml path."""
+    source_dir = workflow_output.get_best_candidate().model_path
+    target_dir = optimized_model_dir / submodel
+    shutil.copytree(source_dir, target_dir)
+    optimized_model_path_map[submodel] = str((target_dir / submodel).with_suffix(".xml"))
+
+
+def get_ov_pipeline(common_args, ov_args, optimized_model_dir):
+    """Return the stock diffusers pipeline, or an OpenVINO pipeline compiled from the optimized submodels."""
+    if common_args.test_unoptimized:
+        return StableDiffusionPipeline.from_pretrained(common_args.model_id)
+
+    with (optimized_model_dir / OV_OPTIMIZED_MODEL_INFO).open("r") as info_file:
+        model_paths = json.load(info_file)
+
+    target_device = ov_args.device
+    core = ov.Core()
+    text_encoder = core.compile_model(model_paths["text_encoder"], target_device)
+    unet = core.compile_model(model_paths["unet"], target_device)
+
+    # only the VAE models get an f32 inference precision hint, and only on non-CPU devices
+    vae_config = {"INFERENCE_PRECISION_HINT": "f32"} if target_device != "CPU" else {}
+    vae_decoder = core.compile_model(model_paths["vae_decoder"], target_device, vae_config)
+    vae_encoder = core.compile_model(model_paths["vae_encoder"], target_device, vae_config)
+
+    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+
+    return OVStableDiffusionPipeline(
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        unet=unet,
+        vae_encoder=vae_encoder,
+        vae_decoder=vae_decoder,
+        scheduler=scheduler,
+    )
+
+
+def run_ov_image_inference(
+    pipe, image_path, prompt, strength, guidance_scale, image_size, num_inference_steps, common_args, generator=None
+):
+    """Call pipe for a batch of identical prompts, first loading the optional init image from image_path."""
+    init_image = None
+    if image_path:
+        source = Path(image_path)
+        print(f"Image path is {source}")
+        if not source.exists():
+            print("Image doesn't exist.")
+            sys.exit(1)
+        init_image = PIL.Image.open(source)
+    return pipe(
+        prompts=[prompt] * common_args.batch_size,
+        image=init_image,
+        num_inference_steps=num_inference_steps,
+        height=image_size,
+        width=image_size,
+        strength=strength,
+        guidance_scale=guidance_scale,
+        generator=generator,
+    )
+
+
+def run_ov_img_to_img_example(pipe, guidance_scale, common_args):
+    """Run the built-in image-to-image example on ./assets/dog.png with a fixed prompt, strength, size and steps."""
+    # fixed demo settings
+    source_image = Path("./assets/dog.png")
+    demo_prompt = "amazing watercolor painting"
+    noise_strength, side, steps = 0.5, 512, 10
+
+    return run_ov_image_inference(
+        pipe, source_image, demo_prompt, noise_strength, guidance_scale, side, steps, common_args
+    )
+
+
+def save_ov_model_info(model_info, optimized_model_dir):
+    """Write model_info as indented JSON to the OV_OPTIMIZED_MODEL_INFO file inside optimized_model_dir."""
+    with (optimized_model_dir / OV_OPTIMIZED_MODEL_INFO).open("w") as info_file:
+        json.dump(model_info, info_file, indent=4)
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/qdq.py b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/qdq.py
new file mode 100644
index 00000000..b1275bc7
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/qdq.py
@@ -0,0 +1,306 @@
+# --------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import inspect
+import os
+from typing import Callable, Optional, Union
+
+import numpy as np
+import onnxruntime as ort
+import sd_utils
+import sd_utils.config
+import torch
+
+from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion import OnnxStableDiffusionPipeline
+from diffusers.pipelines.onnx_utils import ORT_TO_NP_TYPE
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+
+# ruff: noqa: T201
+
+
+def update_qdq_config(config: dict, provider: str, submodel_name: str):
+    """Tailor an Olive QDQ workflow config to one submodel and execution provider.
+
+    Prunes config["passes"] to the passes used by this submodel/provider pair, then selects the accelerator:
+    CUDA for "cuda", NPU with QNN for "qnn" (except vae_encoder), otherwise CPU with the evaluator cleared.
+
+    Args:
+        config: workflow config with "passes" and "systems" sections; modified in place.
+        provider: "cuda" and "qnn" get dedicated settings, any other value is configured for CPU.
+        submodel_name: "text_encoder", "unet" and "vae_encoder" have their own pass lists.
+
+    Returns:
+        The same config object.
+    """
+    is_qnn = provider == "qnn"
+    if sd_utils.config.only_conversion:
+        used_passes = {"convert"}
+        config["evaluator"] = None
+    else:
+        if submodel_name == "text_encoder":
+            extra = {"surgery", "cb"} if is_qnn else {"surgery", "optimize_qdq", "quantization"}
+        elif submodel_name == "unet":
+            extra = {"optimize_qdq", "quantization", "cb"} if is_qnn else {"optimize_qdq", "quantization"}
+        elif submodel_name == "vae_encoder":
+            extra = set() if is_qnn else {"quantization"}
+        else:
+            extra = {"quantization", "cb"} if is_qnn else {"quantization"}
+        used_passes = {"convert", "dynamic_shape_to_fixed"} | extra
+
+    for pass_name in set(config["passes"]) - used_passes:
+        del config["passes"][pass_name]
+
+    accelerator = config["systems"]["local_system"]["accelerators"][0]
+    if provider == "cuda":
+        accelerator["execution_providers"] = ["CUDAExecutionProvider"]
+    elif is_qnn and submodel_name != "vae_encoder":
+        accelerator["device"] = "npu"
+        accelerator["execution_providers"] = ["QNNExecutionProvider"]
+        config["passes"]["convert"]["target_opset"] = 20
+
+        # Quantization params. The pass may already be pruned (text_encoder, conversion-only mode).
+        if submodel_name != "text_encoder" and "quantization" in config["passes"]:
+            config["passes"]["quantization"].update(quant_preprocess=True, per_channel=True, weight_symmetric=True)
+        if submodel_name == "unet":
+            config["input_model"]["model_loader"] = "unet_load_qnn"
+    else:
+        accelerator["device"] = "cpu"
+        accelerator["execution_providers"] = ["CPUExecutionProvider"]
+        # not meaningful to evaluate QDQ latency on CPU
+        config["evaluator"] = None
+    return config
+
+
+class OnnxStableDiffusionPipelineWithSave(OnnxStableDiffusionPipeline):
+    """ONNX Stable Diffusion pipeline that can dump intermediate tensors as .npz files under save_data_dir."""
+
+    def __call__(
+        self,
+        prompt: Union[str, list[str]] = None,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, list[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        latents: Optional[np.ndarray] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: int = 1,
+    ):
+        """Generate images for prompt, saving intermediate model inputs/outputs when self.save_data_dir is set."""
+        # check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+        # define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if generator is None:
+            generator = np.random
+
+        # `guidance_scale` is analogous to the guidance weight `w` of equation (2) of the Imagen paper
+        # (https://arxiv.org/pdf/2205.11487.pdf); `guidance_scale = 1` means no classifier-free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        if self.save_data_dir:
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=77,
+                truncation=True,
+                return_tensors="np",
+            ).input_ids.astype(np.int32)
+            np.savez(self.save_data_dir / "text_inputs.npz", input_ids=text_inputs)
+
+            uncond_input = self.tokenizer(
+                negative_prompt if negative_prompt else "",
+                padding="max_length",
+                max_length=77,
+                truncation=True,
+                return_tensors="np",
+            ).input_ids.astype(np.int32)
+            np.savez(self.save_data_dir / "uncond_input.npz", input_ids=uncond_input)
+
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # get the initial random noise unless the user supplied it
+        latents_dtype = prompt_embeds.dtype
+        latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
+        if latents is None:
+            latents = generator.randn(*latents_shape).astype(latents_dtype)
+        elif latents.shape != latents_shape:
+            raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        latents = latents * np.float32(self.scheduler.init_noise_sigma)
+
+        # Schedulers differ in their step() signature, so eta is passed only when step() accepts it.
+        # eta corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502) and should be in [0, 1].
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        timestep_dtype = next(
+            (unet_input.type for unet_input in self.unet.model.get_inputs() if unet_input.name == "timestep"),
+            "tensor(float)",
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
+        if do_classifier_free_guidance:
+            splits = np.split(prompt_embeds, 2)
+            neg_embeds, text_embeds = splits[0], splits[1]
+
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+            latent_model_input = latents
+            latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+            latent_model_input = latent_model_input.cpu().numpy()
+
+            # predict the noise residual
+            timestep = np.array([t], dtype=timestep_dtype)
+
+            if do_classifier_free_guidance:
+                # Note that in QDQ, we need to use static dimensions (batch is fixed to 1), so we need to split
+                unet_input = {"sample": latent_model_input, "timestep": timestep, "encoder_hidden_states": neg_embeds}
+                if self.save_data_dir:
+                    np.savez(self.save_data_dir / f"{i}_unet_input_neg.npz", **unet_input)
+                noise_pred_uncond = self.unet(**unet_input)
+                noise_pred_uncond = noise_pred_uncond[0]
+
+                unet_input = {"sample": latent_model_input, "timestep": timestep, "encoder_hidden_states": text_embeds}
+                if self.save_data_dir:
+                    np.savez(self.save_data_dir / f"{i}_unet_input.npz", **unet_input)
+                noise_pred_text = self.unet(**unet_input)
+                noise_pred_text = noise_pred_text[0]
+
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            else:
+                unet_input = {
+                    "sample": latent_model_input,
+                    "timestep": timestep,
+                    "encoder_hidden_states": prompt_embeds,
+                }
+                if self.save_data_dir:
+                    np.savez(self.save_data_dir / f"{i}_unet_input.npz", **unet_input)
+                noise_pred = self.unet(**unet_input)
+                noise_pred = noise_pred[0]
+
+            # compute the previous noisy sample x_t -> x_t-1
+            scheduler_output = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            )
+            latents = scheduler_output.prev_sample.numpy()
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                step_idx = i // getattr(self.scheduler, "order", 1)
+                callback(step_idx, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        # Decode one latent at a time instead of `self.vae_decoder(latent_sample=latents)[0]`:
+        # the half-precision VAE decoder seems to give strange results when the batch size is > 1.
+        if self.save_data_dir:
+            np.savez(self.save_data_dir / "latent.npz", latent_sample=latents[0:1])
+        image = np.concatenate([self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])])
+        if self.save_data_dir:
+            np.savez(self.save_data_dir / "output_img.npz", sample=image)
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):
+    """Add the first EP device matching ep_name and device_type to session_options; do nothing if none matches."""
+    candidates = (dev for dev in ort.get_ep_devices() if dev.ep_name == ep_name and dev.device.type == device_type)
+    chosen = next(candidates, None)
+    if chosen is not None:
+        print(f"Adding {ep_name} for {device_type}")
+        session_options.add_provider_for_devices([chosen], {} if ep_options is None else ep_options)
+
+def register_execution_providers():
+    """Register each entry of the JSON mapping printed by the winml.py worker script with onnxruntime."""
+    import json
+    import subprocess
+    import sys
+
+    worker_script = os.path.abspath('winml.py')
+    ep_libraries = json.loads(subprocess.check_output([sys.executable, worker_script], text=True))
+    for ep_name, library_path in ep_libraries.items():
+        try:
+            ort.register_execution_provider_library(ep_name, library_path)
+        except Exception as e:
+            print(f"Failed to register execution provider {ep_name}: {e}")
+
+def get_qdq_pipeline(model_dir, common_args, qdq_args, script_dir):
+    """Load the ONNX pipeline from model_dir for common_args.provider and configure its data-dump directory."""
+    register_execution_providers()
+    ort.set_default_logger_severity(3)
+
+    print("Loading models into ORT session...")
+    session_options = ort.SessionOptions()
+
+    ep_names = {
+        "cpu": "CPUExecutionProvider",
+        "cuda": "CUDAExecutionProvider",
+        "qnn": "QNNExecutionProvider",
+    }
+    provider = common_args.provider
+    assert provider in ep_names, f"Unsupported provider: {provider}"
+    ep_name = ep_names[provider]
+
+    # NOTE: the EP device lookup always asks for the NPU device type, whichever provider was requested
+    add_ep_for_device(session_options, ep_name, ort.OrtHardwareDeviceType.NPU)
+    pipeline = OnnxStableDiffusionPipelineWithSave.from_pretrained(
+        model_dir, provider=ep_name, sess_options=session_options, provider_options=[{}]
+    )
+
+    # save_data_dir is always assigned (None disables dumping)
+    pipeline.save_data_dir = script_dir / qdq_args.data_dir / common_args.prompt if qdq_args.save_data else None
+    if pipeline.save_data_dir is not None:
+        os.makedirs(pipeline.save_data_dir, exist_ok=True)
+    return pipeline
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/qdq_xl.py b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/qdq_xl.py
new file mode 100644
index 00000000..08ec41ad
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/sd_utils/qdq_xl.py
@@ -0,0 +1,238 @@
+# --------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import inspect
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Any, Optional, Union
+
+import numpy as np
+import onnxruntime as ort
+import torch
+from diffusers import StableDiffusionXLPipeline
+from optimum.onnxruntime.modeling_diffusion import (
+    ORTDiffusionPipeline,
+    ORTModelTextEncoder,
+    ORTModelUnet,
+    ORTModelVaeDecoder,
+    ORTModelVaeEncoder,
+    ORTWrapperVae,
+)
+from transformers.modeling_outputs import ModelOutput
+
+# ruff: noqa: F821
+# ruff: noqa: SIM118
+
+
+class ORTDiffusionPipelineWithSave(ORTDiffusionPipeline):
+    """ORT diffusion pipeline whose submodel wrappers can dump their data to a shared save_data_dir."""
+
+    def __init__(
+        self,
+        scheduler: "SchedulerMixin",
+        unet_session: ort.InferenceSession,
+        vae_decoder_session: ort.InferenceSession,
+        # optional pipeline models
+        vae_encoder_session: Optional[ort.InferenceSession] = None,
+        text_encoder_session: Optional[ort.InferenceSession] = None,
+        text_encoder_2_session: Optional[ort.InferenceSession] = None,
+        # optional pipeline submodels
+        tokenizer: Optional["CLIPTokenizer"] = None,
+        tokenizer_2: Optional["CLIPTokenizer"] = None,
+        feature_extractor: Optional["CLIPFeatureExtractor"] = None,
+        # stable diffusion xl specific arguments
+        force_zeros_for_empty_prompt: bool = True,
+        requires_aesthetics_score: bool = False,
+        add_watermarker: Optional[bool] = None,
+        # onnxruntime specific arguments
+        use_io_binding: Optional[bool] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        **kwargs,
+    ):
+        self.unet = ORTModelUnetWithSave(unet_session, self)
+        self.vae_decoder = ORTModelVaeDecoderWithSave(vae_decoder_session, self)
+        self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None
+        self.text_encoder = (
+            ORTModelTextEncoderWithSave(text_encoder_session, self) if text_encoder_session is not None else None
+        )
+        self.text_encoder_2 = (
+            ORTModelTextEncoderWithSave(text_encoder_2_session, self) if text_encoder_2_session is not None else None
+        )
+        # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API
+        self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder)
+
+        # we allow passing these as torch models for now
+        self.image_encoder = kwargs.pop("image_encoder", None)  # TODO(anyone): maybe implement ORTModelImageEncoder
+        self.safety_checker = kwargs.pop("safety_checker", None)  # TODO(anyone): maybe implement ORTModelSafetyChecker
+
+        self.scheduler = scheduler
+        self.tokenizer = tokenizer
+        self.tokenizer_2 = tokenizer_2
+        self.feature_extractor = feature_extractor
+
+        all_pipeline_init_args = {
+            "vae": self.vae,
+            "unet": self.unet,
+            "text_encoder": self.text_encoder,
+            "text_encoder_2": self.text_encoder_2,
+            "safety_checker": self.safety_checker,
+            "image_encoder": self.image_encoder,
+            "scheduler": self.scheduler,
+            "tokenizer": self.tokenizer,
+            "tokenizer_2": self.tokenizer_2,
+            "feature_extractor": self.feature_extractor,
+            "requires_aesthetics_score": requires_aesthetics_score,
+            "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt,
+            "add_watermarker": add_watermarker,
+        }
+
+        accepted = inspect.signature(self.auto_model_class).parameters
+        diffusers_pipeline_args = {k: all_pipeline_init_args[k] for k in accepted if k in all_pipeline_init_args}
+        # inits diffusers pipeline specific attributes (registers modules and config)
+        self.auto_model_class.__init__(self, **diffusers_pipeline_args)
+
+        # inits ort specific attributes
+        self.shared_attributes_init(
+            model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, **kwargs
+        )
+
+    @property
+    def save_data_dir(self):
+        """Directory the submodel wrappers dump their data to (read from the UNet wrapper, which always exists)."""
+        return self.unet.save_data_dir
+
+    @save_data_dir.setter
+    def save_data_dir(self, save_dir: Path):
+        # Skip absent (None) text encoders; start indices 10/20 keep their text_encoder_<n>.npz dumps apart.
+        for part, first_index in ((self.text_encoder, 10), (self.text_encoder_2, 20), (self.unet, 0)):
+            if part is not None:
+                part.save_data_dir = save_dir
+                part.save_data_index = first_index
+        self.vae_decoder.save_data_dir = save_dir
+
+
+class ORTStableDiffusionXLPipelineWithSave(ORTDiffusionPipelineWithSave, StableDiffusionXLPipeline):
+    """SDXL text-to-image pipeline on ORT; _get_add_time_ids accepts but ignores text_encoder_projection_dim."""
+
+    main_input_name = "prompt"
+    export_feature = "text-to-image"
+    auto_model_class = StableDiffusionXLPipeline
+
+    def _get_add_time_ids(
+        self,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        dtype,
+        text_encoder_projection_dim=None,
+    ):
+        return torch.tensor([list(original_size + crops_coords_top_left + target_size)], dtype=dtype)
+
+
+# Wrappers
+
+
+class ORTModelTextEncoderWithSave(ORTModelTextEncoder):
+    """Text encoder wrapper that can dump its ONNX inputs; the class defaults below mean no dumping."""
+
+    save_data_dir = None
+    save_data_index = 0
+
+    def forward(
+        self,
+        input_ids: Union[np.ndarray, torch.Tensor],
+        attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: bool = False,
+    ):
+        """Run the session on input_ids only (attention_mask is not forwarded) and regroup the hidden states."""
+        use_torch = isinstance(input_ids, torch.Tensor)
+        onnx_inputs = self.prepare_onnx_inputs(use_torch, input_ids=input_ids)
+        if self.save_data_dir:
+            np.savez(self.save_data_dir / f"text_encoder_{self.save_data_index}.npz", **onnx_inputs)
+            self.save_data_index += 1
+        onnx_outputs = self.session.run(None, onnx_inputs)
+        model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs)
+
+        layer_keys = [f"hidden_states.{i}" for i in range(self.config.num_hidden_layers)]
+        if output_hidden_states:
+            hidden_states = [model_outputs.pop(key) for key in layer_keys]
+            model_outputs["hidden_states"] = [*hidden_states, model_outputs.get("last_hidden_state")]
+        else:
+            for key in layer_keys:
+                model_outputs.pop(key, None)
+
+        # return_dict=True yields the plain mapping; otherwise it is wrapped in a ModelOutput
+        return model_outputs if return_dict else ModelOutput(**model_outputs)
+
+
+class ORTModelUnetWithSave(ORTModelUnet):
+    """UNet wrapper that can dump its ONNX inputs; the class defaults below mean no dumping."""
+
+    save_data_dir = None
+    save_data_index = 0
+
+    def forward(
+        self,
+        sample: Union[np.ndarray, torch.Tensor],
+        timestep: Union[np.ndarray, torch.Tensor],
+        encoder_hidden_states: Union[np.ndarray, torch.Tensor],
+        text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        cross_attention_kwargs: Optional[dict[str, Any]] = None,
+        added_cond_kwargs: Optional[dict[str, Any]] = None,
+        return_dict: bool = False,
+    ):
+        """Run the UNet session, saving its inputs as unet_<index>.npz first when save_data_dir is set."""
+        use_torch = isinstance(sample, torch.Tensor)
+        if len(timestep.shape) == 0:
+            timestep = timestep.reshape(1)  # unlike unsqueeze(0), this also works for a 0-d np.ndarray
+
+        model_inputs = {
+            "sample": sample,
+            "timestep": timestep,
+            "encoder_hidden_states": encoder_hidden_states,
+            "text_embeds": text_embeds,
+            "time_ids": time_ids,
+            "timestep_cond": timestep_cond,
+            **(cross_attention_kwargs or {}),
+            **(added_cond_kwargs or {}),
+        }
+        onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs)
+        if self.save_data_dir:
+            np.savez(self.save_data_dir / f"unet_{self.save_data_index}.npz", **onnx_inputs)
+            self.save_data_index += 1
+        onnx_outputs = self.session.run(None, onnx_inputs)
+        model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs)
+        return model_outputs if return_dict else ModelOutput(**model_outputs)
+
+
+class ORTModelVaeDecoderWithSave(ORTModelVaeDecoder):
+    """VAE decoder wrapper that can dump its ONNX input and output; the class default below means no dumping."""
+
+    save_data_dir = None
+
+    def forward(
+        self,
+        latent_sample: Union[np.ndarray, torch.Tensor],
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = False,
+    ):
+        """Decode latent_sample (generator is not used), saving vae_decoder.npz and vae_decoder_output.npz."""
+        use_torch = isinstance(latent_sample, torch.Tensor)
+        onnx_inputs = self.prepare_onnx_inputs(use_torch, latent_sample=latent_sample)
+        if self.save_data_dir:
+            np.savez(self.save_data_dir / "vae_decoder.npz", **onnx_inputs)
+        onnx_outputs = self.session.run(None, onnx_inputs)
+        if self.save_data_dir:
+            np.savez(self.save_data_dir / "vae_decoder_output.npz", sample=onnx_outputs[0])
+        model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs)
+
+        # an output named "latent_sample" is exposed to callers as "latents"
+        if "latent_sample" in model_outputs:
+            model_outputs["latents"] = model_outputs.pop("latent_sample")
+
+        return model_outputs if return_dict else ModelOutput(**model_outputs)
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/stable_diffusion.py b/sd-legacy-stable-diffusion-v1-5/aitk/stable_diffusion.py
new file mode 100644
index 00000000..4567305d
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/stable_diffusion.py
@@ -0,0 +1,495 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import argparse
+import json
+import shutil
+import sys
+import warnings
+from pathlib import Path
+
+import numpy as np
+import torch
+from diffusers import DiffusionPipeline
+from olive.common.utils import set_tempdir
+from olive.workflows import run as olive_run
+from sd_utils import config
+from user_script import get_base_model_name
+
+# pylint: disable=redefined-outer-name
+# ruff: noqa: TID252, T201
+
+
+def save_image(result, batch_size, provider, num_images, images_saved, image_callback=None):
+    """Save the images of one batch that are not flagged as NSFW.
+
+    Images are written to ``result_<n>.png`` in the working directory until
+    ``num_images`` have been saved in total. ``image_callback``, when given, is
+    called with the image number and its path after each save.
+
+    Returns:
+        The updated count of saved images.
+    """
+    accepted = 0
+    for idx in range(batch_size):
+        flags = result.nsfw_content_detected
+        if flags is not None and flags[idx]:
+            continue
+        accepted += 1
+        if images_saved >= num_images:
+            continue
+        filename = f"result_{images_saved}.png"
+        result.images[idx].save(filename)
+        if image_callback:
+            image_callback(images_saved, filename)
+        images_saved += 1
+        print(f"Generated {filename}")
+
+    print(f"Inference Batch End ({accepted}/{batch_size} images).")
+    if provider != "openvino":
+        print("Images passed the safety checker.")
+    else:
+        print("WARNING: Safety checker is not supported by OpenVINO. It will be disabled.")
+    return images_saved
+
+
+def run_inference_loop(
+    pipeline,
+    prompt,
+    num_images,
+    batch_size,
+    image_size,
+    num_inference_steps,
+    guidance_scale,
+    strength: float,
+    provider: str,
+    generator=None,
+    image_callback=None,
+    step_callback=None,
+):
+    """Run the pipeline batch after batch until ``num_images`` images have been saved.
+
+    ``strength`` is only forwarded to the pipeline for the ``openvino`` provider.
+    ``step_callback``, when given, receives the overall step count (completed
+    batches times ``num_inference_steps`` plus the current step).
+    """
+    saved_count = 0
+
+    def report_progress(step, timestep, latents):
+        if not step_callback:
+            return
+        finished_batches = saved_count // batch_size
+        step_callback(finished_batches * num_inference_steps + step)
+
+    while saved_count < num_images:
+        print(f"\nInference Batch Start (batch size = {batch_size}).")
+
+        call_kwargs = {
+            "num_inference_steps": num_inference_steps,
+            "callback": report_progress if step_callback else None,
+            "height": image_size,
+            "width": image_size,
+            "guidance_scale": guidance_scale,
+            "generator": generator,
+        }
+        if provider == "openvino":
+            call_kwargs["strength"] = strength
+
+        result = pipeline([prompt] * batch_size, **call_kwargs)
+
+        saved_count = save_image(result, batch_size, provider, num_images, saved_count, image_callback)
+
+
+def run_inference_gui(
+    pipeline,
+    prompt,
+    num_images,
+    batch_size,
+    image_size,
+    num_inference_steps,
+    guidance_scale,
+    strength,
+    provider,
+    generator,
+):
+    """Show a Tk window that generates images for an editable prompt.
+
+    The window holds a grid of image labels, a progress bar, a prompt entry and a
+    "Generate" button. Each click starts ``run_inference_loop`` on a new thread,
+    passing ``image_completed`` and ``update_progress_bar`` as its callbacks.
+    At most 9 images are displayed; a larger ``num_images`` is reduced to 9.
+    The call blocks in ``window.mainloop()``.
+    """
+    import threading
+    import tkinter as tk
+    import tkinter.ttk as ttk
+
+    from PIL import Image, ImageTk
+
+    # NOTE(review): both callbacks below are handed to run_inference_loop, which runs on the worker
+    # thread started in on_generate_click, so they touch Tk widgets off the main thread -- confirm
+    # this is safe on the targeted platforms.
+    def update_progress_bar(total_steps_completed):
+        progress_bar["value"] = total_steps_completed
+
+    def image_completed(index, path):
+        img = Image.open(path)
+        photo = ImageTk.PhotoImage(img)
+        gui_images[index].config(image=photo)
+        # Presumably stored on the label so the PhotoImage is not garbage-collected while displayed.
+        gui_images[index].image = photo
+        # Re-enable the button once the last requested image has arrived.
+        if index == num_images - 1:
+            generate_button["state"] = "normal"
+
+    def on_generate_click():
+        # Disable the button for the duration of the run; image_completed re-enables it.
+        generate_button["state"] = "disabled"
+        progress_bar["value"] = 0
+        threading.Thread(
+            target=run_inference_loop,
+            args=(
+                pipeline,
+                prompt_textbox.get(),
+                num_images,
+                batch_size,
+                image_size,
+                num_inference_steps,
+                guidance_scale,
+                strength,
+                provider,
+                generator,
+                image_completed,
+                update_progress_bar,
+            ),
+        ).start()
+
+    if num_images > 9:
+        print("WARNING: interactive UI only supports displaying up to 9 images")
+        num_images = 9
+
+    # Grid layout: up to 3 columns, except 4 images which use a 2x2 grid.
+    image_rows = 1 + (num_images - 1) // 3
+    image_cols = 2 if num_images == 4 else min(num_images, 3)
+    # Ceiling of num_images / batch_size; used to size the progress bar.
+    min_batches_required = 1 + (num_images - 1) // batch_size
+
+    bar_height = 10
+    button_width = 80
+    button_height = 30
+    padding = 2
+    window_width = image_cols * image_size + (image_cols + 1) * padding
+    window_height = image_rows * image_size + (image_rows + 1) * padding + bar_height + button_height
+
+    window = tk.Tk()
+    window.title("Stable Diffusion")
+    window.resizable(width=False, height=False)
+    window.geometry(f"{window_width}x{window_height}")
+
+    # One black placeholder label per grid cell, stored row by row so list index == image index.
+    gui_images = []
+    for row in range(image_rows):
+        for col in range(image_cols):
+            label = tk.Label(window, width=image_size, height=image_size, background="black")
+            gui_images.append(label)
+            label.place(x=col * image_size, y=row * image_size)
+
+    # Vertical position of the first control below the image grid.
+    y = image_rows * image_size + (image_rows + 1) * padding
+
+    progress_bar = ttk.Progressbar(window, value=0, maximum=num_inference_steps * min_batches_required)
+    progress_bar.place(x=0, y=y, height=bar_height, width=window_width)
+
+    y += bar_height
+
+    prompt_textbox = tk.Entry(window)
+    prompt_textbox.insert(tk.END, prompt)
+    prompt_textbox.place(x=0, y=y, width=window_width - button_width, height=button_height)
+
+    generate_button = tk.Button(window, text="Generate", command=on_generate_click)
+    generate_button.place(x=window_width - button_width, y=y, width=button_width, height=button_height)
+
+    window.mainloop()
+
+
+def update_config_with_provider(config: dict, provider: str, model_format: str, submodel_name: str):
+    """Dispatch ``config`` to the provider/format specific updater and return its result.
+
+    Supported combinations are ``cuda`` without a format, ``openvino``, and
+    ``cpu``/``cuda``/``qnn`` with the ``qdq`` format.
+
+    Raises:
+        ValueError: For any other provider/format combination.
+    """
+    # The helper modules are imported lazily so only the selected backend has to be importable.
+    if provider == "cuda" and not model_format:
+        from sd_utils.ort import update_cuda_config
+
+        return update_cuda_config(config)
+
+    if provider == "openvino":
+        from sd_utils.ov import update_ov_config
+
+        return update_ov_config(config)
+
+    if model_format == "qdq" and provider in ("cpu", "cuda", "qnn"):
+        from sd_utils.qdq import update_qdq_config
+
+        return update_qdq_config(config, provider, submodel_name)
+
+    raise ValueError(f"Unsupported provider: {provider} with format: {model_format}")
+
+
+def optimize(
+    common_args,
+    unoptimized_model_dir: Path,
+    optimized_model_dir: Path,
+):
+    """Run the Olive workflow for every submodel and save the resulting pipeline.
+
+    Args:
+        common_args: Parsed common arguments; ``model_id``, ``provider``, ``format``,
+            ``script_dir`` and (for the qdq format) ``image_size`` are read here.
+        unoptimized_model_dir: Deleted at the start, then passed to the ONNX pipeline saver.
+        optimized_model_dir: Deleted at the start, then passed to the save helpers.
+
+    Returns:
+        The ``model_info`` dict that was passed to the per-submodel save helpers.
+    """
+    model_id = common_args.model_id
+    provider = common_args.provider
+    model_format = common_args.format
+
+    script_dir = Path(common_args.script_dir)
+
+    # Clean up previously optimized models, if any.
+    shutil.rmtree(script_dir / "footprints", ignore_errors=True)
+    shutil.rmtree(unoptimized_model_dir, ignore_errors=True)
+    shutil.rmtree(optimized_model_dir, ignore_errors=True)
+
+    # The model_id and base_model_id are identical when optimizing a standard stable diffusion model like
+    # stable-diffusion-v1-5/stable-diffusion-v1-5. These variables are only different when optimizing a LoRA variant.
+    base_model_id = get_base_model_name(model_id)
+    print(f"\nModel {base_model_id}")
+
+    # Load the entire PyTorch pipeline to ensure all models and their configurations are downloaded and cached.
+    # This avoids an issue where the non-ONNX components (tokenizer, scheduler, and feature extractor) are not
+    # automatically cached correctly if individual models are fetched one at a time.
+    print("Download stable diffusion PyTorch pipeline...")
+    pipeline = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float32)
+    # Publish the model dimensions through the shared sd_utils config module.
+    config.vae_sample_size = pipeline.vae.config.sample_size
+    config.cross_attention_dim = pipeline.unet.config.cross_attention_dim
+    config.unet_sample_size = pipeline.unet.config.sample_size
+    if model_format == "qdq":
+        # For qdq the VAE sample size follows the requested image size instead of the model config.
+        config.vae_sample_size = common_args.image_size
+        # config.unet_sample_size = common_args.image_size // 8
+
+    model_info = {}
+
+    submodel_names = ["vae_encoder", "vae_decoder", "unet", "text_encoder"]
+
+    has_safety_checker = getattr(pipeline, "safety_checker", None) is not None
+
+    if has_safety_checker:
+        if provider == "openvino" or model_format == "qdq":
+            print(f"WARNING: Safety checker is not supported by {provider}. It will be disabled.")
+            has_safety_checker = False
+        else:
+            submodel_names.append("safety_checker")
+
+    for submodel_name in submodel_names:
+        print(f"\nOptimizing {submodel_name}")
+
+        # Each submodel has its own Olive config template next to this script.
+        olive_config = None
+        with (script_dir / f"config_{submodel_name}.json").open() as fin:
+            olive_config = json.load(fin)
+        olive_config = update_config_with_provider(olive_config, provider, model_format, submodel_name)
+
+        if submodel_name in ("unet", "text_encoder"):
+            olive_config["input_model"]["model_path"] = model_id
+        else:
+            # Only the unet & text encoder are affected by LoRA, so it's better to use the base model ID for
+            # other models: the Olive cache is based on the JSON config, and two LoRA variants with the same
+            # base model ID should be able to reuse previously optimized copies.
+            olive_config["input_model"]["model_path"] = base_model_id
+
+        # Save the config before olive_run modifies it with non-serializable objects
+        suffix = f"{model_format}_{provider}" if model_format else provider
+        with (script_dir / f"config_{submodel_name}_{suffix}.json").open("w") as fout:
+            json.dump(olive_config, fout, indent=4)
+
+        workflow_output = olive_run(olive_config)
+
+        if provider == "openvino":
+            from sd_utils.ov import save_optimized_ov_submodel
+
+            save_optimized_ov_submodel(workflow_output, submodel_name, optimized_model_dir, model_info)
+        else:
+            from sd_utils.ort import save_optimized_onnx_submodel
+
+            save_optimized_onnx_submodel(script_dir, submodel_name, provider, model_info)
+
+    # Final assembly of the per-submodel results into one pipeline / info file.
+    if provider == "openvino":
+        from sd_utils.ov import save_ov_model_info
+
+        save_ov_model_info(model_info, optimized_model_dir)
+    else:
+        from sd_utils.ort import save_onnx_pipeline
+
+        save_onnx_pipeline(
+            has_safety_checker, model_info, optimized_model_dir, unoptimized_model_dir, pipeline, submodel_names
+        )
+
+    return model_info
+
+
+def parse_common_args(raw_args):
+    """Parse the provider-independent options.
+
+    Returns:
+        ``(namespace, remaining)`` as produced by ``parse_known_args``; the unrecognized
+        arguments are left for the provider/format specific parsers.
+    """
+    default_prompt = (
+        "castle surrounded by water and nature, village, volumetric lighting, photorealistic, "
+        "detailed and intricate, fantasy, epic cinematic shot, mountains, 8k ultra hd"
+    )
+    strength_help = (
+        "Value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. "
+        "Values that approach 1.0 enable lots of variations but will also produce images "
+        "that are not semantically consistent with the input."
+    )
+
+    arg_parser = argparse.ArgumentParser("Common arguments")
+    add = arg_parser.add_argument
+
+    # Locations and model selection
+    add("--script_dir", required=True, type=str)
+    add("--cache_dir", type=str)
+    add("--model_id", default="stable-diffusion-v1-5/stable-diffusion-v1-5", type=str)
+    add(
+        "--provider",
+        default="cuda",
+        type=str,
+        choices=["cuda", "openvino", "cpu", "qnn"],
+        help="Execution provider to use",
+    )
+
+    # Workflow switches
+    add("--optimize", action="store_true", help="Runs the optimization step")
+    add("--clean_cache", action="store_true", help="Deletes the Olive cache")
+    add("--test_unoptimized", action="store_true", help="Use unoptimized model for inference")
+
+    # Generation settings
+    add("--batch_size", default=1, type=int, help="Number of images to generate per batch")
+    add("--prompt", default=default_prompt, type=str)
+    add(
+        "--guidance_scale",
+        default=7.5,
+        type=float,
+        help="Guidance scale as defined in Classifier-Free Diffusion Guidance",
+    )
+    add("--num_images", default=1, type=int, help="Number of images to generate")
+    add("--num_inference_steps", default=50, type=int, help="Number of steps in diffusion process")
+    add("--interactive", action="store_true", help="Run with a GUI")
+    add("--tempdir", default=None, type=str, help="Root directory for tempfile directories and files")
+    add("--strength", default=1.0, type=float, help=strength_help)
+    add("--image_size", default=512, type=int, help="Width and height of the images to generate")
+    add(
+        "--seed",
+        default=None,
+        type=int,
+        help="The seed to give to the generator to generate deterministic results.",
+    )
+    add("--format", default=None, type=str, help="Currently only support qdq with provider cpu, cuda or qnn")
+
+    return arg_parser.parse_known_args(raw_args)
+
+
+def parse_ort_args(raw_args):
+    """Parse the ONNX Runtime specific flags; returns ``(namespace, remaining)``."""
+    ort_parser = argparse.ArgumentParser("ONNX Runtime arguments")
+    deprecated_help = "DEPRECATED (now enabled by default). Use --dynamic_dims to disable static_dims."
+
+    ort_parser.add_argument("--static_dims", action="store_true", help=deprecated_help)
+    ort_parser.add_argument("--dynamic_dims", action="store_true", help="Disable static shape optimization")
+
+    return ort_parser.parse_known_args(raw_args)
+
+
+def parse_ov_args(raw_args):
+    """Parse the OpenVINO specific options; returns ``(namespace, remaining)``."""
+    ov_parser = argparse.ArgumentParser("OpenVINO arguments")
+    add = ov_parser.add_argument
+
+    add("--device", choices=["CPU", "GPU", "VPU"], default="CPU", type=str)
+    add("--image_path", default=None, type=str)
+    add("--img_to_img_example", action="store_true", help="Runs the image to image example")
+
+    return ov_parser.parse_known_args(raw_args)
+
+
+def parse_qdq_args(raw_args):
+    """Parse the QDQ specific options; returns ``(namespace, remaining)``."""
+    qdq_parser = argparse.ArgumentParser("QDQ arguments")
+    conversion_help = "Only generate unoptimized model to generate data for qdq"
+
+    qdq_parser.add_argument("--save_data", action="store_true", help="Save the input data for qdq")
+    qdq_parser.add_argument("--data_dir", default="quantize_data/data", type=str)
+    qdq_parser.add_argument("--only_conversion", action="store_true", help=conversion_help)
+
+    return qdq_parser.parse_known_args(raw_args)
+
+
+def main(raw_args=None):
+    """Parse the command line, optimize the model when needed, then run inference.
+
+    Optimization runs when ``--optimize`` is given or the optimized model directory does
+    not exist yet. Inference runs unless ``--optimize`` is given.
+
+    Args:
+        raw_args: Argument list handed to the argument parsers (``None`` is passed through unchanged).
+    """
+    common_args, extra_args = parse_common_args(raw_args)
+
+    provider = common_args.provider
+    model_id = common_args.model_id
+
+    script_dir = Path(common_args.script_dir)
+    unoptimized_model_dir = script_dir / "models" / "unoptimized" / model_id
+    optimized_model_dir = script_dir / "models" / "optimized" / model_id
+
+    if common_args.clean_cache:
+        if common_args.cache_dir:
+            shutil.rmtree(common_args.cache_dir, ignore_errors=True)
+        else:
+            # --cache_dir is optional; passing None to rmtree can raise TypeError even with
+            # ignore_errors=True, so report the missing option instead of crashing.
+            print("WARNING: --clean_cache was given without --cache_dir; no cache was deleted.")
+
+    guidance_scale = common_args.guidance_scale
+
+    if model_id == "stabilityai/sd-turbo" and guidance_scale > 0:
+        guidance_scale = 0.0
+        print(f"WARNING: Classifier free guidance has been forcefully disabled since {model_id} doesn't support it.")
+
+    # Exactly one backend-specific argument set is parsed; the other two stay None.
+    ov_args, qdq_args, ort_args = None, None, None
+    if provider == "openvino":
+        ov_args, extra_args = parse_ov_args(extra_args)
+    elif common_args.format == "qdq":
+        qdq_args, extra_args = parse_qdq_args(extra_args)
+        config.only_conversion = qdq_args.only_conversion
+        config.data_dir = script_dir / qdq_args.data_dir
+    else:
+        ort_args, extra_args = parse_ort_args(extra_args)
+
+    if common_args.optimize or not optimized_model_dir.exists():
+        set_tempdir(common_args.tempdir)
+
+        # TODO(jstoecker): clean up warning filter (mostly during conversion from torch to ONNX)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            if not (provider == "openvino" or common_args.format == "qdq"):
+                from sd_utils.ort import validate_args
+
+                validate_args(ort_args, common_args.provider)
+            optimize(
+                common_args,
+                unoptimized_model_dir,
+                optimized_model_dir,
+            )
+
+    generator = None if common_args.seed is None else np.random.RandomState(seed=common_args.seed)
+
+    if not common_args.optimize:
+        model_dir = unoptimized_model_dir if common_args.test_unoptimized else optimized_model_dir
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            if provider == "openvino":
+                from sd_utils.ov import get_ov_pipeline
+
+                pipeline = get_ov_pipeline(common_args, ov_args, optimized_model_dir)
+            elif common_args.format == "qdq":
+                from sd_utils.qdq import get_qdq_pipeline
+
+                pipeline = get_qdq_pipeline(model_dir, common_args, qdq_args, script_dir)
+            else:
+                from sd_utils.ort import get_ort_pipeline
+
+                pipeline = get_ort_pipeline(model_dir, common_args, ort_args, guidance_scale)
+
+            # OpenVINO image-conditioned runs save their result and exit without the text-to-image loop.
+            if provider == "openvino" and (ov_args.image_path or ov_args.img_to_img_example):
+                res = None
+                if ov_args.image_path:
+                    from sd_utils.ov import run_ov_image_inference
+
+                    res = run_ov_image_inference(
+                        pipeline,
+                        ov_args.image_path,
+                        common_args.prompt,
+                        common_args.strength,
+                        guidance_scale,
+                        common_args.image_size,
+                        common_args.num_inference_steps,
+                        common_args,
+                        generator=generator,
+                    )
+                if ov_args.img_to_img_example:
+                    from sd_utils.ov import run_ov_img_to_img_example
+
+                    res = run_ov_img_to_img_example(pipeline, guidance_scale, common_args)
+                save_image(res, common_args.batch_size, "openvino", common_args.num_images, 0)
+                sys.exit(0)
+
+            if common_args.interactive:
+                run_inference_gui(
+                    pipeline,
+                    common_args.prompt,
+                    common_args.num_images,
+                    common_args.batch_size,
+                    common_args.image_size,
+                    common_args.num_inference_steps,
+                    guidance_scale,
+                    common_args.strength,
+                    provider=provider,
+                    generator=generator,
+                )
+            else:
+                run_inference_loop(
+                    pipeline,
+                    common_args.prompt,
+                    common_args.num_images,
+                    common_args.batch_size,
+                    common_args.image_size,
+                    common_args.num_inference_steps,
+                    guidance_scale,
+                    common_args.strength,
+                    provider=provider,
+                    generator=generator,
+                )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/user_script.py b/sd-legacy-stable-diffusion-v1-5/aitk/user_script.py
new file mode 100644
index 00000000..fe647855
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/user_script.py
@@ -0,0 +1,382 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import os
+import random
+
+import numpy as np
+import torch
+from diffusers import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from huggingface_hub import model_info
+from model_adaptations import monkey_patch_model
+from olive.data.registry import Registry
+from sd_utils import config
+from transformers.models.clip.modeling_clip import CLIPTextModel
+
+# ruff: noqa: T201
+
+# Generated data helpers
+
+
+class BaseDataLoader:
+    """Base for loaders that serve previously saved ``.npz`` samples by index.
+
+    Subclasses call ``load`` for every file they want and then ``finish_load``
+    to cap the number of samples at ``total``.
+    """
+
+    def __init__(self, total):
+        self.data = []
+        self.total = total
+        # Use scandir as a context manager so the directory handle is released immediately.
+        with os.scandir(config.data_dir) as entries:
+            self.data_folders = sorted(config.data_dir / entry.name for entry in entries if entry.is_dir())
+
+    def __getitem__(self, idx):
+        # StopIteration (not IndexError) is the established end-of-data signal of this loader.
+        if idx >= len(self.data) or idx >= self.total:
+            raise StopIteration
+        # print(f"Process data {idx}")
+        return self.data[idx]
+
+    def load(self, file):
+        """Append one sample: every array stored in ``file`` converted to a torch tensor."""
+        # Close the archive once its arrays are read; otherwise one file handle per sample stays open.
+        with np.load(file) as archive:
+            sample = {key: torch.from_numpy(value) for key, value in archive.items()}
+        self.data.append(sample)
+
+    def finish_load(self):
+        """Randomly keep ``total`` samples when more than ``total`` were loaded."""
+        if len(self.data) > self.total:
+            self.data = random.sample(self.data, self.total)
+
+
+class UnetGeneratedDataLoader(BaseDataLoader):
+    """Loads the saved unet inputs of every data folder.
+
+    For ``i = 0, 1, ...`` the file ``{i}_unet_input.npz`` is loaded, followed by
+    ``{i}_unet_input_neg.npz`` when it exists; a folder is finished at the first
+    missing ``{i}_unet_input.npz``.
+    """
+
+    def __init__(self, total):
+        super().__init__(total)
+        for folder in self.data_folders:
+            step = 0
+            positive = folder / f"{step}_unet_input.npz"
+            while os.path.exists(positive):
+                self.load(positive)
+                negative = folder / f"{step}_unet_input_neg.npz"
+                if os.path.exists(negative):
+                    self.load(negative)
+                step += 1
+                positive = folder / f"{step}_unet_input.npz"
+        self.finish_load()
+
+
+class TextEncoderGeneratedDataLoader(BaseDataLoader):
+    """Loads ``text_inputs.npz`` and ``uncond_input.npz`` from every data folder."""
+
+    def __init__(self, total):
+        super().__init__(total)
+        for folder in self.data_folders:
+            for filename in ("text_inputs.npz", "uncond_input.npz"):
+                self.load(folder / filename)
+        self.finish_load()
+
+
+class VaeDecoderGeneratedDataLoader(BaseDataLoader):
+    """Loads ``latent.npz`` from every data folder."""
+
+    def __init__(self, total):
+        super().__init__(total)
+        latent_files = [folder / "latent.npz" for folder in self.data_folders]
+        for latent_file in latent_files:
+            self.load(latent_file)
+        self.finish_load()
+
+
+class VaeEncoderGeneratedDataLoader(BaseDataLoader):
+    """Loads ``output_img.npz`` from every data folder."""
+
+    def __init__(self, total):
+        super().__init__(total)
+        image_files = [folder / "output_img.npz" for folder in self.data_folders]
+        for image_file in image_files:
+            self.load(image_file)
+        self.finish_load()
+
+
+# Latency-only dataloader: every item is a freshly generated input batch paired with a ``None`` label.
+class RandomDataLoader:
+    """Indexable source of generated inputs; the requested index is ignored."""
+
+    def __init__(self, create_inputs_func, batch_size, torch_dtype):
+        self.torch_dtype = torch_dtype
+        self.batch_size = batch_size
+        self.create_input_func = create_inputs_func
+
+    def __getitem__(self, idx):
+        generated = self.create_input_func(self.batch_size, self.torch_dtype)
+        return generated, None
+
+
+def get_base_model_name(model_name):
+    """Return the ``base_model`` declared in the model's Hugging Face card data.
+
+    Falls back to ``model_name`` itself when the repository has no card data or
+    the card data declares no (or an empty) ``base_model``.
+    """
+    card_data = model_info(model_name).cardData
+    if card_data is None:
+        # Repositories without model-card metadata cannot name a base model.
+        return model_name
+    return card_data.get("base_model") or model_name
+
+
+def is_lora_model(model_name):
+    """Return True when the model's reported base model differs from ``model_name``."""
+    # TODO(jstoecker): might be a better way to detect (e.g. presence of LORA weights file)
+    base_name = get_base_model_name(model_name)
+    return base_name != model_name
+
+
+# Merges LoRA weights into the layers of a base model
+def merge_lora_weights(base_model, lora_model_id, submodel_name="unet", scale=1.0):
+    """Add the LoRA deltas of ``lora_model_id`` to the attention weights of ``base_model``.
+
+    The model is modified in place and nothing is returned. When the LoRA keys are
+    not all prefixed with ``submodel_name`` and ``submodel_name`` is not ``"unet"``,
+    the function returns early without changing the model.
+
+    Args:
+        base_model: Model whose ``to_q``/``to_k``/``to_v``/``to_out[0]`` weights are updated.
+        lora_model_id: Identifier passed to diffusers' ``_get_model_file`` to locate the LoRA weights.
+        submodel_name: Key prefix to select (``"unet"`` or ``"text_encoder"`` in this file's callers).
+        scale: Multiplier applied to every ``up @ down`` product before it is added.
+    """
+    import inspect
+    from collections import defaultdict
+    from functools import reduce
+
+    try:
+        from diffusers.loaders import LORA_WEIGHT_NAME
+    except ImportError:
+        # moved in version 0.24.0
+        from diffusers.loaders.lora import LORA_WEIGHT_NAME
+    from diffusers.models.attention_processor import LoRAAttnProcessor
+    from diffusers.utils.hub_utils import _get_model_file
+
+    # The auth keyword of _get_model_file differs between diffusers versions; pass whichever exists.
+    parameters = inspect.signature(_get_model_file).parameters
+
+    kwargs = {}
+    if "use_auth_token" in parameters:
+        kwargs["use_auth_token"] = None
+    elif "token" in parameters:
+        kwargs["token"] = None
+
+    # Load LoRA weights
+    model_file = _get_model_file(
+        lora_model_id,
+        weights_name=LORA_WEIGHT_NAME,
+        cache_dir=None,
+        force_download=False,
+        resume_download=False,
+        proxies=None,
+        local_files_only=False,
+        revision=None,
+        subfolder=None,
+        user_agent={
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        },
+        **kwargs,
+    )
+    # NOTE(review): torch.load unpickles a downloaded file; confirm the source is trusted
+    # (or whether weights_only=True can be used).
+    lora_state_dict = torch.load(model_file, map_location="cpu")
+
+    # All keys in the LoRA state dictionary should have 'lora' somewhere in the string.
+    keys = list(lora_state_dict.keys())
+    # NOTE(review): this check disappears under `python -O`.
+    assert all("lora" in k for k in keys)
+
+    # NOTE(review): a file that holds both 'unet' and 'text_encoder' keys fails this all() test and
+    # takes the old-format branch below -- confirm that is intended.
+    if all(key.startswith(submodel_name) for key in keys):
+        # New format (https://github.com/huggingface/diffusers/pull/2918) supports LoRA weights in both the
+        # unet and text encoder where keys are prefixed with 'unet' or 'text_encoder', respectively.
+        submodel_state_dict = {k: v for k, v in lora_state_dict.items() if k.startswith(submodel_name)}
+    else:
+        # Old format. Keys will not have any prefix. This only applies to unet, so exit early if this is
+        # optimizing the text encoder.
+        if submodel_name != "unet":
+            return
+        submodel_state_dict = lora_state_dict
+
+    # Group LoRA weights into attention processors
+    attn_processors = {}
+    lora_grouped_dict = defaultdict(dict)
+    for key, value in submodel_state_dict.items():
+        # The last three dot-separated parts (e.g. "to_k_lora.down.weight") name the weight inside a
+        # processor; everything before them identifies the attention processor.
+        attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
+        lora_grouped_dict[attn_processor_key][sub_key] = value
+
+    for key, value_dict in lora_grouped_dict.items():
+        # Processor dimensions are read off the shapes of the to_k down/up matrices.
+        rank = value_dict["to_k_lora.down.weight"].shape[0]
+        cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1]
+        hidden_size = value_dict["to_k_lora.up.weight"].shape[0]
+
+        attn_processors[key] = LoRAAttnProcessor(
+            hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank
+        )
+        attn_processors[key].load_state_dict(value_dict)
+
+    # Merge LoRA attention processor weights into existing Q/K/V/Out weights
+    for name, proc in attn_processors.items():
+        # Drops the last len(".processor") characters; assumes every key ends with ".processor".
+        attention_name = name[: -len(".processor")]
+        # Walk the dotted path from base_model down to the attention module.
+        attention = reduce(getattr, attention_name.split(sep="."), base_model)
+        attention.to_q.weight.data += scale * torch.mm(proc.to_q_lora.up.weight, proc.to_q_lora.down.weight)
+        attention.to_k.weight.data += scale * torch.mm(proc.to_k_lora.up.weight, proc.to_k_lora.down.weight)
+        attention.to_v.weight.data += scale * torch.mm(proc.to_v_lora.up.weight, proc.to_v_lora.down.weight)
+        attention.to_out[0].weight.data += scale * torch.mm(proc.to_out_lora.up.weight, proc.to_out_lora.down.weight)
+
+
+# -----------------------------------------------------------------------------
+# TEXT ENCODER
+# -----------------------------------------------------------------------------
+
+
+def text_encoder_inputs(batch_size, torch_dtype):
+    """Return an all-zero tensor of shape ``(batch_size, 77)`` and dtype ``torch_dtype``."""
+    shape = (batch_size, 77)
+    return torch.zeros(shape, dtype=torch_dtype)
+
+
+def text_encoder_load(model_name):
+    """Load the base model's CLIP text encoder, merging LoRA weights for LoRA variants."""
+    encoder = CLIPTextModel.from_pretrained(get_base_model_name(model_name), subfolder="text_encoder")
+    if not is_lora_model(model_name):
+        return encoder
+    merge_lora_weights(encoder, model_name, "text_encoder")
+    return encoder
+
+
+def text_encoder_conversion_inputs(model=None):
+    """Return the batch-size-1 int32 text-encoder input used for conversion; ``model`` is unused."""
+    return text_encoder_inputs(batch_size=1, torch_dtype=torch.int32)
+
+
+@Registry.register_dataloader()
+def text_encoder_data_loader(dataset, batch_size, *args, **kwargs):
+    """Registered dataloader yielding generated int32 text-encoder inputs; ``dataset`` and extras are ignored."""
+    return RandomDataLoader(create_inputs_func=text_encoder_inputs, batch_size=batch_size, torch_dtype=torch.int32)
+
+
+@Registry.register_dataloader()
+def text_encoder_quantize_data_loader(dataset, data_num, *args, **kwargs):
+    """Registered dataloader serving up to ``data_num`` saved text-encoder samples; ``dataset`` is ignored."""
+    return TextEncoderGeneratedDataLoader(total=data_num)
+
+
+# -----------------------------------------------------------------------------
+# UNET
+# -----------------------------------------------------------------------------
+
+
+def unet_inputs(batch_size, torch_dtype, is_conversion_inputs=False):
+    """Build random unet inputs as an insertion-ordered dict.
+
+    With ``is_conversion_inputs`` the extra conditioning tensors are nested under
+    ``additional_inputs``; otherwise they are added as top-level entries next to
+    ``return_dict``. Callers may rely on the key order (``.values()`` is used for conversion).
+    """
+    # TODO(jstoecker): Rename onnx::Concat_4 to text_embeds and onnx::Shape_5 to time_ids
+    latent_side = config.unet_sample_size
+    inputs = {}
+    inputs["sample"] = torch.rand((batch_size, 4, latent_side, latent_side), dtype=torch_dtype)
+    inputs["timestep"] = torch.rand((batch_size,), dtype=torch_dtype)
+    inputs["encoder_hidden_states"] = torch.rand((batch_size, 77, config.cross_attention_dim), dtype=torch_dtype)
+
+    if not is_conversion_inputs:
+        inputs["return_dict"] = False
+        inputs["onnx::Concat_4"] = torch.rand((1, 1280), dtype=torch_dtype)
+        inputs["onnx::Shape_5"] = torch.rand((1, 5), dtype=torch_dtype)
+        return inputs
+
+    # Passed as keyword arguments since they would not land in the right position inside the input tuple.
+    text_embeds = torch.rand((1, 1280), dtype=torch_dtype)
+    time_ids = torch.rand((1, 5), dtype=torch_dtype)
+    inputs["additional_inputs"] = {
+        "return_dict": False,
+        "added_cond_kwargs": {"text_embeds": text_embeds, "time_ids": time_ids},
+    }
+    return inputs
+
+
+def get_unet_ov_example_input():
+    """Return the ``(latents, timestep, encoder_hidden_state)`` example tuple for the unet."""
+    hidden_state = torch.ones((2, 77, 768))
+    latent_side = 512 // 8
+    latents = torch.randn((2, 4, latent_side, latent_side))
+    timestep = torch.from_numpy(np.array(1, dtype=float))
+    return (latents, timestep, hidden_state)
+
+
+def unet_load(model_name):
+    """Load the base model's unet, merging LoRA weights for LoRA variants."""
+    unet = UNet2DConditionModel.from_pretrained(get_base_model_name(model_name), subfolder="unet")
+    if not is_lora_model(model_name):
+        return unet
+    merge_lora_weights(unet, model_name, "unet")
+    return unet
+
+
+def unet_load_qnn(model_name):
+    """Load the unet like ``unet_load`` and then apply ``monkey_patch_model`` to it."""
+    unet = UNet2DConditionModel.from_pretrained(get_base_model_name(model_name), subfolder="unet")
+    needs_lora_merge = is_lora_model(model_name)
+    if needs_lora_merge:
+        merge_lora_weights(unet, model_name, "unet")
+    monkey_patch_model(unet)
+    return unet
+
+
+def unet_conversion_inputs(model=None):
+    """Return the float32, batch-size-1 conversion inputs as a tuple in dict order; ``model`` is unused."""
+    conversion_inputs = unet_inputs(1, torch.float32, True)
+    return tuple(conversion_inputs.values())
+
+
+@Registry.register_dataloader()
+def unet_data_loader(dataset, batch_size, *args, **kwargs):
+    """Registered dataloader yielding generated float16 unet inputs; ``dataset`` and extras are ignored."""
+    return RandomDataLoader(create_inputs_func=unet_inputs, batch_size=batch_size, torch_dtype=torch.float16)
+
+
+@Registry.register_dataloader()
+def unet_quantize_data_loader(dataset, data_num, *args, **kwargs):
+    """Registered dataloader serving up to ``data_num`` saved unet samples; ``dataset`` is ignored."""
+    return UnetGeneratedDataLoader(total=data_num)
+
+
+# -----------------------------------------------------------------------------
+# VAE ENCODER
+# -----------------------------------------------------------------------------
+
+
+def vae_encoder_inputs(batch_size, torch_dtype):
+    """Return a random ``sample`` tensor of shape ``(batch_size, 3, S, S)`` with ``S = config.vae_sample_size``."""
+    side = config.vae_sample_size
+    sample = torch.rand((batch_size, 3, side, side), dtype=torch_dtype)
+    return {"sample": sample}
+
+
+def vae_encoder_load(model_name):
+    """Load the base model's VAE with ``forward`` replaced by encode-then-sample."""
+    vae = AutoencoderKL.from_pretrained(get_base_model_name(model_name), subfolder="vae")
+
+    def encode_and_sample(sample):
+        return vae.encode(sample)[0].sample()
+
+    vae.forward = encode_and_sample
+    return vae
+
+
+def vae_encoder_conversion_inputs(model=None):
+    """Return the float32, batch-size-1 VAE encoder inputs as a tuple; ``model`` is unused."""
+    conversion_inputs = vae_encoder_inputs(1, torch.float32)
+    return tuple(conversion_inputs.values())
+
+
+@Registry.register_dataloader()
+def vae_encoder_data_loader(dataset, batch_size, *args, **kwargs):
+    """Registered dataloader yielding generated float16 VAE encoder inputs; ``dataset`` and extras are ignored."""
+    return RandomDataLoader(create_inputs_func=vae_encoder_inputs, batch_size=batch_size, torch_dtype=torch.float16)
+
+
+@Registry.register_dataloader()
+def vae_encoder_quantize_data_loader(dataset, data_num, *args, **kwargs):
+    """Registered dataloader serving up to ``data_num`` saved VAE encoder samples; ``dataset`` is ignored."""
+    return VaeEncoderGeneratedDataLoader(total=data_num)
+
+
+# -----------------------------------------------------------------------------
+# VAE DECODER
+# -----------------------------------------------------------------------------
+
+
+def vae_decoder_inputs(batch_size, torch_dtype):
+    """Return a random ``latent_sample`` of shape ``(batch_size, 4, S, S)`` with ``S = config.unet_sample_size``."""
+    side = config.unet_sample_size
+    latent_sample = torch.rand((batch_size, 4, side, side), dtype=torch_dtype)
+    return {"latent_sample": latent_sample}
+
+
+def vae_decoder_load(model_name):
+    """Load the base model's VAE with ``forward`` bound to its ``decode`` method."""
+    vae = AutoencoderKL.from_pretrained(get_base_model_name(model_name), subfolder="vae")
+    vae.forward = vae.decode
+    return vae
+
+
+def vae_decoder_conversion_inputs(model=None):
+    """Return the float32, batch-size-1 VAE decoder inputs as a tuple; ``model`` is unused."""
+    conversion_inputs = vae_decoder_inputs(1, torch.float32)
+    return tuple(conversion_inputs.values())
+
+
+@Registry.register_dataloader()
+def vae_decoder_data_loader(dataset, batch_size, *args, **kwargs):
+    """Registered dataloader yielding generated float16 VAE decoder inputs; ``dataset`` and extras are ignored."""
+    return RandomDataLoader(create_inputs_func=vae_decoder_inputs, batch_size=batch_size, torch_dtype=torch.float16)
+
+
+@Registry.register_dataloader()
+def vae_decoder_quantize_data_loader(dataset, data_num, *args, **kwargs):
+    """Registered dataloader serving up to ``data_num`` saved VAE decoder samples; ``dataset`` is ignored."""
+    return VaeDecoderGeneratedDataLoader(total=data_num)
+
+
+# -----------------------------------------------------------------------------
+# SAFETY CHECKER
+# -----------------------------------------------------------------------------
+
+
+def safety_checker_inputs(batch_size, torch_dtype):
+    """Return random ``clip_input`` (``B x 3 x 224 x 224``) and ``images`` (``B x S x S x 3``) tensors.
+
+    ``S`` is ``config.vae_sample_size``.
+    """
+    clip_input = torch.rand((batch_size, 3, 224, 224), dtype=torch_dtype)
+    side = config.vae_sample_size
+    images = torch.rand((batch_size, side, side, 3), dtype=torch_dtype)
+    return {"clip_input": clip_input, "images": images}
+
+
+def safety_checker_load(model_name):
+    """Load the base model's safety checker with ``forward`` bound to ``forward_onnx``."""
+    checker = StableDiffusionSafetyChecker.from_pretrained(
+        get_base_model_name(model_name), subfolder="safety_checker"
+    )
+    checker.forward = checker.forward_onnx
+    return checker
+
+
+def safety_checker_conversion_inputs(model=None):
+    """Return the float32, batch-size-1 safety checker inputs as a tuple; ``model`` is unused."""
+    conversion_inputs = safety_checker_inputs(1, torch.float32)
+    return tuple(conversion_inputs.values())
+
+
+@Registry.register_dataloader()
+def safety_checker_data_loader(dataset, batch_size, *args, **kwargs):
+    """Registered dataloader yielding generated float16 safety checker inputs; ``dataset`` and extras are ignored."""
+    return RandomDataLoader(
+        create_inputs_func=safety_checker_inputs, batch_size=batch_size, torch_dtype=torch.float16
+    )
diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/winml.py b/sd-legacy-stable-diffusion-v1-5/aitk/winml.py
new file mode 100644
index 00000000..74a12c53
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/aitk/winml.py
@@ -0,0 +1,21 @@
+import json
+
+def _get_ep_paths() -> dict[str, str]:
+    """Return {provider.name: provider.library_path} for each provider in the default catalog."""
+    # Function-local imports: the winui3 modules are loaded only when this helper is called.
+    from winui3.microsoft.windows.applicationmodel.dynamicdependency.bootstrap import (
+        InitializeOptions,
+        initialize,
+    )
+    import winui3.microsoft.windows.ai.machinelearning as winml
+    library_paths = {}
+    with initialize(options=InitializeOptions.ON_NO_MATCH_SHOW_UI):
+        for ep in winml.ExecutionProviderCatalog.get_default().find_all_providers():
+            ep.ensure_ready_async().get()  # called before library_path is read
+            library_paths[ep.name] = ep.library_path
+            # DO NOT call provider.try_register in python. That will register to the native env.
+    return library_paths
+
+if __name__ == "__main__":
+    # Script entry point: print the name -> library path mapping to stdout as JSON.
+    print(json.dumps(_get_ep_paths()))
diff --git a/sd-legacy-stable-diffusion-v1-5/olive/sd_utils/ort.py b/sd-legacy-stable-diffusion-v1-5/olive/sd_utils/ort.py
index 69b1502b..4b20ee4a 100644
--- a/sd-legacy-stable-diffusion-v1-5/olive/sd_utils/ort.py
+++ b/sd-legacy-stable-diffusion-v1-5/olive/sd_utils/ort.py
@@ -53,7 +53,7 @@ def validate_ort_version(provider: str):
 
 
 def save_optimized_onnx_submodel(submodel_name, provider, model_info):
-    footprints_file_path = Path(__file__).resolve().parents[1] / "footprints" / submodel_name / "footprints.json"
+    footprints_file_path = Path(__file__).resolve().parents[1] / "footprints" / submodel_name / "footprint.json"
     with footprints_file_path.open("r") as footprint_file:
         footprints = json.load(footprint_file)
 
diff --git a/sd-legacy-stable-diffusion-v1-5/olive/winml.py b/sd-legacy-stable-diffusion-v1-5/olive/winml.py
new file mode 100644
index 00000000..74a12c53
--- /dev/null
+++ b/sd-legacy-stable-diffusion-v1-5/olive/winml.py
@@ -0,0 +1,21 @@
+import json
+
+def _get_ep_paths() -> dict[str, str]:
+    """Return {provider.name: provider.library_path} for each provider in the default catalog."""
+    # Function-local imports: the winui3 modules are loaded only when this helper is called.
+    from winui3.microsoft.windows.applicationmodel.dynamicdependency.bootstrap import (
+        InitializeOptions,
+        initialize,
+    )
+    import winui3.microsoft.windows.ai.machinelearning as winml
+    library_paths = {}
+    with initialize(options=InitializeOptions.ON_NO_MATCH_SHOW_UI):
+        for ep in winml.ExecutionProviderCatalog.get_default().find_all_providers():
+            ep.ensure_ready_async().get()  # called before library_path is read
+            library_paths[ep.name] = ep.library_path
+            # DO NOT call provider.try_register in python. That will register to the native env.
+    return library_paths
+
+if __name__ == "__main__":
+    # Script entry point: print the name -> library path mapping to stdout as JSON.
+    print(json.dumps(_get_ep_paths()))