From 96cd9991c25f50bc11cc8a7a04b556316cdbf8e7 Mon Sep 17 00:00:00 2001 From: MillyWei Date: Tue, 5 Aug 2025 13:42:43 +0800 Subject: [PATCH] Add BAAI/bge-small-en-v1.5 --- README.md | 2 +- baai-bge-small-en-v1.5/aitk/.gitignore | 6 + baai-bge-small-en-v1.5/aitk/README.md | 165 +++++++++++ .../bge-small-en-v1.5_context_ov_static.json | 203 +++++++++++++ ...mall-en-v1.5_context_ov_static.json.config | 182 ++++++++++++ .../aitk/bge-small-en-v1.5_qdq_amd.json | 230 +++++++++++++++ .../bge-small-en-v1.5_qdq_amd.json.config | 273 ++++++++++++++++++ .../aitk/bge-small-en-v1.5_qdq_qnn.json | 205 +++++++++++++ .../bge-small-en-v1.5_qdq_qnn.json.config | 231 +++++++++++++++ .../aitk/inference_sample.ipynb | 150 ++++++++++ baai-bge-small-en-v1.5/aitk/info.yml | 20 ++ .../aitk/model_project.config | 20 ++ baai-bge-small-en-v1.5/aitk/requirements.txt | 5 + baai-bge-small-en-v1.5/aitk/user_script.py | 143 +++++++++ 14 files changed, 1834 insertions(+), 1 deletion(-) create mode 100644 baai-bge-small-en-v1.5/aitk/.gitignore create mode 100644 baai-bge-small-en-v1.5/aitk/README.md create mode 100644 baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_context_ov_static.json create mode 100644 baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_context_ov_static.json.config create mode 100644 baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_amd.json create mode 100644 baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_amd.json.config create mode 100644 baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_qnn.json create mode 100644 baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_qnn.json.config create mode 100644 baai-bge-small-en-v1.5/aitk/inference_sample.ipynb create mode 100644 baai-bge-small-en-v1.5/aitk/info.yml create mode 100644 baai-bge-small-en-v1.5/aitk/model_project.config create mode 100644 baai-bge-small-en-v1.5/aitk/requirements.txt create mode 100644 baai-bge-small-en-v1.5/aitk/user_script.py diff --git a/README.md b/README.md index 53c5d84e..a97838f0 100644 --- a/README.md +++ 
b/README.md @@ -21,7 +21,7 @@ Below are list of available recipes grouped by different criteria. Click the lin | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | | [google-bert-bert-base-multilingual-cased](google-bert-bert-base-multilingual-cased/aitk) | [laion-CLIP-ViT-B-32-laion2B-s34B-b79K](laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk) | [deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B](deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk) | [meta-llama-Llama-3.2-1B-Instruct](meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx) | [mistralai-Mistral-7B-Instruct-v0.3](mistralai-Mistral-7B-Instruct-v0.3/aitk) | [microsoft-Phi-3.5-mini-instruct](microsoft-Phi-3.5-mini-instruct/aitk) | [microsoft-Phi-3.5-mini-instruct](microsoft-Phi-3.5-mini-instruct/NvTensorRtRtx) | [Qwen-Qwen2.5-1.5B-Instruct](Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx) | [microsoft-resnet-50](microsoft-resnet-50/aitk) | [google-vit-base-patch16-224](google-vit-base-patch16-224/aitk) | | [intel-bert-base-uncased-mrpc](intel-bert-base-uncased-mrpc/aitk) | [openai-clip-vit-base-patch16](openai-clip-vit-base-patch16/aitk) | | [meta-llama-Llama-3.2-1B-Instruct](meta-llama-Llama-3.2-1B-Instruct/aitk) | | [microsoft-Phi-4-mini-reasoning](microsoft-Phi-4-mini-reasoning/aitk) | | [Qwen-Qwen2.5-1.5B-Instruct](Qwen-Qwen2.5-1.5B-Instruct/aitk) | | | -| | [openai-clip-vit-base-patch32](openai-clip-vit-base-patch32/aitk) | | | | | | [deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B](deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx) | | | +|[BAAI/bge-small-en-v1.5](baai-bge-small-en-v1.5/aitk)| [openai-clip-vit-base-patch32](openai-clip-vit-base-patch32/aitk) | | | | | | [deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B](deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx) | | | diff --git a/baai-bge-small-en-v1.5/aitk/.gitignore b/baai-bge-small-en-v1.5/aitk/.gitignore new file mode 100644 index 00000000..f2371b12 --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/.gitignore @@ -0,0 +1,6 @@ +__pycache__ 
+/cache +/history/*/* +!/history/*/history.config +!/history/*/olive_config.json +.DS_Store diff --git a/baai-bge-small-en-v1.5/aitk/README.md b/baai-bge-small-en-v1.5/aitk/README.md new file mode 100644 index 00000000..f76ded4a --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/README.md @@ -0,0 +1,165 @@ +# BGE-Small-EN-v1.5 Optimization + +This folder contains examples of BGE-Small-EN-v1.5 optimization using different workflows for various hardware accelerators. + +## Model Overview + +BGE-Small-EN-v1.5 is a lightweight English text embedding model developed by BAAI (Beijing Academy of Artificial Intelligence). The model is optimized for sentence and text embedding tasks, providing high-quality vector representations for downstream applications such as semantic search, text classification, and similarity matching. + +## Optimization Workflows + +This directory provides three different optimization workflows targeting specific hardware accelerators: + +- **QDQ for Qualcomm NPU**: Quantization-aware training for Qualcomm Neural Processing Units +- **QDQ for AMD NPU**: Quantization-aware training for AMD Neural Processing Units +- **OpenVINO for Intel NPU**: OpenVINO optimization for Intel Neural Processing Units + +## Workflow Details + +### QDQ for Qualcomm NPU + +This workflow performs quantization-aware training optimization for Qualcomm NPU acceleration. It follows the optimization pipeline: + +- *HuggingFace Model → ONNX Model → Quantized ONNX Model* + +**Configuration File**: `bge-small-en-v1.5_qdq_qnn.json` + +**Key Features**: +- Uses QNN (Qualcomm Neural Network) execution provider +- Implements quantization-aware training with dynamic quantization +- Optimized for Qualcomm NPU hardware architecture +- Supports both activation and weight quantization + +### QDQ for AMD NPU + +This workflow performs quantization-aware training optimization for AMD NPU acceleration. 
It follows the optimization pipeline: + +- *HuggingFace Model → ONNX Model → Quantized ONNX Model* + +**Configuration File**: `bge-small-en-v1.5_qdq_amd.json` + +**Key Features**: +- Optimized for AMD NPU architecture +- Implements post-training static (QDQ) quantization +- Enhanced performance for AMD hardware +- Supports both activation and weight quantization + +### OpenVINO for Intel NPU + +This workflow performs OpenVINO optimization for Intel NPU acceleration. It follows the optimization pipeline: + +- *HuggingFace Model → OpenVINO IR Model* + +**Configuration File**: `bge-small-en-v1.5_context_ov_static.json` + +**Key Features**: +- Uses OpenVINO execution provider for Intel NPU +- Implements static quantization for optimal performance +- Custom user script for specialized data processing +- Enhanced accuracy evaluation using MTEB benchmarks + +## Dataset Information + +### Quantization Datasets +- **QNN NPU**: Uses MTEB Banking77 test split; **AMD NPU**: uses facebook/xnli (en) validation split for quantization calibration +- **Intel NPU**: Uses Wikipedia train split (300 samples) with custom preprocessing + +### Evaluation Datasets +- **Primary**: MTEB Banking77 classification task +- **Evaluation Metric**: Custom embedding accuracy for semantic similarity +- **Benchmark**: MTEB (Massive Text Embedding Benchmark) for standardized evaluation + +## Performance Evaluation Results + +The following results are based on comprehensive evaluation using standard embedding benchmarks and performance metrics. Accuracy evaluations use the MTEB Banking77 dataset for consistency. 
+ +### Qualcomm NPU (QNN) Performance + +| Metric | Value | +|--------|-------| +| **Accuracy** | 85.57% | +| **Latency (avg)** | 14.83 ms | +| **Latency (min)** | 13.66 ms | +| **Latency (max)** | 17.92 ms | +| **Latency (p90)** | 15.52 ms | +| **Throughput (avg)** | 70.97 tokens/sec | +| **Throughput (max)** | 72.83 tokens/sec | +| **Throughput (min)** | 68.47 tokens/sec | + +### AMD NPU Performance + +| Metric | Value | +|--------|-------| +| **Accuracy** | 83.66% | +| **Latency (avg)** | 8.58 ms | +| **Latency (min)** | 7.54 ms | +| **Latency (max)** | 9.43 ms | +| **Latency (p90)** | 9.13 ms | +| **Throughput (avg)** | 107.26 tokens/sec | +| **Throughput (max)** | 130.15 tokens/sec | +| **Throughput (min)** | 88.90 tokens/sec | + +### Intel NPU Performance + +| Metric | Value | +|--------|-------| +| **Accuracy** | 85.42% | +| **Latency (avg)** | 3.33 ms | +| **Latency (min)** | 2.30 ms | +| **Latency (max)** | 6.39 ms | +| **Latency (p90)** | 4.01 ms | +| **Throughput (avg)** | 312.15 tokens/sec | +| **Throughput (max)** | 421.12 tokens/sec | +| **Throughput (min)** | 199.13 tokens/sec | + +## Optimization Techniques + +### Quantization Strategies +- **Dynamic Quantization**: Used for QNN and AMD NPU workflows +- **Static Quantization**: Used for Intel NPU workflow with OpenVINO +- **Mixed Precision**: Combines different precision levels for optimal performance + +### Model Optimization Features +- **Input Optimization**: Fixed input shapes for better inference performance +- **Memory Optimization**: Efficient memory usage through quantization +- **Hardware-Specific Tuning**: Custom optimizations for each NPU architecture + +## Requirements + +The following dependencies are required for running the optimization workflows: + +``` +olive-ai +datasets +optimum +mteb +polars-lts-cpu (QNN only) +``` + +## Usage + +1. 
**Select Workflow**: Choose the appropriate configuration file based on your target hardware: + - For Qualcomm NPU: `bge-small-en-v1.5_qdq_qnn.json` + - For AMD NPU: `bge-small-en-v1.5_qdq_amd.json` + - For Intel NPU: `bge-small-en-v1.5_context_ov_static.json` + +2. **Configure Parameters**: Adjust quantization parameters such as activation type, weight type, and quantization dataset according to your specific requirements. + +3. **Run Optimization**: Execute the optimization pipeline using the selected configuration. + +4. **Evaluate Results**: Use the provided evaluation scripts to assess model performance on your target hardware. + +## Performance Notes + +- **Accuracy**: Measured using custom embedding accuracy metrics from MTEB benchmark +- **Latency**: Measured in milliseconds per inference +- **Throughput**: Measured in tokens per second + +## Model Information + +- **Model ID**: `BAAI/bge-small-en-v1.5` +- **Model Type**: Text Embedding Model +- **Framework**: HuggingFace Transformers +- **Optimization Target**: Hardware-specific acceleration for embedding generation + +*Note: Performance metrics may vary depending on hardware specifications, system environment, and workload characteristics. 
The values provided here are for reference and may not reflect performance on all devices or configurations.* \ No newline at end of file diff --git a/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_context_ov_static.json b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_context_ov_static.json new file mode 100644 index 00000000..6ed5c6d0 --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_context_ov_static.json @@ -0,0 +1,203 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "BAAI/bge-small-en-v1.5", + "task": "feature-extraction", + "io_config": { + "input_names": [ + "input_ids", + "attention_mask", + "token_type_ids" + ], + "input_shapes": [ + [ + 1, + 128 + ], + [ + 1, + 128 + ], + [ + 1, + 128 + ] + ], + "input_types": [ + "int64", + "int64", + "int64" + ], + "output_names": [ + "last_hidden_state", + "state" + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "OpenVINOExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantize_data_config", + "user_script": "user_script.py", + "load_dataset_config": { + "type": "bge_small_en_dataset", + "data_name": "wikipedia", + "split": "train", + "max_samples": 300 + }, + "dataloader_config": { + "batch_size": 1, + "drop_last": true + } + }, + { + "name": "accuracy_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "mteb/banking77", + "split": "test" + }, + "pre_process_data_config": { + "max_length": 128, + "padding": "max_length", + "input_cols": ["text"] + }, + "dataloader_config": { + "batch_size": 1 + } + }, + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "mteb/banking77", + "split": "test" + }, + "pre_process_data_config": { + "max_length": 128, + "padding": "max_length", + "input_cols": ["text"] + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { 
+ "common_evaluator": { + "metrics": [ + { + "name": "accuracy", + "type": "custom", + "sub_types": [ + { + "name": "embedding_accuracy", + "priority": 1, + "higher_is_better": true, + "goal": { "type": "max-degradation", "value": 0.05 } + } + ], + "user_config": { + "user_script": "user_script.py", + "evaluate_func": "eval_accuracy" + } + }, + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { "name": "avg", "priority": 2, "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p50", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p75", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p95", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p99", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "min", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "max", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { "name": "avg", "priority": 3, "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p50", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p75", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p95", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p99", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "min", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "max", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } } + ] + } + ] + } + }, + "passes": { + "optimum_convert": 
{ + "type": "OpenVINOOptimumConversion", + "extra_args": { + "device": "npu", + "task": "feature-extraction" + } + }, + "io_update": { + "type": "OpenVINOIoUpdate", + "input_shapes": [ + [ + 1, + 128 + ], + [ + 1, + 128 + ], + [ + 1, + 128 + ] + ], + "static": true + }, + "ov_quantize": { + "type": "OpenVINOQuantization", + "target_device": "npu", + "data_config": "quantize_data_config", + "model_type": "TRANSFORMER", + "user_script": "user_script.py", + "transform_fn": "custom_transform_func", + "extra_configs": [ + { + "advanced_quantization_parameters": { + "smooth_quant_alpha": 0.6 + } + } + ] + }, + "encapsulation": { + "type": "OpenVINOEncapsulation", + "target_device": "npu", + "ov_version": "2025.1" + } + }, + "cache_dir": "cache", + "evaluate_input_model": false, + "evaluator": "common_evaluator", + "host": "local_system", + "output_dir": "models/bge-small-en-v1.5/openvino", + "target": "local_system" +} \ No newline at end of file diff --git a/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_context_ov_static.json.config b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_context_ov_static.json.config new file mode 100644 index 00000000..d7134549 --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_context_ov_static.json.config @@ -0,0 +1,182 @@ +{ + "name": "Convert to Intel CPU/NPU/GPU", + "oliveFile": "bge/bge-small-en-v1.5_ptq_qnn.json", + "isIntel": true, + "debugInfo": { + "autoGenerated": true, + "useOpenVINOOptimumConversion": "optimum_convert" + }, + "addCpu": false, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "Intel CPU", + "Intel GPU", + "Intel NPU" + ], + "path": "systems.local_system.accelerators.0.device", + "values": [ + "cpu", + "gpu", + "npu" + ], + "readOnly": false + }, + "runtimeInConversion": { + "autoGenerated": true, + "name": "Convert/Quantize to", + "type": "enum", + "displayNames": [ + "Intel CPU", + "Intel GPU", + "Intel NPU" + ], + "path": 
"passes.optimum_convert.extra_args.device", + "values": [ + "cpu", + "gpu", + "npu" + ], + "actions": [ + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "cpu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "cpu" + } + ], + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "gpu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "gpu" + } + ], + [ + { + "type": "update", + "path": "passes.ov_quantize.target_device", + "value": "npu" + }, + { + "type": "update", + "path": "passes.encapsulation.target_device", + "value": "npu" + } + ] + ] + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.optimum_convert", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "wikipedia" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "wikipedia" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].load_dataset_config.max_samples", + "template": { + "path": "data_configs[0].load_dataset_config.max_samples", + "template": 
"QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.optimum_convert", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_amd.json b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_amd.json new file mode 100644 index 00000000..a40f3446 --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_amd.json @@ -0,0 +1,230 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "BAAI/bge-small-en-v1.5", + "task": "feature-extraction", + "io_config": { + "input_names": [ + "input_ids", + "attention_mask", + "token_type_ids" + ], + "input_shapes": [ + [ + 1, + 128 + ], + [ + 1, + 128 + ], + [ + 1, + 128 + ] + ], + "input_types": [ + "int64", + "int64", + "int64" + ], + "output_names": [ + "last_hidden_state", + "state" + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "VitisAIExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantization_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + "input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + }, + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "facebook/xnli", + "subset": "en", + "split": "validation" + }, + "pre_process_data_config": { + 
"input_cols": [ + "premise" + ], + "padding": "max_length", + "max_length": 128, + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 2, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { "name": "p50", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p75", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p95", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p99", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "min", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "max", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 3 + }, + { "name": "p50", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p75", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p95", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p99", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "min", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "max", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } } + ] + }, + { + "name": "accuracy", + "type": "custom", + "sub_types": [ + { + "name": "accuracy_custom", + "priority": 1, + "higher_is_better": true, + "goal": { + "type": "max-degradation", + "value": 0.05 + } + } + ], + 
"user_config": { + "user_script": "user_script.py", + "evaluate_func": "eval_accuracy", + "evaluate_func_kwargs": { + "tasks": [ + "Banking77Classification" + ] + } + } + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true, + "all_tensors_to_one_file": true + }, + "transformer_optimizer": { + "type": "OrtTransformersOptimization", + "model_type": "bert", + "opt_level": 0 + }, + "dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ], + "save_as_external_data": true + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue" + } + ], + "save_as_external_data": true + }, + "OnnxQuantization": { + "type": "OnnxStaticQuantization", + "data_config": "quantization_data_config", + "activation_type": "uint8", + "precision": "uint8", + "per_channel": false, + "reduce_range": false, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "nodes_to_quantize": [ + "MatMul", + "Gemm", + "Conv" + ], + "nodes_to_exclude": [ + "pooler.dense" + ] + }, + "addmetadata": { + "type": "VitisAIAddMetaData", + "config_meta_data_keys": [ + "architectures", + "model_type" + ], + "activation_type": "uint8", + "weight_type": "uint8", + "quant_type": "OnnxStaticQuantization" + } + }, + "cache_dir": "cache", + "evaluate_input_model": false, + "evaluator": "common_evaluator", + "host": "local_system", + "output_dir": "models/bge-small-en-v1.5/amd", + "target": "local_system" +} \ No newline at end of file diff --git a/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_amd.json.config b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_amd.json.config new file mode 100644 index 00000000..e221c559 --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_amd.json.config @@ -0,0 +1,273 @@ +{ + "name": "Convert to AMD NPU", + "oliveFile": 
"bge/bge-small-en-v1.5_ptq_qnn.json", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "AMD NPU", + "CPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "VitisAIExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Activation Type", + "tags": [ + "ActivationType" + ], + "description": "Quantization data type of activation. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.activation_type", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.activation_type", + "template": "ActivationType" + } + }, + { + "name": "Weight Type", + "tags": [ + "WeightType" + ], + "description": "Data type for quantizing weights. 
‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.precision", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.precision", + "template": "WeightType" + } + }, + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Subset", + "tags": [ + "QuantizationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[0].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "QuantizationDatasetSubset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.OnnxQuantization", + 
"actions": [ + [], + [ + { + "type": "update", + "path": "passes", + "value": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + } + } + } + ] + ] + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": { + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "facebook/xnli" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Subset", + "tags": [ + "EvaluationDatasetSubset", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": { + "path": "data_configs[1].load_dataset_config.subset", + "values": [ + "en", + "all_languages" + ], + "template": "EvaluationDatasetSubset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[1].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_qnn.json b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_qnn.json new file mode 
100644 index 00000000..93d45465 --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_qnn.json @@ -0,0 +1,205 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "BAAI/bge-small-en-v1.5", + "task": "feature-extraction", + "io_config": { + "input_names": [ + "input_ids", + "attention_mask", + "token_type_ids" + ], + "input_shapes": [ + [ + 1, + 128 + ], + [ + 1, + 128 + ], + [ + 1, + 128 + ] + ], + "input_types": [ + "int64", + "int64", + "int64" + ], + "output_names": [ + "last_hidden_state", + "state" + ] + } + }, + "systems": { + "qnn_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "npu", + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "quantization_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "mteb/banking77", + "split": "test" + }, + "pre_process_data_config": { + "max_length": 128, + "padding": "max_length", + "input_cols": [ + "text" + ], + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + }, + { + "name": "evaluation_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "mteb/banking77", + "split": "test" + }, + "pre_process_data_config": { + "max_length": 128, + "padding": "max_length", + "input_cols": [ + "text" + ], + "max_samples": 10 + }, + "dataloader_config": { + "batch_size": 1 + } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 1, + "goal": { + "type": "percent-min-improvement", + "value": 0.1 + } + }, + { "name": "p50", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p75", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p95", "metric_config": 
{ "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p99", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "min", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "max", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } } + ] + }, + { + "name": "throughput", + "type": "throughput", + "data_config": "evaluation_data_config", + "sub_types": [ + { + "name": "avg", + "priority": 2 + }, + { "name": "p50", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p75", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p95", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "p99", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "min", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, + { "name": "max", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } } + ] + }, + { + "name": "accuracy", + "type": "custom", + "sub_types": [ + { + "name": "accuracy_custom", + "priority": 3, + "higher_is_better": true, + "goal": { + "type": "max-degradation", + "value": 0.05 + } + } + ], + "user_config": { + "user_script": "user_script.py", + "evaluate_func": "eval_accuracy", + "evaluate_func_kwargs": { + "tasks": [ + "Banking77Classification" + ] + } + } + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + }, + "to_fixed_shape": { + "type": "DynamicToFixedShape", + "dim_param": [ + "batch_size", + "sequence_length" + ], + "dim_value": [ + 1, + 128 + ] + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "ReplaceAttentionMaskValue" + } + ] + }, + "QNNPreprocess": { + "type": "QNNPreprocess", + "fuse_layernorm": true + }, + "OnnxQuantization": { + "type": "OnnxQuantization", + "data_config": 
"quantization_data_config", + "activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax", + "quant_preprocess": true, + "save_as_external_data": true + } + }, + "cache_dir": "cache", + "evaluate_input_model": false, + "evaluator": "common_evaluator", + "host": "qnn_system", + "output_dir": "models/bge-small-en-v1.5/qnn", + "target": "qnn_system" +} \ No newline at end of file diff --git a/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_qnn.json.config b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_qnn.json.config new file mode 100644 index 00000000..071ab0b3 --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/bge-small-en-v1.5_qdq_qnn.json.config @@ -0,0 +1,231 @@ +{ + "name": "Convert to Qualcomm NPU", + "oliveFile": "bge/bge-small-en-v1.5_ptq_qnn.json", + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "Qualcomm NPU", + "CPU" + ], + "path": "systems.qnn_system.accelerators.0.execution_providers.0", + "values": [ + "QNNExecutionProvider", + "CPUExecutionProvider" + ], + "readOnly": false + }, + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "name": "Quantize", + "phase": "Quantization", + "parameters": [ + { + "name": "Activation Type", + "tags": [ + "ActivationType" + ], + "description": "Quantization data type of activation. 
‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.activation_type", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.activation_type", + "template": "ActivationType" + } + }, + { + "name": "Weight Type", + "tags": [ + "WeightType" + ], + "description": "Data type for quantizing weights. ‘Int8’ for signed 8-bit integer, ‘UInt8’ for unsigned 8-bit integer etc.", + "descriptionLink": "https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html", + "type": "enum", + "displayNames": [ + "Int8", + "UInt8", + "Int16", + "UInt16" + ], + "displayType": "RadioGroup", + "path": "passes.OnnxQuantization.precision", + "values": [ + "int8", + "uint8", + "int16", + "uint16" + ], + "template": { + "path": "passes.OnnxQuantization.precision", + "template": "WeightType" + } + }, + { + "name": "Quantization Dataset", + "tags": [ + "QuantizationDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "mteb/banking77" + ], + "template": { + "path": "data_configs[0].load_dataset_config.data_name", + "values": [ + "mteb/banking77" + ], + "template": "QuantizationDataset" + } + }, + { + "name": "Quantization Dataset Split", + "tags": [ + "QuantizationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[0].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[0].load_dataset_config.split", + "template": "QuantizationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[0].pre_process_data_config.max_samples", + "template": { + "path": 
"data_configs[0].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Quantize model", + "type": "bool", + "path": "passes.OnnxQuantization", + "actions": [ + [], + [ + { + "type": "update", + "path": "passes", + "value": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20, + "save_as_external_data": true + } + } + } + ] + ] + } + }, + { + "name": "Evaluate", + "phase": "Evaluation", + "parameters": [ + { + "name": "Evaluation Dataset", + "tags": [ + "EvaluationDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "mteb/banking77" + ], + "template": { + "path": "data_configs[1].load_dataset_config.data_name", + "values": [ + "mteb/banking77" + ], + "template": "EvaluationDataset" + } + }, + { + "name": "Evaluation Dataset Split", + "tags": [ + "EvaluationDatasetSplit", + "DependsOnDataset" + ], + "type": "enum", + "path": "data_configs[1].load_dataset_config.split", + "values": [ + "train", + "validation", + "test" + ], + "template": { + "path": "data_configs[1].load_dataset_config.split", + "template": "EvaluationDatasetSplit" + } + }, + { + "name": "Quantization Dataset Size", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": { + "path": "data_configs[1].pre_process_data_config.max_samples", + "template": "QuantizationDatasetSize" + } + } + ], + "toggle": { + "autoGenerated": true, + "name": "Evaluate model performance", + "type": "bool", + "path": "evaluator", + "actions": [ + [], + [ + { + "type": "delete", + "path": "evaluator" + } + ] + ] + } + } + ] +} diff --git a/baai-bge-small-en-v1.5/aitk/inference_sample.ipynb b/baai-bge-small-en-v1.5/aitk/inference_sample.ipynb new file mode 100644 index 00000000..8120cbc3 --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/inference_sample.ipynb @@ -0,0 +1,150 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onnx_model_path = \"./model/model.onnx\"\n", + "\n", + "ExecutionProvider=\"QNNExecutionProvider\"\n", + "if ExecutionProvider == \"OpenVINOExecutionProvider\":\n", + " onnx_model_path = \"./model/openvino_model_st_quant.onnx\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = \"This is an example sentence.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "from transformers import AutoModel, AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def mean_pooling(model_output, attention_mask):\n", + " token_embeddings = torch.tensor(model_output[0])\n", + " input_mask_expanded = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()\n", + " return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')\n", + "encoded_input = tokenizer(\n", + " inputs,\n", + " padding=\"max_length\",\n", + " max_length=128,\n", + " truncation=True,\n", + " add_special_tokens=True,\n", + " return_tensors=\"pt\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):\n", + " ep_devices = ort.get_ep_devices()\n", + " for ep_device in ep_devices:\n", + " if ep_device.ep_name == ep_name and ep_device.device.type == device_type:\n", + " print(f\"Adding {ep_name} for {device_type}\")\n", + " 
session_options.add_provider_for_devices([ep_device], {} if ep_options is None else ep_options)\n", + "\n", + "\n", + "session_options = ort.SessionOptions()\n", + "\n", + "add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)\n", + "\n", + "session = ort.InferenceSession(\n", + " onnx_model_path, # a model with QNN EPContext nodes\n", + " sess_options=session_options,\n", + ")\n", + "\n", + "input_ids = encoded_input[\"input_ids\"]\n", + "attention_mask = encoded_input[\"attention_mask\"]\n", + "token_type_ids = encoded_input[\"token_type_ids\"]\n", + "inputs = {\n", + " \"input_ids\": input_ids.long().cpu().numpy(),\n", + " \"attention_mask\": attention_mask.long().cpu().numpy(),\n", + " \"token_type_ids\": token_type_ids.long().cpu().numpy()\n", + "}\n", + "\n", + "outputs = session.run(None, inputs)\n", + "embeds_1 = mean_pooling(outputs, encoded_input['attention_mask'])\n", + "embeds_1 = F.normalize(embeds_1, p=2, dim=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get text embedding from original model, as ground truth.\n", + "model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5').eval()\n", + "with torch.no_grad():\n", + " outputs = model(**encoded_input)\n", + " embeds_2 = mean_pooling(outputs, encoded_input['attention_mask'])\n", + " embeds_2 = F.normalize(embeds_2, p=2, dim=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "similarity = F.cosine_similarity(embeds_1, embeds_2).item()\n", + "print(\"Similarity: \", similarity)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": 
"ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/baai-bge-small-en-v1.5/aitk/info.yml b/baai-bge-small-en-v1.5/aitk/info.yml new file mode 100644 index 00000000..fc040a82 --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/info.yml @@ -0,0 +1,20 @@ +keywords: + aitk +arch: bert +recipes: + - file: "bge-small-en-v1.5_qdq_qnn.json" + device: npu + ep: QNNExecutionProvider + - file: "bge-small-en-v1.5_qdq_amd.json" + device: npu + ep: VitisAIExecutionProvider + - file: "bge-small-en-v1.5_context_ov_static.json" + devices: + - npu + - cpu + - gpu + ep: OpenVINOExecutionProvider +aitk: + modelInfo: + id: "huggingface/BAAI/bge-small-en-v1.5" + version: 1 diff --git a/baai-bge-small-en-v1.5/aitk/model_project.config b/baai-bge-small-en-v1.5/aitk/model_project.config new file mode 100644 index 00000000..05101c4e --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/model_project.config @@ -0,0 +1,20 @@ +{ + "workflows": [ + { + "file": "bge-small-en-v1.5_qdq_qnn.json", + "templateName": "bge-small-en-v1.5_qdq_qnn" + }, + { + "file": "bge-small-en-v1.5_qdq_amd.json", + "templateName": "bge-small-en-v1.5_qdq_amd" + }, + { + "file": "bge-small-en-v1.5_context_ov_static.json", + "templateName": "bge-small-en-v1.5_context_ov_static" + } + ], + "modelInfo": { + "id": "huggingface/BAAI/bge-small-en-v1.5", + "version": 1 + } +} \ No newline at end of file diff --git a/baai-bge-small-en-v1.5/aitk/requirements.txt b/baai-bge-small-en-v1.5/aitk/requirements.txt new file mode 100644 index 00000000..c1df102f --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/requirements.txt @@ -0,0 +1,5 @@ +olive-ai +datasets +optimum +mteb +polars-lts-cpu \ No newline at end of file diff --git a/baai-bge-small-en-v1.5/aitk/user_script.py b/baai-bge-small-en-v1.5/aitk/user_script.py new file mode 100644 index 00000000..5530cb2d --- /dev/null +++ b/baai-bge-small-en-v1.5/aitk/user_script.py @@ -0,0 +1,143 @@ +# 
------------------------------------------------------------------------- +# Copyright (c) Intel Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import datasets +import mteb +import numpy as np +import torch +from transformers import AutoTokenizer + +from olive.constants import Framework +from olive.data.registry import Registry +from olive.model import OliveModelHandler + +# ------------------------------------------------------------------------- +# Common Dataset +# ------------------------------------------------------------------------- + +seed = 0 +# seed everything to 0 for reproducibility, https://pytorch.org/docs/stable/notes/randomness.html +# do not set random seed and np.random.seed for aml test, since it will cause aml job name conflict +torch.manual_seed(seed) +# the following are needed only for GPU +torch.cuda.manual_seed(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +# set max sequence length +MAX_SEQ_LENGTH = 128 + +# define the tokenizer for BGE model +tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5") +VOCAB_SIZE = len(tokenizer) + +# set default input +default_input = torch.ones(1, MAX_SEQ_LENGTH, dtype=torch.int64) + +# define model inputs +model_inputs = { + "input_ids": default_input, + "attention_mask": default_input, + "token_type_ids": default_input, +} + +# capture input names +INPUT_NAMES = list(model_inputs) + + +class OliveEncoder: + def __init__(self, model, session): + self.model = model + self.session = session + self.tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5") + + def encode(self, corpus: list, **kwargs): + model_output = None + if self.model.framework == Framework.ONNX: + encoded_input = self.tokenizer( + corpus, padding="max_length", max_length=MAX_SEQ_LENGTH, truncation=True, return_tensors="np" + ) + # batch_size is 1 for static model + 
model_outputs = [] + for i in range(len(corpus)): + model_inputs = { + "input_ids": encoded_input.input_ids[i : i + 1, :].astype(np.int64), + "attention_mask": encoded_input.attention_mask[i : i + 1, :].astype(np.int64), + "token_type_ids": encoded_input.token_type_ids[i : i + 1, :].astype(np.int64), + } + model_output = self.model.run_session(self.session, model_inputs)[0] + model_outputs.append(model_output[0]) + model_output = np.array(model_outputs) + elif self.model.framework == Framework.PYTORCH: + encoded_input = self.tokenizer(corpus, padding=True, truncation=True, return_tensors="pt") + model_inputs = { + "input_ids": encoded_input.input_ids, + "attention_mask": encoded_input.attention_mask, + "token_type_ids": encoded_input.token_type_ids, + } + with torch.no_grad(): + model_output = self.model.run_session(self.session, model_inputs) + model_output = model_output.last_hidden_state.numpy() + # select the last hidden state of the first token (i.e., [CLS]) as the sentence embedding. + return model_output[:, 0, :] + + +def eval_accuracy(model: OliveModelHandler, device, execution_providers, tasks=None): + """Evaluate accuracy using MTEB (Massive Text Embedding Benchmark) for standardized evaluation.""" + sess = model.prepare_session(inference_settings=None, device=device, execution_providers=execution_providers) + + # Use default tasks if none provided + if tasks is None: + tasks = ["Banking77Classification"] # Default to Banking77 for BGE model evaluation + + evaluation = mteb.MTEB(tasks=tasks) + olive_encoder = OliveEncoder(model, sess) + results = evaluation.run(olive_encoder, output_folder=None) + + # Return the main score from the first task + return results[0].scores["test"][0]["main_score"] + + +@Registry.register_dataset() +def bge_small_en_dataset(data_name, split, max_samples): + # load the raw wikipedia dataset for tuning. Load just 300 examples for speed. 
+ raw_dataset = datasets.load_dataset(data_name, split=split, trust_remote_code=True) + + # Apply max_samples limit after loading + if max_samples: + raw_dataset = raw_dataset.select(range(min(max_samples, len(raw_dataset)))) + + def _preprocess_fn(examples): + return tokenizer( + examples["text"], + padding="max_length", + max_length=MAX_SEQ_LENGTH, + truncation=True, + ) + + # preprocess the dataset + return raw_dataset.map(_preprocess_fn, batched=True, batch_size=1) + + +def custom_transform_func(data_item): + return { + name: np.asarray([np.array([g.flatten() for g in data_item[name]]).flatten()], dtype=np.int64) + for name in INPUT_NAMES + } + + +def custom_example_func(): + vocab_size = VOCAB_SIZE + batch_size = 1 + sequence_length = MAX_SEQ_LENGTH + + input_ids = torch.randint(0, vocab_size, (batch_size, sequence_length)) + + # All-ones attention_mask (treats every position as a real token) + attention_mask = default_input + + # All-ones token_type_ids reused from the default input tensor + token_type_ids = default_input + + return [input_ids, attention_mask, token_type_ids] \ No newline at end of file