diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 3426a2781bbc6..d70e2c1acbe94 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -292,12 +292,12 @@ bool BackendManager::ModelHasSymbolicInputDims(const onnxruntime::GraphViewer& s } // For dynamic models with incomplete reshape coverage, clear shapes - if (has_symbolic_dims && !all_dynamic_inputs_covered) { + /* if (has_symbolic_dims && !all_dynamic_inputs_covered) { session_context_.reshape.clear(); LOGS_DEFAULT(WARNING) << "reshape_input does not cover all dynamic dimensions, " << "ignoring all provided shapes"; return true; // Model is dynamic - } + }*/ // If shapes are valid with complete coverage for dynamic model, treat as concrete if (has_symbolic_dims && shapes_valid && all_dynamic_inputs_covered) { @@ -476,7 +476,9 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, }; [[maybe_unused]] bool enable_ovep_qdq_optimizer = session_context_.enable_qdq_optimizer && IsQDQGraph(subgraph); - [[maybe_unused]] std::optional enable_compiler_qdq_optimization = queryOVProperty("NPU_QDQ_OPTIMIZATION", session_context_.device_type); + [[maybe_unused]] std::optional enable_compiler_qdq_optimization = false; + if (session_context_.device_type.find("NPU") != std::string::npos) + enable_compiler_qdq_optimization = queryOVProperty("NPU_QDQ_OPTIMIZATION", "NPU"); #if (((OPENVINO_VERSION_MAJOR == 2025) && (OPENVINO_VERSION_MINOR > 0)) || (OPENVINO_VERSION_MAJOR > 2025)) if (session_context_.device_type.find("NPU") != std::string::npos && session_context_.enable_qdq_optimizer) { if (enable_compiler_qdq_optimization.has_value() && enable_compiler_qdq_optimization.value()) { diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 45e518d16686e..139439a9ee43d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -38,6 +38,16 @@ bool IsCILogEnabled() { return false; } +std::string get_shapes_string(const reshape_t& shapes) { + std::stringstream ss; + for (auto& shape : shapes) { + if (!ss.str().empty()) + ss << ", "; + ss << "\'" << shape.first << "': " << shape.second; + } + return ss.str(); +} + std::shared_ptr CreateOVModel(std::string&& model, const SessionContext& session_context, @@ -46,17 +56,27 @@ CreateOVModel(std::string&& model, std::cout << "CreateNgraphFunc" << std::endl; } try { - auto ov_model = OVCore::Get()->ReadModel(std::move(model), session_context.onnx_model_path_name.string()); + auto ov_model = OVCore::Get()->ReadModel(std::move(model), session_context.onnx_model_path_name.string()); + + if (!session_context.affinity.empty()) { + LOGS_DEFAULT(INFO) << log_tag << "Setting the ov nodes to specified affinity"; + Set_Affinity(ov_model, session_context); + } if (!session_context.reshape.empty()) { LOGS_DEFAULT(INFO) << log_tag << "Reshaping the ov tensor to specified shape"; ov_model->reshape(session_context.reshape); } + ov::preprocess::PrePostProcessor preproc(ov_model); + ov_model = preproc.build(); + + if (!session_context.layout.empty()) { LOGS_DEFAULT(INFO) << log_tag << "Setting the ov tensor layout to specified layout"; ov_model = Set_Layout(ov_model, session_context.layout); } + // Check for Constant Folding if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; @@ -141,6 +161,33 @@ std::shared_ptr Set_Layout(std::shared_ptr ov_model, const return preproc.build(); } +void Set_Affinity(std::shared_ptr ov_model, const SessionContext& session_context) { + + std::string selected_device = "CPU"; + if (auto delimit = session_context.device_type.find(":"); delimit != std::string::npos) { + auto device_mode = session_context.device_type.substr(0, delimit); + if (device_mode.find("HETERO") != std::string::npos) { + const auto& devices = session_context.device_type.substr(delimit + 1); + auto delimit_comma = devices.find(","); + selected_device = devices.substr(0, delimit_comma); + } else { + ORT_THROW("[ERROR] [OpenVINO] Invalid device_type is selected. Supported modes is HETERO"); + } + } else { + ORT_THROW("[ERROR] [OpenVINO] Invalid device_type is selected. Supported modes is HETERO"); + } + + for (auto&& ov_node : ov_model->get_ops()) { + auto name = ov_node->get_friendly_name(); + auto it = session_context.affinity.find(name); + if (it != session_context.affinity.end()) { + ov_node->get_rt_info()["affinity"] = it->second; + } else { + ov_node->get_rt_info()["affinity"] = selected_device; + } + } +} + int GetFirstAvailableDevice(SessionContext& session_context) { int i = 0; // Get the first available VAD-M device and set the device to busy diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 8ba35e0abd1bc..15ac81db1bdc1 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -106,6 +106,10 @@ void printPerformanceCounts(OVInferRequestPtr request, std::ostream& stream, std bool IsModelStreamXML(std::istream& model_stream); +void Set_Affinity(std::shared_ptr ov_model, const SessionContext& session_context); + +std::string get_shapes_string(const reshape_t& shapes); + } // namespace backend_utils } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index d7fc0553fb1d4..508b20213d402 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -99,6 +99,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr !session_context_.so_context_enable && session_context_.reshape.empty() && session_context_.layout.empty() && + session_context_.affinity.empty() && !enable_causallm && !eligible_for_cpu_fallback && auto_unified_compile); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 2cf3d3faa8b47..453efc5a56ca9 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -82,14 +82,17 @@ struct OnnxToOvNetworkBindings { } } - ORT_ENFORCE(matched_names, log_tag, - "Input names mismatch between OpenVINO and ONNX. ", onnx_name, - " doesn't exist in the list of OpenVINO input tensor names"); + //ORT_ENFORCE(matched_names, log_tag, + // "Input names mismatch between OpenVINO and ONNX. ", onnx_name, + // " doesn't exist in the list of OpenVINO input tensor names"); + if (!matched_names) { + continue; + } auto ov_param_index = std::distance(ov_parameters.begin(), it); - auto shape = ov_parameters[ov_param_index].get_partial_shape(); auto type = ov_parameters[ov_param_index].get_element_type(); + ParameterInfo info{onnx_name, ov_param_index, onnx_param_index, type, ParameterShape{shape}}; // Analyze shape dynamism and set flags @@ -112,7 +115,7 @@ struct OnnxToOvNetworkBindings { info.SetFullyDynamic(has_fully_dynamic); info.SetBoundedDynamic(has_bounded_dynamic); } - + input_output_map.push_back(std::move(info)); } }; diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index ebb716a64162c..bcbb463733b89 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -24,6 +24,7 @@ namespace fs = std::filesystem; using config_t = std::map; using reshape_t = std::map; using layout_t = std::map; +using affinity_t = std::map; struct ProviderInfo { std::string device_type{""}; // [device_type]: Overrides the accelerator hardware type and @@ -43,6 +44,7 @@ struct ProviderInfo { // it will be directly loaded. reshape_t reshape{}; // Used for reshaping the ov input tensor shape at runtime. layout_t layout{}; // Used for specifying the ov input/output tensor layout at runtime. + affinity_t affinity{}; // Used for specifying the nodes affinity at runtime. std::string model_priority{"DEFAULT"}; // High-level OpenVINO model priority hint // Defines what model should be provided with more performant // bounded resource first @@ -66,7 +68,7 @@ struct ProviderInfo { const ConfigOptions* config_options{NULL}; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer", - "enable_causallm", "disable_dynamic_shapes", "reshape_input", "layout"}; + "enable_causallm", "disable_dynamic_shapes", "reshape_input", "layout", "affinity"}; }; struct RuntimeConfig { diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index a099f85b2a4b9..77b0821412e4c 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -184,7 +184,6 @@ common::Status OpenVINOExecutionProvider::Compile( for (const auto& fused_node_graph : fused_nodes) { const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; - // Set include_embed_data to true only for the first backend manager backend_it->TryExportCompiledBlobAsEPCtxNode(graph_body_viewer, is_first); @@ -207,9 +206,9 @@ common::Status OpenVINOExecutionProvider::Compile( return status; } -#ifdef USE_OVEP_NPU_MEMORY + #ifdef USE_OVEP_NPU_MEMORY std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() { - if (session_context_.device_type.find("NPU") != std::string::npos) { + /* if (session_context_.device_type.find("NPU") != std::string::npos) { AllocatorCreationInfo npu_allocator_info{ [this](OrtDevice::DeviceId device_id) { return std::make_unique( @@ -223,9 +222,9 @@ std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() // fill in allocator return std::vector{CreateAllocator(npu_allocator_info)}; - } else { + } else {*/ return std::vector{}; - } + //} } #endif diff --git a/onnxruntime/core/providers/openvino/openvino_parser_utils.cc b/onnxruntime/core/providers/openvino/openvino_parser_utils.cc index a290fea73e0e8..c4daa2232eebf 100644 --- a/onnxruntime/core/providers/openvino/openvino_parser_utils.cc +++ b/onnxruntime/core/providers/openvino/openvino_parser_utils.cc @@ -127,7 +127,7 @@ reshape_t OpenVINOParserUtils::ParseInputShape(const std::string& reshape_input_ // Regular expressions for parsing const std::regex tensor_pattern(R"(([^\[\],]+)\s*\[(.*?)\])"); // e.g. "input_1[1..5, 2, 3..4],data[1,2,3]" - // const std::regex dimension_pattern(R"(\s*(\d+(?:\.\.\d+)?)\s*)"); // e.g. "1..5", "2", "3..4" + // const dimension_pattern(R"(\s*(\d+(?:\.\.\d+)?)\s*)"); // e.g. "1..5", "2", "3..4" const std::regex dimension_pattern(R"(\s*([^,\s]+)\s*)"); // Find all tensor shape definitions using regex auto tensor_begin = std::sregex_iterator( @@ -310,5 +310,33 @@ bool OpenVINOParserUtils::Check_Valid_Layout(const std::string& layout_str, cons return true; } +affinity_t OpenVINOParserUtils::ParseAffinity(const std::string& affinity_definition) { + LOGS_DEFAULT(INFO) << "[OpenVINO] Affinity is set : " << affinity_definition << "\n"; + affinity_t result_map; + + // Regex to capture device name and a list of nodes + // It captures: + // Group 1: device name (e.g., "device") + // Group 2: comma-separated list of nodes (e.g., "\"node1\", \"node2\"") + std::regex device_nodes_regex(R"(([^,\[\]]+)\[([^\]]+)\])"); + + std::sregex_iterator device_it(affinity_definition.begin(), affinity_definition.end(), device_nodes_regex); + std::sregex_iterator device_end; + + for (; device_it != device_end; ++device_it) { + std::smatch device_match = *device_it; + std::string device_name = device_match[1].str(); + std::string nodes_list_str = device_match[2].str(); + std::stringstream nodes_list(nodes_list_str); + std::string item; + + while (getline(nodes_list, item, ',')) { + result_map[item] = device_name; + } + } + + return result_map; +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_parser_utils.h b/onnxruntime/core/providers/openvino/openvino_parser_utils.h index a0936d627df40..090a133e906ab 100644 --- a/onnxruntime/core/providers/openvino/openvino_parser_utils.h +++ b/onnxruntime/core/providers/openvino/openvino_parser_utils.h @@ -22,6 +22,7 @@ class OpenVINOParserUtils { static std::string TrimWhitespace(const std::string& str); static ov::Dimension ParseDimensionRange(const std::string& range_str, const std::string& tensor_name); static bool Check_Valid_Layout(const std::string& layout_str, const std::string& tensor_name); + static affinity_t ParseAffinity(const std::string& affinity_definition); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 7eb5b062fe7c8..38d52151465cd 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -266,6 +266,10 @@ static void ParseProviderInfo(const ProviderOptions& provider_options, pi.layout = OpenVINOParserUtils::ParseLayout(provider_options.at("layout")); } + if (provider_options.contains("affinity")) { + pi.affinity = OpenVINOParserUtils::ParseAffinity(provider_options.at("affinity")); + } + if (provider_options.contains("load_config")) { auto parse_config = [&](const std::string& config_str) -> std::map { // If the config string is empty, return an empty map and skip processing diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 23be3447b8799..61477fe480566 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -84,7 +84,7 @@ std::shared_ptr OVCore::ReadModel(std::string&& model, const std::str ov::frontend::InputModel::Ptr inputModel; ov::AnyVector params{&modelStream, model_path}; - + FE = manager.load_by_model(params); if (FE) { inputModel = FE->load(params); diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 373b2121a9b60..51754046ae199 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -152,6 +152,7 @@ std::vector supported_op_mode = { {"GreaterOrEqual", V_2022_1, {"CPU", "GPU"}}, {"GridSample", V_2022_3, {"CPU"}}, {"GridSample", V_2023_0, {"GPU"}}, + {"GroupQueryAttention", V_2025_0, {"CPU"}}, {"GRU", V_2024_1, {"CPU", "GPU"}}, {"HardMax", V_2023_1, {"CPU", "GPU"}}, {"Identity", V_2020_4, {"CPU", "GPU"}}, diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 92cf6b085c01e..4961b290da8cf 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1083,7 +1083,7 @@ static std::shared_ptr CreateExecutionProviderFactory ProviderOptions OV_provider_options_map; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer", - "enable_causallm", "disable_dynamic_shapes", "reshape_input", "layout"}; + "enable_causallm", "disable_dynamic_shapes", "reshape_input", "layout", "affinity"}; auto it = provider_options_map.find(type); if (it != provider_options_map.end()) { for (auto option : it->second) { diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 2c9377d48f0c4..5cada81db449d 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -74,6 +74,7 @@ ABSL_FLAG(std::string, i, "", " [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n" " [OpenVINO only] [reshape_input]: Sets model input shapes with support for bounded dynamic dimensions using 'min..max' syntax (e.g., [1..10,3,224,224]) \n" " [OpenVINO only] [layout]: Specifies the layout for inputs/outputs to interpret tensor dimensions correctly. \n" + " [OpenVINO only] [affinity]: Specifies the affinity of a certain node to a specific device in Hetero Mode. \n" " [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU num_of_threads|5 enable_opencl_throttling|true reshape_input|[1,3,60,60..100] layout|[NCHW] cache_dir|\"\"\"\n" "\n" " [QNN only] [backend_type]: QNN backend type. E.g., 'cpu', 'htp'. Mutually exclusive with 'backend_path'.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index cb40a9beafeee..53b509fafe432 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -974,12 +974,14 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); ov_options[key] = value; } else if (key == "layout") { ov_options[key] = value; + } else if (key == "affinity") { + ov_options[key] = value; } else { ORT_THROW( "[ERROR] [OpenVINO] wrong key type entered. Choose from the following runtime key options that are available for OpenVINO." " ['device_type', 'device_id', 'num_of_threads', 'load_config', 'cache_dir', 'num_streams', " "'enable_opencl_throttling', 'disable_dynamic_shapes', 'enable_qdq_optimizer'," - " 'enable_causallm', 'reshape_input', 'layout', 'model_priority'] \n"); + " 'enable_causallm', 'reshape_input', 'layout', 'affinity', 'model_priority'] \n"); } } session_options.AppendExecutionProvider_OpenVINO_V2(ov_options);