From 61c97fd38be815bc44fad0620a07a1671b0489c7 Mon Sep 17 00:00:00 2001 From: Raul Torres <138264735+rauletorresc@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:54:01 +0000 Subject: [PATCH] Unify device and priority dimension order in stream pool management **Description of the problem** Currently, `npu_counters` and `npu_streams` multi-dimensional arrays have priority as the major dimension enclosing the minor device and stream dimensions, e.g. `npu_streams[priority_id][device_id][stream_id]`. However, `device_priority_flags` has priority and device interchanged. Moreover, iterating through `npu_streams` is done inefficiently, with the inner loop iterating through the outermost dimension, and the outer loop iterating through the innermost dimension. **Proposed solution** * Make `npu_counters` and `npu_streams` follow `device_priority_flags` dimension order, e.g. `npu_streams[device_id][priority_id][stream_id]`. * Rearrange loops so they follow and exploit the row-major order. --- torch_npu/csrc/core/npu/NPUStream.cpp | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 07b36b0e0..4f5112d04 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -72,8 +72,8 @@ static std::once_flag device_priority_flags[C10_COMPILE_TIME_MAX_NPUS][kMaxStrea // SyncLaunch streams pool init flags static std::once_flag device_sync_launch_flags[C10_COMPILE_TIME_MAX_NPUS]; static std::array< - std::array, C10_COMPILE_TIME_MAX_NPUS>, - kMaxStreamPriorities> + std::array, kMaxStreamPriorities>, + C10_COMPILE_TIME_MAX_NPUS> npu_counters; static std::atomic sync_stream_counters[C10_COMPILE_TIME_MAX_NPUS]; // npu_streams is a stream pool, each device has a stream pool, @@ -81,8 +81,8 @@ static std::atomic sync_stream_counters[C10_COMPILE_TIME_MAX_NPUS]; static std::array< std::array< std::array, - C10_COMPILE_TIME_MAX_NPUS>, 
- kMaxStreamPriorities> + kMaxStreamPriorities>, + C10_COMPILE_TIME_MAX_NPUS> npu_streams; static thread_local std::unique_ptr current_streams = nullptr; @@ -177,9 +177,9 @@ static c10::StreamId NPUStream_getStreamId(const LeakyStreamInternals* ptr) return makeStreamId(StreamIdType::DEFAULT, 0); } for (const auto p : c10::irange(kMaxStreamPriorities)) { - if (pointer_within(ptr, npu_streams[p][device_index])) { + if (pointer_within(ptr, npu_streams[device_index][p])) { return makeStreamId(StreamIdType(static_cast(StreamIdType::NORMAL) + p), - ptr - npu_streams[p][device_index].data()); + ptr - npu_streams[device_index][p].data()); } } if (pointer_within(ptr, sync_launch_streams[device_index])) { @@ -218,7 +218,7 @@ static void initGlobalStreamState() // Initializes default streams default_streams[device_id].device_index = device_id; for (const auto p : c10::irange(kMaxStreamPriorities)) { - npu_counters[p][device_id] = 0; + npu_counters[device_id][p] = 0; } auto& default_streamsi = default_streams[device_id]; NPU_CHECK_ERROR( @@ -240,7 +240,7 @@ static void initDeviceStreamState(c10::DeviceIndex device_index, int p) NPUGuard device_guard{device_index}; static int StreamsPerPool = GetStreamsPerPool(); for (auto i = decltype(StreamsPerPool){0}; i < StreamsPerPool; ++i) { - auto& npu_streami = npu_streams[p][device_index][i]; + auto& npu_streami = npu_streams[device_index][p][i]; npu_streami.device_index = device_index; @@ -315,7 +315,7 @@ LeakyStreamInternals* NPUStream_internals(NPUStream s) return &default_streams[device_index]; case StreamIdType::NORMAL: case StreamIdType::HIGH: - return &npu_streams[static_cast(st) - static_cast(StreamIdType::NORMAL)][device_index][si]; + return &npu_streams[device_index][static_cast(st) - static_cast(StreamIdType::NORMAL)][si]; case StreamIdType::SECONDARY: return &secondary_streams[device_index]; case StreamIdType::SYNCLAUNCH: @@ -387,8 +387,8 @@ NPUStream getStreamFromPool(const int priority, c10::DeviceIndex device_index) // 
Initializes the stream pools (once) std::call_once( device_priority_flags[device_index][pri_idx], initDeviceStreamState, device_index, pri_idx); - const auto idx = get_idx(npu_counters[pri_idx][device_index]); - return NPUStream_fromInternals(&npu_streams[pri_idx][device_index][idx]); + const auto idx = get_idx(npu_counters[device_index][pri_idx]); + return NPUStream_fromInternals(&npu_streams[device_index][pri_idx][idx]); } NPUStream getNPUStreamFromPool(c10::DeviceIndex device_index) @@ -641,9 +641,9 @@ void recovery_all_npu_streams(c10::DeviceIndex device_index) NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&secondary_streamsi.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); static int StreamsPerPool = GetStreamsPerPool(); - for (auto i = decltype(StreamsPerPool){0}; i < StreamsPerPool; ++i) { - for (const auto p : c10::irange(kMaxStreamPriorities)) { - auto& npu_streami = npu_streams[p][device_index][i]; + for (const auto p : c10::irange(kMaxStreamPriorities)) { + for (auto i = decltype(StreamsPerPool){0}; i < StreamsPerPool; ++i) { + auto& npu_streami = npu_streams[device_index][p][i]; if (npu_streami.stream == nullptr) { continue; }