@@ -89,6 +89,16 @@ static const bool UseImmediateCommandLists = [] {
8989 return std::stoi (ImmediateFlag) > 0 ;
9090}();
9191
// This is an experimental option that allows the use of multiple command lists
// when submitting barriers. It is controlled by the
// SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS environment variable:
// a positive integer value enables the option, while an unset, zero, negative
// or non-numeric value leaves it disabled. The default is 0 (disabled).
static const bool UseMultipleCmdlistBarriers = [] {
  const char *UseMultipleCmdlistBarriersFlag =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS");
  if (!UseMultipleCmdlistBarriersFlag)
    return false;
  // Parse with std::strtol rather than std::stoi: std::stoi throws on
  // malformed input, and an exception escaping the initializer of a
  // namespace-scope static terminates the process. std::strtol reports
  // non-numeric text as 0, which simply keeps the option disabled.
  return std::strtol(UseMultipleCmdlistBarriersFlag, nullptr, 10) > 0;
}();
101+
92102// This class encapsulates actions taken along with a call to Level Zero API.
93103class ZeCall {
94104private:
@@ -1182,13 +1192,14 @@ pi_result resetCommandLists(pi_queue Queue) {
11821192}
11831193
11841194// Retrieve an available command list to be used in a PI call.
1185- pi_result
1186- _pi_context::getAvailableCommandList (pi_queue Queue,
1187- pi_command_list_ptr_t &CommandList,
1188- bool UseCopyEngine, bool AllowBatching) {
1195+ pi_result _pi_context::getAvailableCommandList (
1196+ pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine,
1197+ bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) {
11891198 // Immediate commandlists have been pre-allocated and are always available.
11901199 if (UseImmediateCommandLists) {
11911200 CommandList = Queue->getQueueGroup (UseCopyEngine).getImmCmdList ();
1201+ if (auto Res = Queue->insertActiveBarriers (CommandList, UseCopyEngine))
1202+ return Res;
11921203 return PI_SUCCESS;
11931204 }
11941205
@@ -1198,16 +1209,20 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
11981209 // First see if there is a command-list open for batching commands
11991210 // for this queue.
12001211 if (Queue->hasOpenCommandList (UseCopyEngine)) {
1201- if (AllowBatching) {
1212+ if (AllowBatching &&
1213+ (!ForcedCmdQueue ||
1214+ *ForcedCmdQueue == CommandBatch.OpenCommandList ->second .ZeQueue )) {
12021215 CommandList = CommandBatch.OpenCommandList ;
12031216 return PI_SUCCESS;
12041217 }
1205- // If this command isn't allowed to be batched, then we need to
1206- // go ahead and execute what is already in the batched list,
1207- // and then go on to process this. On exit from executeOpenCommandList
1208- // OpenCommandList will be invalidated.
1218+ // If this command isn't allowed to be batched or doesn't match the forced
1219+ // command queue, then we need to go ahead and execute what is already in
1220+ // the batched list, and then go on to process this. On exit from
1221+ // executeOpenCommandList OpenCommandList will be invalidated.
12091222 if (auto Res = Queue->executeOpenCommandList (UseCopyEngine))
12101223 return Res;
1224+ // Note that active barriers do not need to be inserted here as they will
1225+ // have been enqueued into the command-list when they were created.
12111226 }
12121227
12131228 // Create/Reuse the command list, because in Level Zero commands are added to
@@ -1231,10 +1246,13 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
12311246 : Queue->Context
12321247 ->ZeComputeCommandListCache [Queue->Device ->ZeDevice ];
12331248
1234- if (ZeCommandListCache.size () > 0 ) {
1235- auto &ZeCommandList = ZeCommandListCache.front ();
1249+ for (auto ZeCommandListIt = ZeCommandListCache.begin ();
1250+ ZeCommandListIt != ZeCommandListCache.end (); ++ZeCommandListIt) {
1251+ auto &ZeCommandList = *ZeCommandListIt;
12361252 auto it = Queue->CommandListMap .find (ZeCommandList);
12371253 if (it != Queue->CommandListMap .end ()) {
1254+ if (ForcedCmdQueue && *ForcedCmdQueue != it->second .ZeQueue )
1255+ continue ;
12381256 CommandList = it;
12391257 if (CommandList->second .ZeFence != nullptr )
12401258 CommandList->second .ZeFenceInUse = true ;
@@ -1243,9 +1261,13 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
12431261 // wasn't yet used in this queue then create a new entry in this
12441262 // queue's map to hold the fence and other associated command
12451263 // list information.
1264+ auto &QGroup = Queue->getQueueGroup (UseCopyEngine);
12461265 uint32_t QueueGroupOrdinal;
1247- auto &ZeCommandQueue =
1248- Queue->getQueueGroup (UseCopyEngine).getZeQueue (&QueueGroupOrdinal);
1266+ auto &ZeCommandQueue = ForcedCmdQueue
1267+ ? *ForcedCmdQueue
1268+ : QGroup.getZeQueue (&QueueGroupOrdinal);
1269+ if (ForcedCmdQueue)
1270+ QueueGroupOrdinal = QGroup.getCmdQueueOrdinal (ZeCommandQueue);
12491271
12501272 ze_fence_handle_t ZeFence;
12511273 ZE_CALL (zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
@@ -1256,7 +1278,9 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
12561278 QueueGroupOrdinal})
12571279 .first ;
12581280 }
1259- ZeCommandListCache.pop_front ();
1281+ ZeCommandListCache.erase (ZeCommandListIt);
1282+ if (auto Res = Queue->insertActiveBarriers (CommandList, UseCopyEngine))
1283+ return Res;
12601284 return PI_SUCCESS;
12611285 }
12621286 }
@@ -1293,9 +1317,12 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
12931317 ze_command_list_handle_t ZeCommandList;
12941318 ze_fence_handle_t ZeFence;
12951319
1320+ auto &QGroup = Queue->getQueueGroup (UseCopyEngine);
12961321 uint32_t QueueGroupOrdinal;
12971322 auto &ZeCommandQueue =
1298- Queue->getQueueGroup (UseCopyEngine).getZeQueue (&QueueGroupOrdinal);
1323+ ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue (&QueueGroupOrdinal);
1324+ if (ForcedCmdQueue)
1325+ QueueGroupOrdinal = QGroup.getCmdQueueOrdinal (ZeCommandQueue);
12991326
13001327 ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
13011328 ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
@@ -1308,6 +1335,8 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
13081335 std::tie (CommandList, std::ignore) = Queue->CommandListMap .insert (
13091336 std::pair<ze_command_list_handle_t , pi_command_list_info_t >(
13101337 ZeCommandList, {ZeFence, true , ZeCommandQueue, QueueGroupOrdinal}));
1338+ if (auto Res = Queue->insertActiveBarriers (CommandList, UseCopyEngine))
1339+ return Res;
13111340 pi_result = PI_SUCCESS;
13121341
13131342 return pi_result;
@@ -1585,6 +1614,18 @@ uint32_t _pi_queue::pi_queue_group_t::getQueueIndex(uint32_t *QueueGroupOrdinal,
15851614 return CurrentIndex;
15861615}
15871616
1617+ int32_t _pi_queue::pi_queue_group_t::getCmdQueueOrdinal (
1618+ ze_command_queue_handle_t CmdQueue) {
1619+ // Find out the right queue group ordinal (first queue might be "main" or
1620+ // "link")
1621+ auto QueueType = Type;
1622+ if (QueueType != queue_type::Compute)
1623+ QueueType = (ZeQueues[0 ] == CmdQueue && Queue->Device ->hasMainCopyEngine ())
1624+ ? queue_type::MainCopy
1625+ : queue_type::LinkCopy;
1626+ return Queue->Device ->QueueGroup [QueueType].ZeOrdinal ;
1627+ }
1628+
15881629// This function will return one of possibly multiple available native
15891630// queues and the value of the queue group ordinal.
15901631ze_command_queue_handle_t &
@@ -1697,6 +1738,36 @@ pi_command_list_ptr_t _pi_queue::eventOpenCommandList(pi_event Event) {
16971738 return CommandListMap.end ();
16981739}
16991740
1741+ pi_result _pi_queue::insertActiveBarriers (pi_command_list_ptr_t &CmdList,
1742+ bool UseCopyEngine) {
1743+ // Early exit if there are no active barriers.
1744+ if (ActiveBarriers.empty ())
1745+ return PI_SUCCESS;
1746+
1747+ // Create a wait-list and retain events. This will filter out finished events.
1748+ _pi_ze_event_list_t ActiveBarriersWaitList;
1749+ if (auto Res = ActiveBarriersWaitList.createAndRetainPiZeEventList (
1750+ ActiveBarriers.size (), ActiveBarriers.data (), this , UseCopyEngine))
1751+ return Res;
1752+
1753+ // We can now release all the active barriers and replace them with the ones
1754+ // in the wait list.
1755+ for (pi_event &BarrierEvent : ActiveBarriers)
1756+ PI_CALL (piEventRelease (BarrierEvent));
1757+ ActiveBarriers.clear ();
1758+ ActiveBarriers.insert (
1759+ ActiveBarriers.end (), ActiveBarriersWaitList.PiEventList ,
1760+ ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length );
1761+
1762+ // If there are more active barriers, insert a barrier on the command-list. We
1763+ // do not need an event for finishing so we pass nullptr.
1764+ if (!ActiveBarriers.empty ())
1765+ ZE_CALL (zeCommandListAppendBarrier,
1766+ (CmdList->first , nullptr , ActiveBarriersWaitList.Length ,
1767+ ActiveBarriersWaitList.ZeEventList ));
1768+ return PI_SUCCESS;
1769+ }
1770+
17001771pi_result _pi_queue::executeOpenCommandList (bool IsCopy) {
17011772 auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
17021773 // If there are any commands still in the open command list for this
@@ -6013,35 +6084,154 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue,
60136084 // Lock automatically releases when this goes out of scope.
60146085 std::scoped_lock lock (Queue->Mutex );
60156086
6016- bool UseCopyEngine = false ;
6017-
6018- _pi_ze_event_list_t TmpWaitList;
6019- if (auto Res = TmpWaitList.createAndRetainPiZeEventList (
6020- NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
6021- return Res;
6087+ // Helper function for appending a barrier to a command list.
6088+ auto insertBarrierIntoCmdList =
6089+ [&Queue](pi_command_list_ptr_t CmdList,
6090+ const _pi_ze_event_list_t &EventWaitList, pi_event &Event) {
6091+ if (auto Res = createEventAndAssociateQueue (
6092+ Queue, &Event, PI_COMMAND_TYPE_USER, CmdList))
6093+ return Res;
6094+ Event->WaitList = EventWaitList;
6095+ ZE_CALL (zeCommandListAppendBarrier,
6096+ (CmdList->first , Event->ZeEvent , EventWaitList.Length ,
6097+ EventWaitList.ZeEventList ));
6098+ return PI_SUCCESS;
6099+ };
60226100
6023- // Get a new command list to be used on this call
6101+ // Indicator for whether batching is allowed. This may be changed later in
6102+ // this function, but allow it by default.
60246103 bool OkToBatch = true ;
6025- pi_command_list_ptr_t CommandList{};
6026- if (auto Res = Queue->Context ->getAvailableCommandList (
6027- Queue, CommandList, UseCopyEngine, OkToBatch))
6028- return Res;
60296104
6030- ze_event_handle_t ZeEvent = nullptr ;
6031- auto Res = createEventAndAssociateQueue (Queue, Event, PI_COMMAND_TYPE_USER,
6032- CommandList);
6033- if (Res != PI_SUCCESS)
6034- return Res;
6035- ZeEvent = (*Event)->ZeEvent ;
6036- (*Event)->WaitList = TmpWaitList;
6105+ // If we have a list of events to make the barrier from, then we can create a
6106+ // barrier on these and use the resulting event as our future barrier.
6107+ // We use the same approach if
6108+ // SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a
6109+ // positive value.
6110+ if (NumEventsInWaitList || !UseMultipleCmdlistBarriers) {
6111+ // Retain the events as they will be owned by the result event.
6112+ _pi_ze_event_list_t TmpWaitList;
6113+ if (auto Res = TmpWaitList.createAndRetainPiZeEventList (
6114+ NumEventsInWaitList, EventWaitList, Queue,
6115+ /* UseCopyEngine=*/ false ))
6116+ return Res;
60376117
6038- ZE_CALL (zeCommandListAppendBarrier,
6039- (CommandList->first , ZeEvent, (*Event)->WaitList .Length ,
6040- (*Event)->WaitList .ZeEventList ));
6118+ // Get an arbitrary command-list in the queue.
6119+ pi_command_list_ptr_t CmdList;
6120+ if (auto Res = Queue->Context ->getAvailableCommandList (
6121+ Queue, CmdList,
6122+ /* UseCopyEngine=*/ false , OkToBatch))
6123+ return Res;
60416124
6042- // Execute command list asynchronously as the event will be used
6043- // to track down its completion.
6044- return Queue->executeCommandList (CommandList, false , OkToBatch);
6125+ // Insert the barrier into the command-list and execute.
6126+ if (auto Res = insertBarrierIntoCmdList (CmdList, TmpWaitList, *Event))
6127+ return Res;
6128+ if (auto Res = Queue->executeCommandList (CmdList, false , OkToBatch))
6129+ return Res;
6130+
6131+ if (UseMultipleCmdlistBarriers) {
6132+ // Retain and save the resulting event for future commands.
6133+ PI_CALL (piEventRetain (*Event));
6134+ Queue->ActiveBarriers .push_back (*Event);
6135+ }
6136+ return PI_SUCCESS;
6137+ }
6138+
6139+ // Since there are no events to explicitly create a barrier for, we are
6140+ // inserting a queue-wide barrier. As such, the barrier will also encapsulate
6141+ // the active barriers, so we can release and clear the active barriers list.
6142+ // Doing it early prevents potential additional barriers from implicitly being
6143+ // appended.
6144+ for (pi_event &E : Queue->ActiveBarriers )
6145+ PI_CALL (piEventRelease (E));
6146+ Queue->ActiveBarriers .clear ();
6147+
6148+ // Get command lists for each command queue.
6149+ std::vector<pi_command_list_ptr_t > CmdLists;
6150+ if (UseImmediateCommandLists) {
6151+ // If immediate command lists are being used, each will act as their own
6152+ // queue, so we must insert a barrier into each.
6153+ CmdLists.reserve (Queue->CommandListMap .size ());
6154+ for (auto It = Queue->CommandListMap .begin ();
6155+ It != Queue->CommandListMap .end (); ++It)
6156+ CmdLists.push_back (It);
6157+ } else if (Queue->ComputeQueueGroup .ZeQueues .empty () &&
6158+ Queue->CopyQueueGroup .ZeQueues .empty ()) {
6159+ // If there are no queues, we get any available command list.
6160+ pi_command_list_ptr_t CmdList;
6161+ if (auto Res = Queue->Context ->getAvailableCommandList (
6162+ Queue, CmdList,
6163+ /* UseCopyEngine=*/ false , OkToBatch))
6164+ return Res;
6165+ CmdLists.push_back (CmdList);
6166+ } else {
6167+ size_t NumQueues = Queue->ComputeQueueGroup .ZeQueues .size () +
6168+ Queue->CopyQueueGroup .ZeQueues .size ();
6169+ // Only allow batching if there is only a single queue as otherwise the
6170+ // following availability command list lookups will prematurely push
6171+ // open batch command lists out.
6172+ OkToBatch = NumQueues == 1 ;
6173+ // Get an available command list tied to each command queue. We need these
6174+ // so a queue-wide barrier can be inserted into each command queue.
6175+ CmdLists.reserve (NumQueues);
6176+ for (auto QueueGroup : {Queue->ComputeQueueGroup , Queue->CopyQueueGroup }) {
6177+ bool UseCopyEngine = QueueGroup.Type != _pi_queue::queue_type::Compute;
6178+ for (ze_command_queue_handle_t ZeQueue : QueueGroup.ZeQueues ) {
6179+ pi_command_list_ptr_t CmdList;
6180+ if (auto Res = Queue->Context ->getAvailableCommandList (
6181+ Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue))
6182+ return Res;
6183+ CmdLists.push_back (CmdList);
6184+ }
6185+ }
6186+ }
6187+
6188+ // Insert a barrier into each unique command queue using the available
6189+ // command-lists.
6190+ std::vector<pi_event> EventWaitVector (CmdLists.size ());
6191+ for (size_t I = 0 ; I < CmdLists.size (); ++I)
6192+ if (auto Res = insertBarrierIntoCmdList (CmdLists[I], _pi_ze_event_list_t {},
6193+ EventWaitVector[I]))
6194+ return Res;
6195+
6196+ if (CmdLists.size () > 1 ) {
6197+ // If there were multiple queues we need to create a "convergence" event to
6198+ // be our active barrier. This convergence event is signalled by a barrier
6199+ // on all the events from the barriers we have inserted into each queue.
6200+ // Use the first command list as our convergence command list.
6201+ pi_command_list_ptr_t &ConvergenceCmdList = CmdLists[0 ];
6202+
6203+ // Create an event list. It will take ownership over all relevant events so
6204+ // we relinquish ownership and let it keep all events it needs.
6205+ _pi_ze_event_list_t BaseWaitList;
6206+ if (auto Res = BaseWaitList.createAndRetainPiZeEventList (
6207+ EventWaitVector.size (), EventWaitVector.data (), Queue,
6208+ ConvergenceCmdList->second .isCopy (Queue)))
6209+ return Res;
6210+ for (pi_event &E : EventWaitVector)
6211+ PI_CALL (piEventRelease (E));
6212+
6213+ // Insert a barrier with the events from each command-queue into the
6214+ // convergence command list. The resulting event signals the convergence of
6215+ // all barriers.
6216+ if (auto Res =
6217+ insertBarrierIntoCmdList (ConvergenceCmdList, BaseWaitList, *Event))
6218+ return Res;
6219+ } else {
6220+ // If there is only a single queue we have inserted all the barriers we need
6221+ // and the single result event can be used as our active barrier and used as
6222+ // the return event.
6223+ *Event = EventWaitVector[0 ];
6224+ }
6225+
6226+ // Execute each command list so the barriers can be encountered.
6227+ for (pi_command_list_ptr_t &CmdList : CmdLists)
6228+ if (auto Res = Queue->executeCommandList (CmdList, false , OkToBatch))
6229+ return Res;
6230+
6231+ // We must keep the event internally to use if new command lists are created.
6232+ PI_CALL (piEventRetain (*Event));
6233+ Queue->ActiveBarriers .push_back (*Event);
6234+ return PI_SUCCESS;
60456235}
60466236
60476237pi_result piEnqueueMemBufferRead (pi_queue Queue, pi_mem Src,
@@ -6134,6 +6324,13 @@ pi_result _pi_queue::synchronize() {
61346324 if (ZeQueue)
61356325 ZE_CALL (zeHostSynchronize, (ZeQueue));
61366326 }
6327+
6328+ // With the entire queue synchronized, the active barriers must be done so we
6329+ // can remove them.
6330+ for (pi_event &BarrierEvent : ActiveBarriers)
6331+ PI_CALL (piEventRelease (BarrierEvent));
6332+ ActiveBarriers.clear ();
6333+
61376334 return PI_SUCCESS;
61386335}
61396336
0 commit comments