MTLSharedEvent for synchronization take 2#690
Conversation
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/lib/mtl/capture.jl b/lib/mtl/capture.jl
index 7d28793f..9f29bdd3 100644
--- a/lib/mtl/capture.jl
+++ b/lib/mtl/capture.jl
@@ -12,13 +12,13 @@ Use [`beginScope()`](@ref) and [`endScope()`](@ref) to set the boundaries for a
"""
MTLCaptureScope
-function MTLCaptureScope(queue::MTLDevice, manager=MTLCaptureManager())
+function MTLCaptureScope(queue::MTLDevice, manager = MTLCaptureManager())
handle = @objc [manager::id{MTLCaptureManager} newCaptureScopeWithDevice:queue::id{MTLDevice}]::id{MTLCaptureScope}
- MTLCaptureScope(handle)
+ return MTLCaptureScope(handle)
end
-function MTLCaptureScope(queue::MTLCommandQueue, manager=MTLCaptureManager())
+function MTLCaptureScope(queue::MTLCommandQueue, manager = MTLCaptureManager())
handle = @objc [manager::id{MTLCaptureManager} newCaptureScopeWithCommandQueue:queue::id{MTLCommandQueue}]::id{MTLCaptureScope}
- MTLCaptureScope(handle)
+ return MTLCaptureScope(handle)
end
# @objcwrapper MTLCaptureScope <: NSObject
@@ -68,7 +68,8 @@ function MTLCaptureDescriptor()
end
# TODO: Add capture state
-function MTLCaptureDescriptor(obj::Union{MTLDevice, MTLCommandQueue, MTLCaptureScope},
+function MTLCaptureDescriptor(
+ obj::Union{MTLDevice, MTLCommandQueue, MTLCaptureScope},
destination::MTLCaptureDestination;
folder::String=nothing)
desc = MTLCaptureDescriptor()
@@ -119,7 +120,8 @@ end
Start GPU frame capture using the default capture object and specifying capture descriptor parameters directly.
"""
-function startCapture(obj::Union{MTLDevice, MTLCommandQueue, MTLCaptureScope},
+function startCapture(
+ obj::Union{MTLDevice, MTLCommandQueue, MTLCaptureScope},
destination::MTLCaptureDestination=MTLCaptureDestinationGPUTraceDocument;
folder::String=nothing)
if destination == MTLCaptureDestinationGPUTraceDocument && folder === nothing
diff --git a/lib/mtl/events.jl b/lib/mtl/events.jl
index 374f4285..8d2345ed 100644
--- a/lib/mtl/events.jl
+++ b/lib/mtl/events.jl
@@ -29,9 +29,11 @@ function MTLSharedEvent(dev::MTLDevice)
return obj
end
-function waitUntilSignaledValue(ev::MTLSharedEvent, value, timeoutMS=typemax(UInt64))
- @objc [ev::id{MTLSharedEvent} waitUntilSignaledValue:value::UInt64
- timeoutMS:timeoutMS::UInt64]::Bool
+function waitUntilSignaledValue(ev::MTLSharedEvent, value, timeoutMS = typemax(UInt64))
+ return @objc [
+ ev::id{MTLSharedEvent} waitUntilSignaledValue:value::UInt64
+ timeoutMS:timeoutMS::UInt64
+ ]::Bool
end
## shared event handle
diff --git a/src/state.jl b/src/state.jl
index ffa5f53d..918731c4 100644
--- a/src/state.jl
+++ b/src/state.jl
@@ -61,7 +61,7 @@ end
Return the `MTLSharedEvent` used to synchronize a queue
"""
function queue_event(queue::MTLCommandQueue)
- get!(task_local_storage(), (:MTLSharedEvent, queue)) do
+ return get!(task_local_storage(), (:MTLSharedEvent, queue)) do
MTLSharedEvent(queue.device)
end::MTLSharedEvent
end
diff --git a/test/capturing.jl b/test/capturing.jl
index 9b1da7f5..7dee3cae 100644
--- a/test/capturing.jl
+++ b/test/capturing.jl
@@ -36,9 +36,9 @@ desc.captureObject = dev
desc.destination = MTL.MTLCaptureDestinationGPUTraceDocument
@test desc.destination == MTL.MTLCaptureDestinationGPUTraceDocument
-# Capture Manager supports destination
-@test !supports_destination(manager, MTL.MTLCaptureDestinationDeveloperTools)
-@test supports_destination(manager, MTL.MTLCaptureDestinationGPUTraceDocument)
+ # Capture Manager supports destination
+ @test !supports_destination(manager, MTL.MTLCaptureDestinationDeveloperTools)
+ @test supports_destination(manager, MTL.MTLCaptureDestinationGPUTraceDocument)
# Output URL
@test desc.outputURL === nothing
@@ -69,7 +69,7 @@ bufferA = MtlArray{Float32,1,SharedStorage}(undef, tuple(4))
@test manager.isCapturing == false
startCapture(manager, desc)
@test manager.isCapturing
-@test_throws "Capture manager is already capturing." startCapture(manager, desc)
+ @test_throws "Capture manager is already capturing." startCapture(manager, desc)
Metal.@sync @metal threads=4 tester(bufferA)
stopCapture(manager)
@test manager.isCapturing == false |
f695de3 to
7804676
Compare
There was a problem hiding this comment.
Metal Benchmarks
Details
| Benchmark suite | Current: 04e52b9 | Previous: 822abde | Ratio |
|---|---|---|---|
latency/precompile |
25029149041 ns |
24961495792 ns |
1.00 |
latency/ttfp |
2129357833 ns |
2126570334 ns |
1.00 |
latency/import |
1223676125 ns |
1221863250 ns |
1.00 |
integration/metaldevrt |
836250 ns |
920646 ns |
0.91 |
integration/byval/slices=1 |
1544333 ns |
1636875 ns |
0.94 |
integration/byval/slices=3 |
9111916 ns |
8533229 ns |
1.07 |
integration/byval/reference |
1533291.5 ns |
1625125 ns |
0.94 |
integration/byval/slices=2 |
2568958 ns |
2708917 ns |
0.95 |
kernel/indexing |
568167 ns |
674520.5 ns |
0.84 |
kernel/indexing_checked |
572750 ns |
692979.5 ns |
0.83 |
kernel/launch |
12084 ns |
12292 ns |
0.98 |
kernel/rand |
552417 ns |
623458 ns |
0.89 |
array/construct |
6291 ns |
6250 ns |
1.01 |
array/broadcast |
593666.5 ns |
667000 ns |
0.89 |
array/random/randn/Float32 |
794125 ns |
822375 ns |
0.97 |
array/random/randn!/Float32 |
605666.5 ns |
627417 ns |
0.97 |
array/random/rand!/Int64 |
549958 ns |
555042 ns |
0.99 |
array/random/rand!/Float32 |
464000 ns |
588291 ns |
0.79 |
array/random/rand/Int64 |
774208 ns |
806208 ns |
0.96 |
array/random/rand/Float32 |
645208 ns |
649083 ns |
0.99 |
array/accumulate/Int64/1d |
1255834 ns |
1351708 ns |
0.93 |
array/accumulate/Int64/dims=1 |
1815625 ns |
1895167 ns |
0.96 |
array/accumulate/Int64/dims=2 |
2141166.5 ns |
2224583 ns |
0.96 |
array/accumulate/Int64/dims=1L |
11691792 ns |
11807458 ns |
0.99 |
array/accumulate/Int64/dims=2L |
9648791.5 ns |
9845792 ns |
0.98 |
array/accumulate/Float32/1d |
1100917 ns |
1211020.5 ns |
0.91 |
array/accumulate/Float32/dims=1 |
1542875 ns |
1623125 ns |
0.95 |
array/accumulate/Float32/dims=2 |
1858625 ns |
1913375 ns |
0.97 |
array/accumulate/Float32/dims=1L |
9777021 ns |
10039875 ns |
0.97 |
array/accumulate/Float32/dims=2L |
7226166 ns |
7298229.5 ns |
0.99 |
array/reductions/reduce/Int64/1d |
1280729.5 ns |
1341854 ns |
0.95 |
array/reductions/reduce/Int64/dims=1 |
1078083 ns |
1141520.5 ns |
0.94 |
array/reductions/reduce/Int64/dims=2 |
1176666 ns |
1277145.5 ns |
0.92 |
array/reductions/reduce/Int64/dims=1L |
2020708 ns |
2034687.5 ns |
0.99 |
array/reductions/reduce/Int64/dims=2L |
3404792 ns |
3549500 ns |
0.96 |
array/reductions/reduce/Float32/1d |
964958 ns |
1067125 ns |
0.90 |
array/reductions/reduce/Float32/dims=1 |
815000 ns |
883229.5 ns |
0.92 |
array/reductions/reduce/Float32/dims=2 |
751083 ns |
810979.5 ns |
0.93 |
array/reductions/reduce/Float32/dims=1L |
1330021 ns |
1373208.5 ns |
0.97 |
array/reductions/reduce/Float32/dims=2L |
1755500 ns |
1888250 ns |
0.93 |
array/reductions/mapreduce/Int64/1d |
1310875 ns |
1375438 ns |
0.95 |
array/reductions/mapreduce/Int64/dims=1 |
1079792 ns |
1176500 ns |
0.92 |
array/reductions/mapreduce/Int64/dims=2 |
1189562 ns |
1285062.5 ns |
0.93 |
array/reductions/mapreduce/Int64/dims=1L |
1904562.5 ns |
2087041 ns |
0.91 |
array/reductions/mapreduce/Int64/dims=2L |
3304000 ns |
3445167 ns |
0.96 |
array/reductions/mapreduce/Float32/1d |
985334 ns |
1065917 ns |
0.92 |
array/reductions/mapreduce/Float32/dims=1 |
802875 ns |
892125 ns |
0.90 |
array/reductions/mapreduce/Float32/dims=2 |
758625 ns |
809583 ns |
0.94 |
array/reductions/mapreduce/Float32/dims=1L |
1331812.5 ns |
1359000 ns |
0.98 |
array/reductions/mapreduce/Float32/dims=2L |
1775792 ns |
1898271 ns |
0.94 |
array/private/copyto!/gpu_to_gpu |
634333 ns |
654166 ns |
0.97 |
array/private/copyto!/cpu_to_gpu |
802708 ns |
826625 ns |
0.97 |
array/private/copyto!/gpu_to_cpu |
784458 ns |
813104.5 ns |
0.96 |
array/private/iteration/findall/int |
1571125 ns |
1668417 ns |
0.94 |
array/private/iteration/findall/bool |
1425333.5 ns |
1504500 ns |
0.95 |
array/private/iteration/findfirst/int |
1800541.5 ns |
1934521 ns |
0.93 |
array/private/iteration/findfirst/bool |
1652708 ns |
1855687 ns |
0.89 |
array/private/iteration/scalar |
4217125 ns |
4774709 ns |
0.88 |
array/private/iteration/logical |
2525375 ns |
2574750 ns |
0.98 |
array/private/iteration/findmin/1d |
1875812.5 ns |
2064500 ns |
0.91 |
array/private/iteration/findmin/2d |
1518667 ns |
1624500 ns |
0.93 |
array/private/copy |
568292 ns |
550458 ns |
1.03 |
array/shared/copyto!/gpu_to_gpu |
85125 ns |
83333 ns |
1.02 |
array/shared/copyto!/cpu_to_gpu |
82459 ns |
94500 ns |
0.87 |
array/shared/copyto!/gpu_to_cpu |
82167 ns |
82958 ns |
0.99 |
array/shared/iteration/findall/int |
1576021 ns |
1643125 ns |
0.96 |
array/shared/iteration/findall/bool |
1426479 ns |
1516125 ns |
0.94 |
array/shared/iteration/findfirst/int |
1325084 ns |
1457083 ns |
0.91 |
array/shared/iteration/findfirst/bool |
1308916.5 ns |
1419167 ns |
0.92 |
array/shared/iteration/scalar |
210584 ns |
159229.5 ns |
1.32 |
array/shared/iteration/logical |
2399542 ns |
2361416 ns |
1.02 |
array/shared/iteration/findmin/1d |
1392937.5 ns |
1620625 ns |
0.86 |
array/shared/iteration/findmin/2d |
1518708 ns |
1622208 ns |
0.94 |
array/shared/copy |
253458.5 ns |
245750 ns |
1.03 |
array/permutedims/4d |
2343333.5 ns |
2439396 ns |
0.96 |
array/permutedims/2d |
1146875 ns |
1226959 ns |
0.93 |
array/permutedims/3d |
1666333.5 ns |
1746270.5 ns |
0.95 |
metal/synchronization/stream |
18833 ns |
14833 ns |
1.27 |
metal/synchronization/context |
20042 ns |
15334 ns |
1.31 |
This comment was automatically generated by workflow using github-action-benchmark.
Codecov Report❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## main #690 +/- ##
==========================================
+ Coverage 80.89% 81.20% +0.31%
==========================================
Files 62 62
Lines 2790 2810 +20
==========================================
+ Hits 2257 2282 +25
+ Misses 533 528 -5 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
63af0ed to
2a7c978
Compare
|
I suspect The current spinlock is a potential option. Another is maybe ensuring we synchronize after @maleadt any thoughts? |
Seems like the cleaner solution, no? |
|
Unfortunately synchronizing doesn't work. With 14d3d0e locally, when it fails, it usually takes ~90 ms for capture to stop. Should I just use a spinlock? |
|
Yeah, that seems reasonable. Do add some code comments about the reason though. |
- Default to capturing with MTLCaptureScope - Test improvments - Spinlock until capture is over in `stopCapture`
Add `waitUntilSignaledValue`
14d3d0e to
04e52b9
Compare
Not ready to merge as I haven't changed anything since #645 but leaving it up.
I'm considering maybe making it an experimental local preference that's off by default and then adding a special CI to try and catch failures.