From 7f1dfe9ab1cd11d54274db229b8c8958a9c0d049 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Tue, 20 Jan 2026 16:45:43 -0500
Subject: [PATCH 01/12] test: add Device test for zero sized allocation

---
 cuda_core/tests/test_device.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py
index e4365ac0c9..e6d6131e72 100644
--- a/cuda_core/tests/test_device.py
+++ b/cuda_core/tests/test_device.py
@@ -47,6 +47,17 @@ def test_device_alloc(deinit_cuda):
     assert buffer.device_id == int(device)
 
 
+def test_device_alloc_zero_bytes(deinit_cuda):
+    device = Device()
+    device.set_current()
+    buffer = device.allocate(0)
+    device.sync()
+    # TODO: should the handle be zero here?
+    assert buffer.handle != 0
+    assert buffer.size == 0
+    assert buffer.device_id == int(device)
+
+
 def test_device_id(deinit_cuda):
     for device in Device.get_all_devices():
         device.set_current()

From c78ecd756edbfd152c9c69815eebd8fb41bdccf6 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Tue, 20 Jan 2026 16:52:24 -0500
Subject: [PATCH 02/12] test: handle is zero

---
 cuda_core/tests/test_device.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py
index e6d6131e72..7094eee667 100644
--- a/cuda_core/tests/test_device.py
+++ b/cuda_core/tests/test_device.py
@@ -52,8 +52,7 @@ def test_device_alloc_zero_bytes(deinit_cuda):
     device.set_current()
     buffer = device.allocate(0)
     device.sync()
-    # TODO: should the handle be zero here?
-    assert buffer.handle != 0
+    assert buffer.handle == 0
     assert buffer.size == 0
     assert buffer.device_id == int(device)
 

From d74e01d43e53cc2defd8ab9e65f9c324b8f7eeb6 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Wed, 21 Jan 2026 07:49:46 -0500
Subject: [PATCH 03/12] test: fix handle assertion

---
 cuda_core/tests/test_device.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py
index 7094eee667..8726c65a30 100644
--- a/cuda_core/tests/test_device.py
+++ b/cuda_core/tests/test_device.py
@@ -52,7 +52,7 @@ def test_device_alloc_zero_bytes(deinit_cuda):
     device.set_current()
     buffer = device.allocate(0)
     device.sync()
-    assert buffer.handle == 0
+    assert buffer.handle >= 0
     assert buffer.size == 0
     assert buffer.device_id == int(device)
 

From c73796de4628ec7e938b883a53db95dac748be43 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Wed, 21 Jan 2026 07:50:05 -0500
Subject: [PATCH 04/12] fix: ensure that zero sized allocations work

---
 cuda_core/cuda/core/_cpp/resource_handles.cpp | 54 +++++++++++++++++--
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index 724ea97169..35ee68ae26 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -455,7 +455,19 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) no
 
 DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) {
     GILReleaseGuard gil;
-    CUdeviceptr ptr;
+    CUdeviceptr ptr = 0;
+
+    if (size == 0) {
+        auto box = std::shared_ptr<DevicePtrBox>(
+            new DevicePtrBox{ptr, h_stream},
+            [h_pool](DevicePtrBox* b) {
+                GILReleaseGuard gil;
+                delete b;
+            }
+        );
+        return DevicePtrHandle(box, &box->resource);
+    }
+
     if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, as_cu(h_stream)))) {
         return {};
     }
@@ -473,7 +485,19 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool,
 
 DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
     GILReleaseGuard gil;
-    CUdeviceptr ptr;
+    CUdeviceptr ptr = 0;
+
+    if (size == 0) {
+        auto box = std::shared_ptr<DevicePtrBox>(
+            new DevicePtrBox{ptr, h_stream},
+            [](DevicePtrBox* b) {
+                GILReleaseGuard gil;
+                delete b;
+            }
+        );
+        return DevicePtrHandle(box, &box->resource);
+    }
+
     if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, as_cu(h_stream)))) {
         return {};
     }
@@ -492,6 +516,18 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
 
 DevicePtrHandle deviceptr_alloc(size_t size) {
     GILReleaseGuard gil;
     CUdeviceptr ptr;
+
+    if (size == 0) {
+        auto box = std::shared_ptr<DevicePtrBox>(
+            new DevicePtrBox{ptr, h_stream},
+            [](DevicePtrBox* b) {
+                GILReleaseGuard gil;
+                delete b;
+            }
+        );
+        return DevicePtrHandle(box, &box->resource);
+    }
+
     if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) {
         return {};
     }
@@ -509,7 +545,19 @@ DevicePtrHandle deviceptr_alloc(size_t size) {
 
 DevicePtrHandle deviceptr_alloc_host(size_t size) {
     GILReleaseGuard gil;
-    void* ptr;
+    void* ptr = nullptr;
+
+    if (size == 0) {
+        auto box = std::shared_ptr<DevicePtrBox>(
+            new DevicePtrBox{reinterpret_cast<CUdeviceptr>(ptr), StreamHandle{}},
+            [](DevicePtrBox* b) {
+                GILReleaseGuard gil;
+                delete b;
+            }
+        );
+        return DevicePtrHandle(box, &box->resource);
+    }
+
     if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) {
         return {};
     }

From 40463d8600fb8f6809df3f7c4b52e604376c18d8 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Wed, 21 Jan 2026 07:52:34 -0500
Subject: [PATCH 05/12] chore: zero out last device ptr

---
 cuda_core/cuda/core/_cpp/resource_handles.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index 35ee68ae26..8f5e93c5fc 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -515,7 +515,7 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
 
 DevicePtrHandle deviceptr_alloc(size_t size) {
     GILReleaseGuard gil;
-    CUdeviceptr ptr;
+    CUdeviceptr ptr = 0;
 
     if (size == 0) {
         auto box = std::shared_ptr<DevicePtrBox>(

From 324de1ccd1e78370007041736f517d00d7b983c1 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Wed, 21 Jan 2026 07:53:41 -0500
Subject: [PATCH 06/12] fix: h_stream

---
 cuda_core/cuda/core/_cpp/resource_handles.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index 8f5e93c5fc..ad78c8ba7d 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -519,7 +519,7 @@ DevicePtrHandle deviceptr_alloc(size_t size) {
 
     if (size == 0) {
         auto box = std::shared_ptr<DevicePtrBox>(
-            new DevicePtrBox{ptr, h_stream},
+            new DevicePtrBox{ptr, StreamHandle{}},
             [](DevicePtrBox* b) {
                 GILReleaseGuard gil;
                 delete b;

From e112fba712fc133a673d2e8d6262cfc33ca64665 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Wed, 21 Jan 2026 08:01:13 -0500
Subject: [PATCH 07/12] test: add graph memory resource test

---
 cuda_core/tests/test_graph_mem.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py
index 5159fd2b2b..bcb8a800a1 100644
--- a/cuda_core/tests/test_graph_mem.py
+++ b/cuda_core/tests/test_graph_mem.py
@@ -182,6 +182,23 @@ def test_graph_alloc_with_output(mempool_device, mode):
     assert compare_buffer_to_constant(out, 6)
 
 
+@pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
+def test_graph_mem_alloc_zero(mempool_device, mode):
+    device = mempool_device
+    gb = device.create_graph_builder().begin_building(mode)
+    stream = device.create_stream()
+    gmr = GraphMemoryResource(device)
+    buffer = gmr.allocate(0, stream=gb)
+    graph = gb.end_building().complete()
+    graph.upload(stream)
+    graph.launch(stream)
+    stream.sync()
+
+    assert buffer.handle >= 0
+    assert buffer.size == 0
+    assert buffer.device_id == int(device)
+
+
 @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
 def test_graph_mem_set_attributes(mempool_device, mode):
     device = mempool_device

From f48cd4d3ae063e94fb5b242b326d1030a2b22d4b Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Wed, 21 Jan 2026 08:44:10 -0500
Subject: [PATCH 08/12] chore: handle legacy allocators

---
 cuda_core/cuda/core/_memory/_legacy.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_legacy.py b/cuda_core/cuda/core/_memory/_legacy.py
index 9250819610..62b9072a2d 100644
--- a/cuda_core/cuda/core/_memory/_legacy.py
+++ b/cuda_core/cuda/core/_memory/_legacy.py
@@ -46,8 +46,11 @@ def allocate(self, size, stream=None) -> Buffer:
             from cuda.core._stream import default_stream
 
             stream = default_stream()
-        err, ptr = driver.cuMemAllocHost(size)
-        raise_if_driver_error(err)
+        if size:
+            err, ptr = driver.cuMemAllocHost(size)
+            raise_if_driver_error(err)
+        else:
+            ptr = 0
         return Buffer._init(ptr, size, self, stream)
 
     def deallocate(self, ptr: DevicePointerT, size, stream):
@@ -64,8 +67,10 @@ def deallocate(self, ptr: DevicePointerT, size, stream):
         """
         if stream is not None:
             stream.sync()
-        (err,) = driver.cuMemFreeHost(ptr)
-        raise_if_driver_error(err)
+
+        if size:
+            (err,) = driver.cuMemFreeHost(ptr)
+            raise_if_driver_error(err)
 
     @property
     def is_device_accessible(self) -> bool:
@@ -96,15 +101,19 @@ def allocate(self, size, stream=None) -> Buffer:
             from cuda.core._stream import default_stream
 
             stream = default_stream()
-        err, ptr = driver.cuMemAlloc(size)
-        raise_if_driver_error(err)
+        if size:
+            err, ptr = driver.cuMemAlloc(size)
+            raise_if_driver_error(err)
+        else:
+            ptr = 0
         return Buffer._init(ptr, size, self, stream)
 
     def deallocate(self, ptr, size, stream):
         if stream is not None:
             stream.sync()
-        (err,) = driver.cuMemFree(ptr)
-        raise_if_driver_error(err)
+        if size:
+            (err,) = driver.cuMemFree(ptr)
+            raise_if_driver_error(err)
 
     @property
     def is_device_accessible(self) -> bool:

From f7355d5a7fa69fb6a7868b13f9d8f1e00b50b472 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Wed, 21 Jan 2026 12:12:14 -0500
Subject: [PATCH 09/12] test: add zero byte alloc for other resources

---
 cuda_core/tests/test_memory.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 71adb4ffc7..ced9c2947b 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1240,3 +1240,28 @@ def test_graph_memory_resource_object(init_cuda):
     # These objects are interned.
     assert gmr1 is gmr2 is gmr3
     assert gmr1 == gmr2 == gmr3
+
+
+def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
+    MR, MROps = memory_resource_factory
+
+    device = Device()
+    device.set_current()
+
+    if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
+    elif MR is PinnedMemoryResource:
+        skip_if_pinned_memory_unsupported(device)
+        mr = MR()
+    elif MR is ManagedMemoryResource:
+        skip_if_managed_memory_unsupported(device)
+        mr = MR(MROps(preferred_location=device.device_id))
+    else:
+        assert MR in (DeviceMemoryResource, VirtualMemoryResource)
+        mr = MR(device)
+
+    buffer = mr.allocate(0)
+    device.sync()
+    assert buffer.handle >= 0
+    assert buffer.size == 0
+    assert buffer.device_id == mr.device_id

From 8aa0108621d0c4911029e04c2d11f9ed50100574 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Wed, 21 Jan 2026 12:13:18 -0500
Subject: [PATCH 10/12] chore: revert cpp changes

---
 cuda_core/cuda/core/_cpp/resource_handles.cpp | 56 ++-----------------
 1 file changed, 4 insertions(+), 52 deletions(-)

diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index ad78c8ba7d..724ea97169 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -455,19 +455,7 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) no
 
 DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) {
     GILReleaseGuard gil;
-    CUdeviceptr ptr = 0;
-
-    if (size == 0) {
-        auto box = std::shared_ptr<DevicePtrBox>(
-            new DevicePtrBox{ptr, h_stream},
-            [h_pool](DevicePtrBox* b) {
-                GILReleaseGuard gil;
-                delete b;
-            }
-        );
-        return DevicePtrHandle(box, &box->resource);
-    }
-
+    CUdeviceptr ptr;
     if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, as_cu(h_stream)))) {
         return {};
     }
@@ -473,19 +473,7 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool,
 
 DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
     GILReleaseGuard gil;
-    CUdeviceptr ptr = 0;
-
-    if (size == 0) {
-        auto box = std::shared_ptr<DevicePtrBox>(
-            new DevicePtrBox{ptr, h_stream},
-            [](DevicePtrBox* b) {
-                GILReleaseGuard gil;
-                delete b;
-            }
-        );
-        return DevicePtrHandle(box, &box->resource);
-    }
-
+    CUdeviceptr ptr;
     if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, as_cu(h_stream)))) {
         return {};
     }
@@ -515,19 +491,7 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
 
 DevicePtrHandle deviceptr_alloc(size_t size) {
     GILReleaseGuard gil;
-    CUdeviceptr ptr = 0;
-
-    if (size == 0) {
-        auto box = std::shared_ptr<DevicePtrBox>(
-            new DevicePtrBox{ptr, StreamHandle{}},
-            [](DevicePtrBox* b) {
-                GILReleaseGuard gil;
-                delete b;
-            }
-        );
-        return DevicePtrHandle(box, &box->resource);
-    }
-
+    CUdeviceptr ptr;
     if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) {
         return {};
     }
@@ -545,19 +509,7 @@ DevicePtrHandle deviceptr_alloc(size_t size) {
 
 DevicePtrHandle deviceptr_alloc_host(size_t size) {
     GILReleaseGuard gil;
-    void* ptr = nullptr;
-
-    if (size == 0) {
-        auto box = std::shared_ptr<DevicePtrBox>(
-            new DevicePtrBox{reinterpret_cast<CUdeviceptr>(ptr), StreamHandle{}},
-            [](DevicePtrBox* b) {
-                GILReleaseGuard gil;
-                delete b;
-            }
-        );
-        return DevicePtrHandle(box, &box->resource);
-    }
-
+    void* ptr;
     if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) {
         return {};
     }

From 891c8c1067b23321a8e7edb5501b370601b7968b Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Wed, 21 Jan 2026 12:16:07 -0500
Subject: [PATCH 11/12] chore: remove invalid check

---
 cuda_core/tests/test_memory.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index ced9c2947b..5732e25e89 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1257,7 +1257,7 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
         skip_if_managed_memory_unsupported(device)
         mr = MR(MROps(preferred_location=device.device_id))
     else:
-        assert MR in (DeviceMemoryResource, VirtualMemoryResource)
+        assert MR is DeviceMemoryResource
         mr = MR(device)
 
     buffer = mr.allocate(0)

From b3f7285f37b4c9d1191b317855a3f4ab60d2c9f3 Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Wed, 21 Jan 2026 12:43:58 -0500
Subject: [PATCH 12/12] chore: create managed memory resource helper

---
 cuda_core/tests/test_memory.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 5732e25e89..8851a4600a 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1255,7 +1255,7 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
         mr = MR()
     elif MR is ManagedMemoryResource:
         skip_if_managed_memory_unsupported(device)
-        mr = MR(MROps(preferred_location=device.device_id))
+        mr = create_managed_memory_resource_or_skip(MROps(preferred_location=device.device_id))
    else:
         assert MR is DeviceMemoryResource
         mr = MR(device)
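
Usage note: taken together, the series makes a zero-byte allocation return a valid Buffer whose size is 0 and whose handle may legitimately be 0, as the tests above assert. Below is a minimal sketch mirroring test_device_alloc_zero_bytes; the top-level import path is an assumption, since the patches only show tests and internal modules such as cuda.core._memory._legacy.

    # Sketch of the behavior exercised by test_device_alloc_zero_bytes above.
    # Assumption: Device is importable from the package top level; adjust the
    # import to match how your cuda.core version exposes it.
    from cuda.core import Device

    device = Device()
    device.set_current()

    buffer = device.allocate(0)  # zero-byte request; succeeds after this series
    device.sync()

    assert buffer.handle >= 0    # an empty allocation may report a handle of 0
    assert buffer.size == 0
    assert buffer.device_id == int(device)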