Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions cuda_core/cuda/core/_cpp/resource_handles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ decltype(&cuMemFreeHost) p_cuMemFreeHost = nullptr;

decltype(&cuMemPoolImportPointer) p_cuMemPoolImportPointer = nullptr;

// Driver entry points for the CUDA library API, held as function pointers so
// the handle factories below can call them indirectly. Each one starts out null.
// NOTE(review): presumably these are resolved at runtime elsewhere in this file
// before any library/kernel handle is created - confirm against the loader code.
decltype(&cuLibraryLoadFromFile) p_cuLibraryLoadFromFile = nullptr;
decltype(&cuLibraryLoadData) p_cuLibraryLoadData = nullptr;
decltype(&cuLibraryUnload) p_cuLibraryUnload = nullptr;
decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr;

// ============================================================================
// GIL management helpers
// ============================================================================
Expand Down Expand Up @@ -682,4 +687,81 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
}
}

// ============================================================================
// Library Handles
// ============================================================================

namespace {
// Heap-allocated holder for a CUlibrary value.
// The library handle factories in this file return a LibraryHandle that points
// at `resource` while sharing ownership of the whole box, so the box (and its
// deleter, if any) decides what happens when the last handle goes away.
struct LibraryBox {
// Raw driver library handle exposed through LibraryHandle.
CUlibrary resource;
};
} // namespace

// Load a CUDA library from the file at `path` and wrap it in an owning handle.
//
// A GILReleaseGuard is held for the duration of the call. The driver status is
// stored in `err`; if it is not CUDA_SUCCESS an empty handle is returned.
// On success the returned handle points at the CUlibrary kept inside a
// LibraryBox whose deleter calls cuLibraryUnload and then frees the box.
LibraryHandle create_library_handle_from_file(const char* path) {
    GILReleaseGuard gil;

    CUlibrary loaded;
    // No JIT options and no library options are supplied to the driver.
    err = p_cuLibraryLoadFromFile(&loaded, path, nullptr, nullptr, 0, nullptr, nullptr, 0);
    if (err != CUDA_SUCCESS) {
        return {};
    }

    // Runs when the last handle referencing the box is released.
    const auto unload_and_free = [](const LibraryBox* owned) {
        GILReleaseGuard gil;
        p_cuLibraryUnload(owned->resource);
        delete owned;
    };

    std::shared_ptr<const LibraryBox> holder(new LibraryBox{loaded}, unload_and_free);
    return LibraryHandle(holder, &holder->resource);
}

// Load a CUDA library from the in-memory image at `data` and wrap it in an
// owning handle.
//
// A GILReleaseGuard is held for the duration of the call. The driver status is
// stored in `err`; if it is not CUDA_SUCCESS an empty handle is returned.
// On success the returned handle points at the CUlibrary kept inside a
// LibraryBox whose deleter calls cuLibraryUnload and then frees the box.
LibraryHandle create_library_handle_from_data(const void* data) {
    GILReleaseGuard gil;

    CUlibrary loaded;
    // No JIT options and no library options are supplied to the driver.
    err = p_cuLibraryLoadData(&loaded, data, nullptr, nullptr, 0, nullptr, nullptr, 0);
    if (err != CUDA_SUCCESS) {
        return {};
    }

    // Runs when the last handle referencing the box is released.
    const auto unload_and_free = [](const LibraryBox* owned) {
        GILReleaseGuard gil;
        p_cuLibraryUnload(owned->resource);
        delete owned;
    };

    std::shared_ptr<const LibraryBox> holder(new LibraryBox{loaded}, unload_and_free);
    return LibraryHandle(holder, &holder->resource);
}

// Wrap an already-loaded CUlibrary without taking ownership of it.
// The box is created with the default deleter, so releasing the last handle
// frees only the box and never calls cuLibraryUnload.
LibraryHandle create_library_handle_ref(CUlibrary library) {
    const auto holder = std::make_shared<const LibraryBox>(LibraryBox{library});
    const CUlibrary* raw_slot = &holder->resource;
    return LibraryHandle(holder, raw_slot);
}

// ============================================================================
// Kernel Handles
// ============================================================================

namespace {
// Heap-allocated holder for a CUkernel together with the library handle it was
// obtained from. KernelHandle values point at `resource` while sharing
// ownership of the whole box, so `h_library` is released only when the last
// kernel handle is.
struct KernelBox {
// Raw driver kernel handle exposed through KernelHandle.
CUkernel resource;
LibraryHandle h_library; // Keeps library alive
};
} // namespace

// Look up the kernel called `name` in the library behind `h_library`.
//
// Returns an empty handle on failure, with the status recorded in `err`:
//   - CUDA_ERROR_INVALID_HANDLE when `h_library` is empty (no driver call is made);
//   - otherwise the status reported by cuLibraryGetKernel.
// On success the result is built by create_kernel_handle_ref, which stores a
// copy of `h_library` alongside the kernel so the library stays referenced for
// as long as the kernel handle does.
KernelHandle create_kernel_handle(LibraryHandle h_library, const char* name) {
    // Dereferencing an empty shared_ptr is undefined behaviour, so reject an
    // empty library handle before touching it.
    if (!h_library) {
        err = CUDA_ERROR_INVALID_HANDLE;
        return {};
    }

    GILReleaseGuard gil;
    CUkernel kernel;
    if (CUDA_SUCCESS != (err = p_cuLibraryGetKernel(&kernel, *h_library, name))) {
        return {};
    }

    return create_kernel_handle_ref(kernel, h_library);
}

// Wrap an existing CUkernel together with the library handle it depends on.
// The box uses the default deleter: nothing is destroyed through the driver
// when the handle is released, only the box and its copy of `h_library`.
KernelHandle create_kernel_handle_ref(CUkernel kernel, LibraryHandle h_library) {
    const auto holder = std::make_shared<const KernelBox>(KernelBox{kernel, h_library});
    const CUkernel* raw_slot = &holder->resource;
    return KernelHandle(holder, raw_slot);
}

} // namespace cuda_core
66 changes: 66 additions & 0 deletions cuda_core/cuda/core/_cpp/resource_handles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ extern decltype(&cuMemFreeHost) p_cuMemFreeHost;

extern decltype(&cuMemPoolImportPointer) p_cuMemPoolImportPointer;

// Library
// Function pointers to the driver's library API (load from file, load from
// memory, unload, kernel lookup). Declared here; the definitions live in
// resource_handles.cpp, where each pointer is initialised to nullptr.
extern decltype(&cuLibraryLoadFromFile) p_cuLibraryLoadFromFile;
extern decltype(&cuLibraryLoadData) p_cuLibraryLoadData;
extern decltype(&cuLibraryUnload) p_cuLibraryUnload;
extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel;

// ============================================================================
// Handle type aliases - expose only the raw CUDA resource
// ============================================================================
Expand All @@ -69,6 +75,8 @@ using ContextHandle = std::shared_ptr<const CUcontext>;
using StreamHandle = std::shared_ptr<const CUstream>;
using EventHandle = std::shared_ptr<const CUevent>;
using MemoryPoolHandle = std::shared_ptr<const CUmemoryPool>;
// Library and kernel handles: shared ownership of a const raw driver handle.
// An empty shared_ptr represents "no library" / "no kernel".
using LibraryHandle = std::shared_ptr<const CUlibrary>;
using KernelHandle = std::shared_ptr<const CUkernel>;

// ============================================================================
// Context handle functions
Expand Down Expand Up @@ -218,6 +226,40 @@ StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept;
// Set the deallocation stream for a device pointer handle.
void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept;

// ============================================================================
// Library handle functions
// ============================================================================

// Create an owning library handle by loading from a file path.
// `path` is passed to cuLibraryLoadFromFile with no JIT or library options.
// When the last reference is released, cuLibraryUnload is called automatically.
// Returns empty handle on error (caller must check).
LibraryHandle create_library_handle_from_file(const char* path);

// Create an owning library handle by loading from memory data.
// `data` is passed to cuLibraryLoadData with no JIT or library options.
// The driver makes an internal copy of the data; caller can free it after return.
// When the last reference is released, cuLibraryUnload is called automatically.
// Returns empty handle on error (caller must check).
LibraryHandle create_library_handle_from_data(const void* data);

// Create a non-owning library handle (references existing library).
// Use for borrowed libraries (e.g., from foreign code).
// The library will NOT be unloaded when the handle is released; only the small
// internal box that stores the CUlibrary value is freed.
LibraryHandle create_library_handle_ref(CUlibrary library);

// ============================================================================
// Kernel handle functions
// ============================================================================

// Get a kernel from a library by name.
// `h_library` should be a non-empty handle; `name` is passed to
// cuLibraryGetKernel unchanged.
// The kernel structurally depends on the provided library handle.
// Kernels have no explicit destroy - their lifetime is tied to the library.
// Returns empty handle on error (caller must check).
KernelHandle create_kernel_handle(LibraryHandle h_library, const char* name);

// Create a non-owning kernel handle with library dependency.
// Use for borrowed kernels. The library handle keeps the library alive:
// a copy of `h_library` is stored next to the kernel and is released together
// with the last reference to the returned kernel handle.
KernelHandle create_kernel_handle_ref(CUkernel kernel, LibraryHandle h_library);

// ============================================================================
// Overloaded helper functions to extract raw resources from handles
// ============================================================================
Expand All @@ -243,6 +285,14 @@ inline CUdeviceptr as_cu(const DevicePtrHandle& h) noexcept {
return h ? *h : 0;
}

// Extract the raw CUlibrary from a handle; an empty handle yields nullptr.
inline CUlibrary as_cu(const LibraryHandle& h) noexcept {
    if (!h) {
        return nullptr;
    }
    return *h;
}

// Extract the raw CUkernel from a handle; an empty handle yields nullptr.
inline CUkernel as_cu(const KernelHandle& h) noexcept {
    if (!h) {
        return nullptr;
    }
    return *h;
}

// as_intptr() - extract handle as intptr_t for Python interop
// Using signed intptr_t per C standard convention and issue #1342
inline std::intptr_t as_intptr(const ContextHandle& h) noexcept {
Expand All @@ -265,6 +315,14 @@ inline std::intptr_t as_intptr(const DevicePtrHandle& h) noexcept {
return static_cast<std::intptr_t>(as_cu(h));
}

inline std::intptr_t as_intptr(const LibraryHandle& h) noexcept {
return reinterpret_cast<std::intptr_t>(as_cu(h));
}

inline std::intptr_t as_intptr(const KernelHandle& h) noexcept {
return reinterpret_cast<std::intptr_t>(as_cu(h));
}

// as_py() - convert handle to Python driver wrapper object (returns new reference)
namespace detail {
// n.b. class lookup is not cached to avoid deadlock hazard, see DESIGN.md
Expand Down Expand Up @@ -300,4 +358,12 @@ inline PyObject* as_py(const DevicePtrHandle& h) noexcept {
return detail::make_py("CUdeviceptr", as_intptr(h));
}

// Build the Python-side "CUlibrary" wrapper object for this handle's raw value.
inline PyObject* as_py(const LibraryHandle& h) noexcept {
    const std::intptr_t raw_value = as_intptr(h);
    return detail::make_py("CUlibrary", raw_value);
}

// Build the Python-side "CUkernel" wrapper object for this handle's raw value.
inline PyObject* as_py(const KernelHandle& h) noexcept {
    const std::intptr_t raw_value = as_intptr(h);
    return detail::make_py("CUkernel", raw_value);
}

} // namespace cuda_core
5 changes: 3 additions & 2 deletions cuda_core/cuda/core/_launcher.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver

from cuda.core._launch_config cimport LaunchConfig
from cuda.core._kernel_arg_handler cimport ParamHolder
from cuda.core._module cimport Kernel
from cuda.core._resource_handles cimport as_cu
from cuda.core._stream cimport Stream_accept, Stream
from cuda.core._utils.cuda_utils cimport (
Expand Down Expand Up @@ -77,11 +78,11 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern
cdef ParamHolder ker_args = ParamHolder(kernel_args)
cdef void** args_ptr = <void**><uintptr_t>(ker_args.ptr)

# TODO: cythonize Module/Kernel/...
# Note: We now use CUkernel handles exclusively (CUDA 12+), but they can be cast to
# CUfunction for use with cuLaunchKernel, as both handle types are interchangeable
# for kernel launch purposes.
cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>(<uintptr_t>(kernel._handle))
cdef Kernel ker = <Kernel>kernel
cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>as_cu(ker._h_kernel)

# Note: CUkernel can still be launched via cuLaunchKernel (not just cuLaunchKernelEx).
# We check both binding & driver versions here mainly to see if the "Ex" API is
Expand Down
10 changes: 5 additions & 5 deletions cuda_core/cuda/core/_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,29 +444,29 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None):
self._add_code_object(code)

def _add_code_object(self, object_code: ObjectCode):
data = object_code._module
data = object_code.code
with _exception_manager(self):
name_str = f"{object_code.name}"
if _nvjitlink and isinstance(data, bytes):
_nvjitlink.add_data(
self._mnff.handle,
self._input_type_from_code_type(object_code._code_type),
self._input_type_from_code_type(object_code.code_type),
data,
len(data),
name_str,
)
elif _nvjitlink and isinstance(data, str):
_nvjitlink.add_file(
self._mnff.handle,
self._input_type_from_code_type(object_code._code_type),
self._input_type_from_code_type(object_code.code_type),
data,
)
elif (not _nvjitlink) and isinstance(data, bytes):
name_bytes = name_str.encode()
handle_return(
_driver.cuLinkAddData(
self._mnff.handle,
self._input_type_from_code_type(object_code._code_type),
self._input_type_from_code_type(object_code.code_type),
data,
len(data),
name_bytes,
Expand All @@ -481,7 +481,7 @@ def _add_code_object(self, object_code: ObjectCode):
handle_return(
_driver.cuLinkAddFile(
self._mnff.handle,
self._input_type_from_code_type(object_code._code_type),
self._input_type_from_code_type(object_code.code_type),
data.encode(),
0,
None,
Expand Down
11 changes: 10 additions & 1 deletion cuda_core/cuda/core/_memory/_memory_pool.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,16 @@ cdef class _MemPool(MemoryResource):
IPCDataForMR _ipc_data
object _attributes
object _peer_accessible_by
object __weakref__


# Attribute accessor for a memory pool. It holds its own MemoryPoolHandle
# rather than a reference back to the owning memory resource.
cdef class _MemPoolAttributes:
cdef:
# Shared handle to the pool whose attributes are queried.
MemoryPoolHandle _h_pool

# Internal factory: builds an instance without going through __init__ and
# stores `h_pool` on it.
@staticmethod
cdef _MemPoolAttributes _init(MemoryPoolHandle h_pool)

# Queries attribute `attr_enum` of the pool via cuMemPoolGetAttribute, writing
# the result through `value`. Returns 0 on success; `except? -1` means a -1
# return may indicate that a Python exception was raised.
cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except? -1


cdef class _MemPoolOptions:
Expand Down
21 changes: 7 additions & 14 deletions cuda_core/cuda/core/_memory/_memory_pool.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ from cuda.core._utils.cuda_utils cimport (

from typing import TYPE_CHECKING
import platform # no-cython-lint
import weakref

from cuda.core._utils.cuda_utils import driver

Expand All @@ -50,16 +49,15 @@ cdef class _MemPoolOptions:


cdef class _MemPoolAttributes:
cdef:
object _mr_weakref
"""Provides access to memory pool attributes."""

def __init__(self, *args, **kwargs):
raise RuntimeError("_MemPoolAttributes cannot be instantiated directly. Please use MemoryResource APIs.")

@classmethod
def _init(cls, mr):
cdef _MemPoolAttributes self = _MemPoolAttributes.__new__(cls)
self._mr_weakref = mr
@staticmethod
cdef _MemPoolAttributes _init(MemoryPoolHandle h_pool):
cdef _MemPoolAttributes self = _MemPoolAttributes.__new__(_MemPoolAttributes)
self._h_pool = h_pool
return self

def __repr__(self):
Expand All @@ -69,12 +67,8 @@ cdef class _MemPoolAttributes:
)

cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except?-1:
cdef _MemPool mr = <_MemPool>(self._mr_weakref())
if mr is None:
raise RuntimeError("_MemPool is expired")
cdef cydriver.CUmemoryPool pool_handle = as_cu(mr._h_pool)
with nogil:
HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, value))
HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(as_cu(self._h_pool), attr_enum, value))
return 0

@property
Expand Down Expand Up @@ -202,8 +196,7 @@ cdef class _MemPool(MemoryResource):
def attributes(self) -> _MemPoolAttributes:
"""Memory pool attributes."""
if self._attributes is None:
ref = weakref.ref(self)
self._attributes = _MemPoolAttributes._init(ref)
self._attributes = _MemPoolAttributes._init(self._h_pool)
return self._attributes

@property
Expand Down
54 changes: 54 additions & 0 deletions cuda_core/cuda/core/_module.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from cuda.bindings cimport cydriver
from cuda.core._resource_handles cimport LibraryHandle, KernelHandle

# Forward declarations, so the class definitions below can refer to each other
# regardless of the order in which they appear (e.g. Kernel holds
# KernelAttributes and KernelOccupancy members).
cdef class ObjectCode
cdef class Kernel
cdef class KernelOccupancy
cdef class KernelAttributes


# C-level layout of Kernel: a kernel handle plus two lazily created helper
# objects.
cdef class Kernel:
cdef:
# Shared handle to the underlying CUkernel; the launcher reads it through
# as_cu() to obtain the function to launch.
KernelHandle _h_kernel
KernelAttributes _attributes # lazy
KernelOccupancy _occupancy # lazy

# Internal factory taking an existing kernel handle.
# NOTE(review): presumably returns a Kernel wrapping `h_kernel`; the
# implementation is not visible here - confirm in _module.pyx.
@staticmethod
cdef Kernel _from_obj(KernelHandle h_kernel)

# `param_info=*` declares an optional argument whose default value is given in
# the implementation file.
cdef tuple _get_arguments_info(self, bint param_info=*)


# C-level layout of ObjectCode: the code object itself plus the library handle
# associated with it.
cdef class ObjectCode:
cdef:
# Shared handle to the CUlibrary for this code object.
# NOTE(review): presumably empty until _lazy_load_module() runs - confirm.
LibraryHandle _h_library
str _code_type
object _module # bytes/str source
dict _sym_map
str _name

# Returns an int status; `except -1` means a return value of -1 always signals
# that a Python exception was raised.
cdef int _lazy_load_module(self) except -1


# C-level layout of KernelOccupancy: it keeps its own kernel handle.
cdef class KernelOccupancy:
cdef:
# Shared handle to the kernel this object refers to.
KernelHandle _h_kernel

# Internal factory taking the kernel handle to store.
# NOTE(review): implementation not visible here - confirm in _module.pyx.
@staticmethod
cdef KernelOccupancy _init(KernelHandle h_kernel)


# C-level layout of KernelAttributes: a kernel handle plus a dict used as a
# cache.
cdef class KernelAttributes:
cdef:
# Shared handle to the kernel whose attributes are read.
KernelHandle _h_kernel
# NOTE(review): presumably stores attribute values already fetched, keyed by
# device and attribute - confirm against the implementation.
dict _cache

# Internal factory taking the kernel handle to store.
@staticmethod
cdef KernelAttributes _init(KernelHandle h_kernel)

# Both helpers return an int; `except? -1` means a -1 return may indicate that
# a Python exception was raised, so -1 can also be a legitimate result.
cdef int _get_cached_attribute(self, int device_id, cydriver.CUfunction_attribute attribute) except? -1
cdef int _resolve_device_id(self, device_id) except? -1
Loading
Loading