From c581bec8173e2ccd242b791fe3f47e33b567de77 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 13:39:46 -0800 Subject: [PATCH 01/10] Cythonize _module.py (Phase 2a: cdef classes) Convert Kernel, ObjectCode, and KernelOccupancy to cdef classes with proper .pxd declarations. This phase establishes the Cython structure while maintaining Python driver module usage. Changes: - Rename _module.py to _module.pyx - Create _module.pxd with cdef class declarations - Convert Kernel, ObjectCode, KernelOccupancy to cdef class - Remove _backend dict in favor of direct driver calls - Add _init_py() Python-accessible factory for ObjectCode - Update _program.py and _linker.py to use _init_py() - Fix test to handle cdef class property descriptors Phase 2b will convert driver calls to cydriver with nogil blocks. Phase 2c will add RAII handles to resource_handles. --- cuda_core/cuda/core/_linker.py | 2 +- cuda_core/cuda/core/_module.pxd | 42 +++++ .../cuda/core/{_module.py => _module.pyx} | 152 +++++++++--------- cuda_core/cuda/core/_program.py | 6 +- cuda_core/tests/test_module.py | 7 +- 5 files changed, 126 insertions(+), 83 deletions(-) create mode 100644 cuda_core/cuda/core/_module.pxd rename cuda_core/cuda/core/{_module.py => _module.pyx} (87%) diff --git a/cuda_core/cuda/core/_linker.py b/cuda_core/cuda/core/_linker.py index 1f6f221a39..4fc6955d57 100644 --- a/cuda_core/cuda/core/_linker.py +++ b/cuda_core/cuda/core/_linker.py @@ -529,7 +529,7 @@ def link(self, target_type) -> ObjectCode: addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) code = (ctypes.c_char * size).from_address(addr) - return ObjectCode._init(bytes(code), target_type, name=self._options.name) + return ObjectCode._init_py(bytes(code), target_type, name=self._options.name) def get_error_log(self) -> str: """Get the error log generated by the linker. diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd new file mode 100644 index 0000000000..4b8688bf2f --- /dev/null +++ b/cuda_core/cuda/core/_module.pxd @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +cdef class ObjectCode +cdef class Kernel +cdef class KernelOccupancy + + +cdef class Kernel: + cdef public: + object _handle # CUkernel (will become KernelHandle in phase 2c) + object _module # ObjectCode reference + object _attributes # KernelAttributes (lazy) + object _occupancy # KernelOccupancy (lazy) + + cdef object __weakref__ # Enable weak references + + @staticmethod + cdef Kernel _from_obj(object obj, ObjectCode mod) + + +cdef class ObjectCode: + cdef public: + object _handle # CUlibrary (will become LibraryHandle in phase 2c) + str _code_type + object _module # bytes/str source + dict _sym_map + str _name + + @staticmethod + cdef ObjectCode _init(object module, str code_type, str name=*, dict symbol_mapping=*) + + cdef int _lazy_load_module(self) except -1 + + +cdef class KernelOccupancy: + cdef public: + object _handle # CUkernel reference + + @staticmethod + cdef KernelOccupancy _init(object handle) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.pyx similarity index 87% rename from cuda_core/cuda/core/_module.py rename to cuda_core/cuda/core/_module.pyx index dd3f4494d5..0e8400f7bd 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.pyx @@ -2,11 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import functools import threading import weakref from collections import namedtuple -from typing import Union from cuda.core._device import Device from cuda.core._launch_config import LaunchConfig, _to_native_launch_config @@ -16,7 +17,9 @@ assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) -from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return, precondition +from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return + +__all__ = ["Kernel", "ObjectCode"] # Lazy initialization state and synchronization # For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization. @@ -27,7 +30,7 @@ _py_minor_ver = None _driver_ver = None _kernel_ctypes = None -_backend = {} +_paraminfo_supported = False def _lazy_init(): @@ -53,19 +56,12 @@ def _lazy_init(): if _inited: return - global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _backend + global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _paraminfo_supported # binding availability depends on cuda-python version _py_major_ver, _py_minor_ver = get_binding_version() - _backend = { - "file": driver.cuLibraryLoadFromFile, - "data": driver.cuLibraryLoadData, - "kernel": driver.cuLibraryGetKernel, - "attribute": driver.cuKernelGetAttribute, - } _kernel_ctypes = (driver.CUkernel,) _driver_ver = handle_return(driver.cuDriverGetVersion()) - if _driver_ver >= 12040: - _backend["paraminfo"] = driver.cuKernelGetParamInfo + _paraminfo_supported = _driver_ver >= 12040 # Mark as initialized (must be last to ensure all state is set) _inited = True @@ -96,6 +92,12 @@ def _get_kernel_ctypes(): return _kernel_ctypes +def _is_paraminfo_supported(): + """Return True if cuKernelGetParamInfo is available (driver >= 12.4).""" + _lazy_init() + return _paraminfo_supported + + @functools.cache def _is_cukernel_get_library_supported() -> bool: """Return True when cuKernelGetLibrary is available for inverse kernel-to-library lookup. 
@@ -115,20 +117,19 @@ def _make_dummy_library_handle(): class KernelAttributes: - def __new__(self, *args, **kwargs): - raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.") + """Provides access to kernel attributes. Uses weakref to avoid preventing Kernel GC.""" - slots = ("_kernel", "_cache", "_loader") + __slots__ = ("_kernel", "_cache") + + def __new__(cls, *args, **kwargs): + raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.") @classmethod def _init(cls, kernel): - self = super().__new__(cls) + self = object.__new__(cls) self._kernel = weakref.ref(kernel) self._cache = {} - - # Ensure backend is initialized before setting loader _lazy_init() - self._loader = _backend return self def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfunction_attribute) -> int: @@ -141,7 +142,7 @@ def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfun kernel = self._kernel() if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id)) + result = handle_return(driver.cuKernelGetAttribute(attribute, kernel._handle, device_id)) self._cache[cache_key] = result return result @@ -245,21 +246,18 @@ def cluster_scheduling_policy_preference(self, device_id: Device | int = None) - MaxPotentialBlockSizeOccupancyResult = namedtuple("MaxPotential", ("min_grid_size", "max_block_size")) -class KernelOccupancy: +cdef class KernelOccupancy: """This class offers methods to query occupancy metrics that help determine optimal launch parameters such as block size, grid size, and shared memory usage. """ - def __new__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.") - slots = ("_handle",) - - @classmethod - def _init(cls, handle): - self = super().__new__(cls) + @staticmethod + cdef KernelOccupancy _init(object handle): + cdef KernelOccupancy self = KernelOccupancy.__new__(KernelOccupancy) self._handle = handle - return self def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int: @@ -292,7 +290,7 @@ def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_m ) def max_potential_block_size( - self, dynamic_shared_memory_needed: Union[int, driver.CUoccupancyB2DSize], block_size_limit: int + self, dynamic_shared_memory_needed: int | driver.CUoccupancyB2DSize, block_size_limit: int ) -> MaxPotentialBlockSizeOccupancyResult: """MaxPotentialBlockSizeOccupancyResult: Suggested launch configuration for reasonable occupancy. @@ -411,7 +409,7 @@ def max_active_clusters(self, config: LaunchConfig, stream: Stream | None = None ParamInfo = namedtuple("ParamInfo", ["offset", "size"]) -class Kernel: +cdef class Kernel: """Represent a compiled kernel that had been loaded onto the device. Kernel instances can execution when passed directly into the @@ -422,16 +420,13 @@ class Kernel: """ - __slots__ = ("_handle", "_module", "_attributes", "_occupancy", "__weakref__") - - def __new__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError("Kernel objects cannot be instantiated directly. 
Please use ObjectCode APIs.") - @classmethod - def _from_obj(cls, obj, mod): + @staticmethod + cdef Kernel _from_obj(object obj, ObjectCode mod): assert_type(obj, _get_kernel_ctypes()) - assert_type(mod, ObjectCode) - ker = super().__new__(cls) + cdef Kernel ker = Kernel.__new__(Kernel) ker._handle = obj ker._module = mod ker._attributes = None @@ -446,8 +441,7 @@ def attributes(self) -> KernelAttributes: return self._attributes def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: - attr_impl = self.attributes - if "paraminfo" not in attr_impl._loader: + if not _is_paraminfo_supported(): driver_ver = _get_driver_ver() raise NotImplementedError( "Driver version 12.4 or newer is required for this function. " @@ -456,7 +450,7 @@ def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: arg_pos = 0 param_info_data = [] while True: - result = attr_impl._loader["paraminfo"](self._handle, arg_pos) + result = driver.cuKernelGetParamInfo(self._handle, arg_pos) if result[0] != driver.CUresult.CUDA_SUCCESS: break if param_info: @@ -487,7 +481,7 @@ def occupancy(self) -> KernelOccupancy: return self._occupancy @staticmethod - def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": + def from_handle(handle: int, mod: ObjectCode = None) -> Kernel: """Creates a new :obj:`Kernel` object from a foreign kernel handle. Uses a CUkernel pointer address to create a new :obj:`Kernel` object. @@ -515,7 +509,7 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": # For CUkernel, we can (optionally) inverse-lookup the owning CUlibrary via # cuKernelGetLibrary (added in CUDA 12.5). If the API is not available, we fall # back to a non-null dummy handle purely to disable lazy loading. - mod = ObjectCode._init(b"", "cubin") + mod = ObjectCode._init(b"", "cubin", "", None) if _is_cukernel_get_library_supported(): try: mod._handle = handle_return(driver.cuKernelGetLibrary(kernel_obj)) @@ -528,10 +522,12 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": return Kernel._from_obj(kernel_obj, mod) -CodeTypeT = Union[bytes, bytearray, str] +CodeTypeT = bytes | bytearray | str + +_supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") -class ObjectCode: +cdef class ObjectCode: """Represent a compiled program to be loaded onto the device. This object provides a unified interface for different types of @@ -545,26 +541,20 @@ class ObjectCode: :class:`~cuda.core.Program` """ - __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map", "_name") - _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") - - def __new__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError( "ObjectCode objects cannot be instantiated directly. " "Please use ObjectCode APIs (from_cubin, from_ptx) or Program APIs (compile)." 
) - @classmethod - def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): - self = super().__new__(cls) - assert code_type in self._supported_code_type, f"{code_type=} is not supported" + @staticmethod + cdef ObjectCode _init(object module, str code_type, str name = "", dict symbol_mapping = None): + assert code_type in _supported_code_type, f"{code_type=} is not supported" + cdef ObjectCode self = ObjectCode.__new__(ObjectCode) # handle is assigned during _lazy_load self._handle = None - - # Ensure backend is initialized before setting loader _lazy_init() - self._loader = _backend self._code_type = code_type self._module = module @@ -574,15 +564,20 @@ def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None return self @classmethod - def _reduce_helper(self, module, code_type, name, symbol_mapping): + def _init_py(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): + """Python-accessible factory method for use by _program.py and _linker.py.""" + return ObjectCode._init(module, code_type, name if name else "", symbol_mapping) + + @classmethod + def _reduce_helper(cls, module, code_type, name, symbol_mapping): # just for forwarding kwargs - return ObjectCode._init(module, code_type, name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, code_type, name if name else "", symbol_mapping) def __reduce__(self): return ObjectCode._reduce_helper, (self._module, self._code_type, self._name, self._sym_map) @staticmethod - def from_cubin(module: Union[bytes, str], *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode": + def from_cubin(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: """Create an :class:`ObjectCode` instance from an existing cubin. Parameters @@ -597,10 +592,10 @@ def from_cubin(module: Union[bytes, str], *, name: str = "", symbol_mapping: dic should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "cubin", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "cubin", name, symbol_mapping) @staticmethod - def from_ptx(module: Union[bytes, str], *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode": + def from_ptx(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: """Create an :class:`ObjectCode` instance from an existing PTX. Parameters @@ -615,10 +610,10 @@ def from_ptx(module: Union[bytes, str], *, name: str = "", symbol_mapping: dict should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "ptx", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "ptx", name, symbol_mapping) @staticmethod - def from_ltoir(module: Union[bytes, str], *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode": + def from_ltoir(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: """Create an :class:`ObjectCode` instance from an existing LTOIR. Parameters @@ -633,10 +628,10 @@ def from_ltoir(module: Union[bytes, str], *, name: str = "", symbol_mapping: dic should be mapped to the mangled names before trying to retrieve them (default to no mappings). 
""" - return ObjectCode._init(module, "ltoir", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "ltoir", name, symbol_mapping) @staticmethod - def from_fatbin(module: Union[bytes, str], *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode": + def from_fatbin(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: """Create an :class:`ObjectCode` instance from an existing fatbin. Parameters @@ -651,10 +646,10 @@ def from_fatbin(module: Union[bytes, str], *, name: str = "", symbol_mapping: di should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "fatbin", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "fatbin", name, symbol_mapping) @staticmethod - def from_object(module: Union[bytes, str], *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode": + def from_object(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: """Create an :class:`ObjectCode` instance from an existing object code. Parameters @@ -669,10 +664,10 @@ def from_object(module: Union[bytes, str], *, name: str = "", symbol_mapping: di should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "object", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "object", name, symbol_mapping) @staticmethod - def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode": + def from_library(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: """Create an :class:`ObjectCode` instance from an existing library. Parameters @@ -687,24 +682,24 @@ def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: d should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "library", name, symbol_mapping) # TODO: do we want to unload in a finalizer? Probably not.. - def _lazy_load_module(self, *args, **kwargs): + cdef int _lazy_load_module(self) except -1: if self._handle is not None: - return + return 0 module = self._module assert_type_str_or_bytes_like(module) if isinstance(module, str): - self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) - return + self._handle = handle_return(driver.cuLibraryLoadFromFile(module.encode(), [], [], 0, [], [], 0)) + return 0 if isinstance(module, (bytes, bytearray)): - self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) - return + self._handle = handle_return(driver.cuLibraryLoadData(module, [], [], 0, [], [], 0)) + return 0 raise_code_path_meant_to_be_unreachable() + return -1 - @precondition(_lazy_load_module) def get_kernel(self, name) -> Kernel: """Return the :obj:`~_module.Kernel` of a specified name from this object code. @@ -719,6 +714,7 @@ def get_kernel(self, name) -> Kernel: Newly created kernel object. 
""" + self._lazy_load_module() supported_code_types = ("cubin", "ptx", "fatbin") if self._code_type not in supported_code_types: raise RuntimeError(f'Unsupported code type "{self._code_type}" ({supported_code_types=})') @@ -727,7 +723,7 @@ def get_kernel(self, name) -> Kernel: except KeyError: name = name.encode() - data = handle_return(self._loader["kernel"](self._handle, name)) + data = handle_return(driver.cuLibraryGetKernel(self._handle, name)) return Kernel._from_obj(data, self) @property @@ -746,7 +742,6 @@ def code_type(self) -> str: return self._code_type @property - @precondition(_lazy_load_module) def handle(self): """Return the underlying handle object. @@ -755,4 +750,5 @@ def handle(self): This handle is a Python object. To get the memory address of the underlying C handle, call ``int(ObjectCode.handle)``. """ + self._lazy_load_module() return self._handle diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py index 121dd13963..f903a4e781 100644 --- a/cuda_core/cuda/core/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -690,7 +690,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): elif code_type == "ptx": assert_type(code, str) self._linker = Linker( - ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options) + ObjectCode._init_py(code.encode(), code_type), options=self._translate_program_options(options) ) self._backend = self._linker.backend @@ -808,7 +808,7 @@ def compile(self, target_type, name_expressions=(), logs=None): handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle) logs.write(log.decode("utf-8", errors="backslashreplace")) - return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name) + return ObjectCode._init_py(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name) elif self._backend == "NVVM": if target_type not in ("ptx", "ltoir"): @@ -834,7 +834,7 @@ def compile(self, target_type, name_expressions=(), logs=None): nvvm.get_program_log(self._mnff.handle, log) logs.write(log.decode("utf-8", errors="backslashreplace")) - return ObjectCode._init(data, target_type, name=self._options.name) + return ObjectCode._init_py(data, target_type, name=self._options.name) supported_backends = ("nvJitLink", "driver") if self._backend not in supported_backends: diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index f9bbcd3e4c..843fbf677f 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -231,7 +231,12 @@ def test_saxpy_arguments(get_saxpy_kernel_cubin, cuda12_4_prerequisite_check): _ = krn.num_arguments return - assert "ParamInfo" in str(type(krn).arguments_info.fget.__annotations__) + # Check that arguments_info returns ParamInfo objects (works for both Python and Cython classes) + # For Python classes: type(krn).arguments_info.fget.__annotations__ contains ParamInfo + # For Cython cdef classes: property descriptors don't have .fget, so we check the actual values + prop = type(krn).arguments_info + if hasattr(prop, "fget") and hasattr(prop.fget, "__annotations__"): + assert "ParamInfo" in str(prop.fget.__annotations__) arg_info = krn.arguments_info n_args = len(arg_info) assert n_args == krn.num_arguments From a7b81b54291c73ef60d187d4b3ca4d9d9410fa3f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 14:55:17 -0800 Subject: [PATCH 02/10] Phase 2a refinements: hide private attrs, add public properties - Use strong types in .pxd 
(ObjectCode, KernelOccupancy) - Remove cdef public - attributes now private to C level - Add Kernel.handle property for external access - Add ObjectCode.symbol_mapping property (symmetric with input) - Update _launcher.pyx, _linker.py, tests to use public APIs --- cuda_core/cuda/core/_launcher.pyx | 3 +-- cuda_core/cuda/core/_linker.py | 10 ++++---- cuda_core/cuda/core/_module.pxd | 15 ++++++----- cuda_core/cuda/core/_module.pyx | 18 ++++++++++++- cuda_core/tests/test_module.py | 42 +++++++++++++++---------------- cuda_core/tests/test_program.py | 4 +-- 6 files changed, 52 insertions(+), 40 deletions(-) diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index 9559f7697a..9066e72732 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -77,11 +77,10 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern cdef ParamHolder ker_args = ParamHolder(kernel_args) cdef void** args_ptr = (ker_args.ptr) - # TODO: cythonize Module/Kernel/... # Note: We now use CUkernel handles exclusively (CUDA 12+), but they can be cast to # CUfunction for use with cuLaunchKernel, as both handle types are interchangeable # for kernel launch purposes. - cdef cydriver.CUfunction func_handle = ((kernel._handle)) + cdef cydriver.CUfunction func_handle = ((kernel.handle)) # Note: CUkernel can still be launched via cuLaunchKernel (not just cuLaunchKernelEx). # We check both binding & driver versions here mainly to see if the "Ex" API is diff --git a/cuda_core/cuda/core/_linker.py b/cuda_core/cuda/core/_linker.py index 4fc6955d57..e621fe2c44 100644 --- a/cuda_core/cuda/core/_linker.py +++ b/cuda_core/cuda/core/_linker.py @@ -444,13 +444,13 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): self._add_code_object(code) def _add_code_object(self, object_code: ObjectCode): - data = object_code._module + data = object_code.code with _exception_manager(self): name_str = f"{object_code.name}" if _nvjitlink and isinstance(data, bytes): _nvjitlink.add_data( self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), + self._input_type_from_code_type(object_code.code_type), data, len(data), name_str, @@ -458,7 +458,7 @@ def _add_code_object(self, object_code: ObjectCode): elif _nvjitlink and isinstance(data, str): _nvjitlink.add_file( self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), + self._input_type_from_code_type(object_code.code_type), data, ) elif (not _nvjitlink) and isinstance(data, bytes): @@ -466,7 +466,7 @@ def _add_code_object(self, object_code: ObjectCode): handle_return( _driver.cuLinkAddData( self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), + self._input_type_from_code_type(object_code.code_type), data, len(data), name_bytes, @@ -481,7 +481,7 @@ def _add_code_object(self, object_code: ObjectCode): handle_return( _driver.cuLinkAddFile( self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), + self._input_type_from_code_type(object_code.code_type), data.encode(), 0, None, diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index 4b8688bf2f..b83e08ed47 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -8,20 +8,19 @@ cdef class KernelOccupancy cdef class Kernel: - cdef public: + cdef: object _handle # CUkernel (will become KernelHandle in phase 2c) - object _module # ObjectCode reference - object _attributes # KernelAttributes (lazy) - object 
_occupancy # KernelOccupancy (lazy) - - cdef object __weakref__ # Enable weak references + ObjectCode _module # ObjectCode reference + object _attributes # KernelAttributes (regular Python class) + KernelOccupancy _occupancy # KernelOccupancy (lazy) + object __weakref__ # Enable weak references @staticmethod cdef Kernel _from_obj(object obj, ObjectCode mod) cdef class ObjectCode: - cdef public: + cdef: object _handle # CUlibrary (will become LibraryHandle in phase 2c) str _code_type object _module # bytes/str source @@ -35,7 +34,7 @@ cdef class ObjectCode: cdef class KernelOccupancy: - cdef public: + cdef: object _handle # CUkernel reference @staticmethod diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 0e8400f7bd..6a04db7741 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -142,7 +142,7 @@ class KernelAttributes: kernel = self._kernel() if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - result = handle_return(driver.cuKernelGetAttribute(attribute, kernel._handle, device_id)) + result = handle_return(driver.cuKernelGetAttribute(attribute, kernel.handle, device_id)) self._cache[cache_key] = result return result @@ -480,6 +480,17 @@ cdef class Kernel: self._occupancy = KernelOccupancy._init(self._handle) return self._occupancy + @property + def handle(self): + """Return the underlying kernel handle object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Kernel.handle)``. + """ + return self._handle + @staticmethod def from_handle(handle: int, mod: ObjectCode = None) -> Kernel: """Creates a new :obj:`Kernel` object from a foreign kernel handle. @@ -741,6 +752,11 @@ cdef class ObjectCode: """Return the type of the underlying code object.""" return self._code_type + @property + def symbol_mapping(self) -> dict: + """Return a copy of the symbol mapping dictionary.""" + return dict(self._sym_map) + @property def handle(self): """Return the underlying handle object. 
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 843fbf677f..d5a35a1ea5 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -79,7 +79,7 @@ def get_saxpy_kernel_ptx(init_cuda): "ptx", name_expressions=("saxpy", "saxpy"), ) - ptx = mod._module + ptx = mod.code return ptx, mod @@ -100,10 +100,10 @@ def test_get_kernel(init_cuda): if any("The CUDA driver version is older than the backend version" in str(warning.message) for warning in w): pytest.skip("PTX version too new for current driver") - assert object_code._handle is None + # Verify lazy loading: get_kernel triggers module loading and returns a valid kernel kernel = object_code.get_kernel("ABC") - assert object_code._handle is not None - assert kernel._handle is not None + assert object_code.handle is not None + assert kernel.handle is not None @pytest.mark.parametrize( @@ -143,7 +143,7 @@ def test_read_only_kernel_attributes(get_saxpy_kernel_cubin, attr, expected_type def test_object_code_load_ptx(get_saxpy_kernel_ptx): ptx, mod = get_saxpy_kernel_ptx - sym_map = mod._sym_map + sym_map = mod.symbol_mapping mod_obj = ObjectCode.from_ptx(ptx, symbol_mapping=sym_map) assert mod.code == ptx if not Program._can_load_generated_ptx(): @@ -153,7 +153,7 @@ def test_object_code_load_ptx(get_saxpy_kernel_ptx): def test_object_code_load_ptx_from_file(get_saxpy_kernel_ptx, tmp_path): ptx, mod = get_saxpy_kernel_ptx - sym_map = mod._sym_map + sym_map = mod.symbol_mapping assert isinstance(ptx, bytes) ptx_file = tmp_path / "test.ptx" ptx_file.write_bytes(ptx) @@ -167,8 +167,8 @@ def test_object_code_load_ptx_from_file(get_saxpy_kernel_ptx, tmp_path): def test_object_code_load_cubin(get_saxpy_kernel_cubin): _, mod = get_saxpy_kernel_cubin - cubin = mod._module - sym_map = mod._sym_map + cubin = mod.code + sym_map = mod.symbol_mapping assert isinstance(cubin, bytes) mod = ObjectCode.from_cubin(cubin, symbol_mapping=sym_map) assert mod.code == cubin @@ -177,8 +177,8 @@ def test_object_code_load_cubin(get_saxpy_kernel_cubin): def test_object_code_load_cubin_from_file(get_saxpy_kernel_cubin, tmp_path): _, mod = get_saxpy_kernel_cubin - cubin = mod._module - sym_map = mod._sym_map + cubin = mod.code + sym_map = mod.symbol_mapping assert isinstance(cubin, bytes) cubin_file = tmp_path / "test.cubin" cubin_file.write_bytes(cubin) @@ -194,14 +194,13 @@ def test_object_code_handle(get_saxpy_kernel_cubin): def test_object_code_load_ltoir(get_saxpy_kernel_ltoir): mod = get_saxpy_kernel_ltoir - ltoir = mod._module - sym_map = mod._sym_map + ltoir = mod.code + sym_map = mod.symbol_mapping assert isinstance(ltoir, bytes) mod_obj = ObjectCode.from_ltoir(ltoir, symbol_mapping=sym_map) assert mod_obj.code == ltoir assert mod_obj.code_type == "ltoir" # ltoir doesn't support kernel retrieval directly as it's used for linking - assert mod_obj._handle is None # Test that get_kernel fails for unsupported code type with pytest.raises(RuntimeError, match=r'Unsupported code type "ltoir"'): mod_obj.get_kernel("saxpy") @@ -209,8 +208,8 @@ def test_object_code_load_ltoir(get_saxpy_kernel_ltoir): def test_object_code_load_ltoir_from_file(get_saxpy_kernel_ltoir, tmp_path): mod = get_saxpy_kernel_ltoir - ltoir = mod._module - sym_map = mod._sym_map + ltoir = mod.code + sym_map = mod.symbol_mapping assert isinstance(ltoir, bytes) ltoir_file = tmp_path / "test.ltoir" ltoir_file.write_bytes(ltoir) @@ -218,7 +217,6 @@ def test_object_code_load_ltoir_from_file(get_saxpy_kernel_ltoir, tmp_path): assert mod_obj.code == 
str(ltoir_file) assert mod_obj.code_type == "ltoir" # ltoir doesn't support kernel retrieval directly as it's used for linking - assert mod_obj._handle is None def test_saxpy_arguments(get_saxpy_kernel_cubin, cuda12_4_prerequisite_check): @@ -423,7 +421,7 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin): assert isinstance(result, ObjectCode) assert objcode.code == result.code - assert objcode._sym_map == result._sym_map + assert objcode.symbol_mapping == result.symbol_mapping assert objcode.code_type == result.code_type @@ -432,7 +430,7 @@ def test_kernel_from_handle(get_saxpy_kernel_cubin): original_kernel, objcode = get_saxpy_kernel_cubin # Get the handle from the original kernel - handle = int(original_kernel._handle) + handle = int(original_kernel.handle) # Create a new Kernel from the handle kernel_from_handle = Kernel.from_handle(handle, objcode) @@ -449,7 +447,7 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): original_kernel, _ = get_saxpy_kernel_cubin # Get the handle from the original kernel - handle = int(original_kernel._handle) + handle = int(original_kernel.handle) # Create a new Kernel from the handle without a module # This is supported on CUDA 12+ backend (CUkernel) @@ -486,7 +484,7 @@ def test_kernel_from_handle_type_validation(invalid_value): def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): """Test Kernel.from_handle() with invalid module parameter""" original_kernel, _ = get_saxpy_kernel_cubin - handle = int(original_kernel._handle) + handle = int(original_kernel.handle) # Invalid module type (should fail type assertion in _from_obj) with pytest.raises((TypeError, AssertionError)): @@ -499,7 +497,7 @@ def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): """Test creating multiple Kernel instances from the same handle""" original_kernel, objcode = get_saxpy_kernel_cubin - handle = int(original_kernel._handle) + handle = int(original_kernel.handle) # Create multiple Kernel instances from the same handle kernel1 = Kernel.from_handle(handle, objcode) @@ -512,4 +510,4 @@ def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): assert isinstance(kernel3, Kernel) # All should reference the same underlying CUDA kernel handle - assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle + assert int(kernel1.handle) == int(kernel2.handle) == int(kernel3.handle) == handle diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 9a9e4926ae..e2b3783dd7 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -335,7 +335,7 @@ def test_cpp_program_with_pch_options(init_cuda, tmp_path): @pytest.mark.parametrize("options", options) def test_ptx_program_with_various_options(init_cuda, ptx_code_object, options): - program = Program(ptx_code_object._module.decode(), "ptx", options=options) + program = Program(ptx_code_object.code.decode(), "ptx", options=options) assert program.backend == ("driver" if is_culink_backend else "nvJitLink") program.compile("cubin") program.close() @@ -378,7 +378,7 @@ def test_program_compile_valid_target_type(init_cuda): ptx_kernel = ptx_object_code.get_kernel("my_kernel") assert isinstance(ptx_kernel, Kernel) - program = Program(ptx_object_code._module.decode(), "ptx", options={"name": "24"}) + program = Program(ptx_object_code.code.decode(), "ptx", options={"name": "24"}) cubin_object_code = 
program.compile("cubin") assert isinstance(cubin_object_code, ObjectCode) assert cubin_object_code.name == "24" From 859bfc914ac1e5ba0bab722da23c43c6e5d9c9cc Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 15:24:22 -0800 Subject: [PATCH 03/10] Convert module-level functions and Kernel._get_arguments_info to cdef - Module globals: _inited, _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _paraminfo_supported -> cdef typed - Module functions: _lazy_init, _get_py_major_ver, _get_py_minor_ver, _get_driver_ver, _get_kernel_ctypes, _is_paraminfo_supported, _make_dummy_library_handle -> cdef inline with exception specs - Module constant: _supported_code_type -> cdef tuple - Kernel._get_arguments_info -> cdef tuple Note: KernelAttributes remains a regular Python class due to segfaults when converted to cdef class (likely due to weakref interaction with cdef class properties). --- cuda_core/cuda/core/_module.pxd | 4 ++- cuda_core/cuda/core/_module.pyx | 48 +++++++++++++++++---------------- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index b83e08ed47..5f0fb9b572 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -11,13 +11,15 @@ cdef class Kernel: cdef: object _handle # CUkernel (will become KernelHandle in phase 2c) ObjectCode _module # ObjectCode reference - object _attributes # KernelAttributes (regular Python class) + object _attributes # KernelAttributes (regular Python class, lazy) KernelOccupancy _occupancy # KernelOccupancy (lazy) object __weakref__ # Enable weak references @staticmethod cdef Kernel _from_obj(object obj, ObjectCode mod) + cdef tuple _get_arguments_info(self, bint param_info=*) + cdef class ObjectCode: cdef: diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 6a04db7741..6dd0aad817 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -24,16 +24,16 @@ __all__ = ["Kernel", "ObjectCode"] # Lazy initialization state and synchronization # For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization. # For regular Python builds with GIL, the lock overhead is minimal and the code remains safe. -_init_lock = threading.Lock() -_inited = False -_py_major_ver = None -_py_minor_ver = None -_driver_ver = None -_kernel_ctypes = None -_paraminfo_supported = False +cdef object _init_lock = threading.Lock() +cdef bint _inited = False +cdef int _py_major_ver = 0 +cdef int _py_minor_ver = 0 +cdef int _driver_ver = 0 +cdef tuple _kernel_ctypes = None +cdef bint _paraminfo_supported = False -def _lazy_init(): +cdef int _lazy_init() except -1: """ Initialize module-level state in a thread-safe manner. 
@@ -48,13 +48,13 @@ def _lazy_init(): global _inited # Fast path: already initialized (no lock needed for read) if _inited: - return + return 0 # Slow path: acquire lock and initialize with _init_lock: # Double-check: another thread might have initialized while we waited if _inited: - return + return 0 global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _paraminfo_supported # binding availability depends on cuda-python version @@ -66,33 +66,35 @@ def _lazy_init(): # Mark as initialized (must be last to ensure all state is set) _inited = True + return 0 + -# Auto-initializing property accessors -def _get_py_major_ver(): +# Auto-initializing accessors (cdef for internal use) +cdef inline int _get_py_major_ver() except -1: """Get the Python binding major version, initializing if needed.""" _lazy_init() return _py_major_ver -def _get_py_minor_ver(): +cdef inline int _get_py_minor_ver() except -1: """Get the Python binding minor version, initializing if needed.""" _lazy_init() return _py_minor_ver -def _get_driver_ver(): +cdef inline int _get_driver_ver() except -1: """Get the CUDA driver version, initializing if needed.""" _lazy_init() return _driver_ver -def _get_kernel_ctypes(): +cdef inline tuple _get_kernel_ctypes(): """Get the kernel ctypes tuple, initializing if needed.""" _lazy_init() return _kernel_ctypes -def _is_paraminfo_supported(): +cdef inline bint _is_paraminfo_supported() except -1: """Return True if cuKernelGetParamInfo is available (driver >= 12.4).""" _lazy_init() return _paraminfo_supported @@ -111,7 +113,7 @@ def _is_cukernel_get_library_supported() -> bool: ) -def _make_dummy_library_handle(): +cdef inline object _make_dummy_library_handle(): """Create a non-null placeholder CUlibrary handle to disable lazy loading.""" return driver.CUlibrary(1) if hasattr(driver, "CUlibrary") else 1 @@ -132,10 +134,10 @@ class KernelAttributes: _lazy_init() return self - def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfunction_attribute) -> int: + def _get_cached_attribute(self, device_id, attribute): """Helper function to get a cached attribute or fetch and cache it if not present.""" device_id = Device(device_id).device_id - cache_key = device_id, attribute + cache_key = (device_id, attribute) result = self._cache.get(cache_key, cache_key) if result is not cache_key: return result @@ -440,15 +442,15 @@ cdef class Kernel: self._attributes = KernelAttributes._init(self) return self._attributes - def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: + cdef tuple _get_arguments_info(self, bint param_info=False): if not _is_paraminfo_supported(): driver_ver = _get_driver_ver() raise NotImplementedError( "Driver version 12.4 or newer is required for this function. 
" f"Using driver version {driver_ver // 1000}.{(driver_ver % 1000) // 10}" ) - arg_pos = 0 - param_info_data = [] + cdef int arg_pos = 0 + cdef list param_info_data = [] while True: result = driver.cuKernelGetParamInfo(self._handle, arg_pos) if result[0] != driver.CUresult.CUDA_SUCCESS: @@ -535,7 +537,7 @@ cdef class Kernel: CodeTypeT = bytes | bytearray | str -_supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") +cdef tuple _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") cdef class ObjectCode: From 360f918abc77e120d38a288ae8e5c894823a4627 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 15:53:25 -0800 Subject: [PATCH 04/10] Convert KernelAttributes to cdef class Follow the _MemPoolAttributes pattern: - cdef class with inline cdef attributes (_kernel_weakref, _cache) - _init as @classmethod (not @staticmethod cdef) - _get_cached_attribute and _resolve_device_id use except? -1 - Explicit cast when dereferencing weakref --- cuda_core/cuda/core/_module.pyx | 75 +++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 6dd0aad817..354dcbceb0 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -118,89 +118,110 @@ cdef inline object _make_dummy_library_handle(): return driver.CUlibrary(1) if hasattr(driver, "CUlibrary") else 1 -class KernelAttributes: +cdef class KernelAttributes: """Provides access to kernel attributes. Uses weakref to avoid preventing Kernel GC.""" - __slots__ = ("_kernel", "_cache") + cdef: + object _kernel_weakref + dict _cache - def __new__(cls, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.") @classmethod def _init(cls, kernel): - self = object.__new__(cls) - self._kernel = weakref.ref(kernel) + cdef KernelAttributes self = KernelAttributes.__new__(cls) + self._kernel_weakref = weakref.ref(kernel) self._cache = {} _lazy_init() return self - def _get_cached_attribute(self, device_id, attribute): + cdef int _get_cached_attribute(self, int device_id, object attribute) except? -1: """Helper function to get a cached attribute or fetch and cache it if not present.""" - device_id = Device(device_id).device_id - cache_key = (device_id, attribute) + cdef tuple cache_key = (device_id, attribute) result = self._cache.get(cache_key, cache_key) if result is not cache_key: return result - kernel = self._kernel() + cdef Kernel kernel = (self._kernel_weakref()) if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - result = handle_return(driver.cuKernelGetAttribute(attribute, kernel.handle, device_id)) + result = handle_return(driver.cuKernelGetAttribute(attribute, kernel._handle, device_id)) self._cache[cache_key] = result return result + cdef inline int _resolve_device_id(self, device_id) except? -1: + """Convert Device or int to device_id int.""" + return Device(device_id).device_id + def max_threads_per_block(self, device_id: Device | int = None) -> int: """int : The maximum number of threads per block. 
This attribute is read-only.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK ) def shared_size_bytes(self, device_id: Device | int = None) -> int: """int : The size in bytes of statically-allocated shared memory required by this function. This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES + ) def const_size_bytes(self, device_id: Device | int = None) -> int: """int : The size in bytes of user-allocated constant memory required by this function. This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES + ) def local_size_bytes(self, device_id: Device | int = None) -> int: """int : The size in bytes of local memory used by each thread of this function. This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES + ) def num_regs(self, device_id: Device | int = None) -> int: """int : The number of registers used by each thread of this function. This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS + ) def ptx_version(self, device_id: Device | int = None) -> int: """int : The PTX virtual architecture version for which the function was compiled. This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PTX_VERSION) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PTX_VERSION + ) def binary_version(self, device_id: Device | int = None) -> int: """int : The binary architecture version for which the function was compiled. This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_BINARY_VERSION) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_BINARY_VERSION + ) def cache_mode_ca(self, device_id: Device | int = None) -> bool: """bool : Whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set. 
This attribute is read-only.""" - return bool(self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA)) + return bool( + self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA + ) + ) def max_dynamic_shared_size_bytes(self, device_id: Device | int = None) -> int: """int : The maximum size in bytes of dynamically-allocated shared memory that can be used by this function.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES ) def preferred_shared_memory_carveout(self, device_id: Device | int = None) -> int: """int : The shared memory carveout preference, in percent of the total shared memory.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT ) def cluster_size_must_be_set(self, device_id: Device | int = None) -> bool: @@ -208,40 +229,42 @@ class KernelAttributes: This attribute is read-only.""" return bool( self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET ) ) def required_cluster_width(self, device_id: Device | int = None) -> int: """int : The required cluster width in blocks.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH ) def required_cluster_height(self, device_id: Device | int = None) -> int: """int : The required cluster height in blocks.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT ) def required_cluster_depth(self, device_id: Device | int = None) -> int: """int : The required cluster depth in blocks.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH ) def non_portable_cluster_size_allowed(self, device_id: Device | int = None) -> bool: """bool : Whether the function can be launched with non-portable cluster size.""" return bool( self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED + self._resolve_device_id(device_id), + driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, ) ) def cluster_scheduling_policy_preference(self, device_id: Device | int = None) -> int: """int : The block scheduling policy of a function.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE + self._resolve_device_id(device_id), + driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE, ) From 4f236e0a59faf4d6fe92a7298c5c90c588557ccf Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 17:16:57 -0800 Subject: 
[PATCH 05/10] Add LibraryHandle and KernelHandle to resource_handles infrastructure Extends the RAII handle system to support CUlibrary and CUkernel driver objects used in _module.pyx. This provides automatic lifetime management and proper cleanup for library and kernel handles. Changes: - Add LibraryHandle/KernelHandle types with factory functions - Update Kernel, ObjectCode, KernelOccupancy to use typed handles - Move KernelAttributes cdef block to .pxd for strong typing - Update _launcher.pyx to access kernel handle directly via cdef --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 82 +++++++++++++ cuda_core/cuda/core/_cpp/resource_handles.hpp | 66 ++++++++++ cuda_core/cuda/core/_launcher.pyx | 4 +- cuda_core/cuda/core/_module.pxd | 26 ++-- cuda_core/cuda/core/_module.pyx | 114 +++++++++++------- cuda_core/cuda/core/_resource_handles.pxd | 18 +++ cuda_core/cuda/core/_resource_handles.pyx | 28 +++++ 7 files changed, 285 insertions(+), 53 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 724ea97169..7e6e388579 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -51,6 +51,11 @@ decltype(&cuMemFreeHost) p_cuMemFreeHost = nullptr; decltype(&cuMemPoolImportPointer) p_cuMemPoolImportPointer = nullptr; +decltype(&cuLibraryLoadFromFile) p_cuLibraryLoadFromFile = nullptr; +decltype(&cuLibraryLoadData) p_cuLibraryLoadData = nullptr; +decltype(&cuLibraryUnload) p_cuLibraryUnload = nullptr; +decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr; + // ============================================================================ // GIL management helpers // ============================================================================ @@ -682,4 +687,81 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export } } +// ============================================================================ +// Library Handles +// ============================================================================ + +namespace { +struct LibraryBox { + CUlibrary resource; +}; +} // namespace + +LibraryHandle create_library_handle_from_file(const char* path) { + GILReleaseGuard gil; + CUlibrary library; + if (CUDA_SUCCESS != (err = p_cuLibraryLoadFromFile(&library, path, nullptr, nullptr, 0, nullptr, nullptr, 0))) { + return {}; + } + + auto box = std::shared_ptr( + new LibraryBox{library}, + [](const LibraryBox* b) { + GILReleaseGuard gil; + p_cuLibraryUnload(b->resource); + delete b; + } + ); + return LibraryHandle(box, &box->resource); +} + +LibraryHandle create_library_handle_from_data(const void* data) { + GILReleaseGuard gil; + CUlibrary library; + if (CUDA_SUCCESS != (err = p_cuLibraryLoadData(&library, data, nullptr, nullptr, 0, nullptr, nullptr, 0))) { + return {}; + } + + auto box = std::shared_ptr( + new LibraryBox{library}, + [](const LibraryBox* b) { + GILReleaseGuard gil; + p_cuLibraryUnload(b->resource); + delete b; + } + ); + return LibraryHandle(box, &box->resource); +} + +LibraryHandle create_library_handle_ref(CUlibrary library) { + auto box = std::make_shared(LibraryBox{library}); + return LibraryHandle(box, &box->resource); +} + +// ============================================================================ +// Kernel Handles +// ============================================================================ + +namespace { +struct KernelBox { + CUkernel resource; + LibraryHandle h_library; // Keeps library alive +}; +} // namespace + 
+KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name) { + GILReleaseGuard gil; + CUkernel kernel; + if (CUDA_SUCCESS != (err = p_cuLibraryGetKernel(&kernel, *h_library, name))) { + return {}; + } + + return create_kernel_handle_ref(kernel, h_library); +} + +KernelHandle create_kernel_handle_ref(CUkernel kernel, LibraryHandle h_library) { + auto box = std::make_shared(KernelBox{kernel, h_library}); + return KernelHandle(box, &box->resource); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 4a6d9bb241..ba76a6c054 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -61,6 +61,12 @@ extern decltype(&cuMemFreeHost) p_cuMemFreeHost; extern decltype(&cuMemPoolImportPointer) p_cuMemPoolImportPointer; +// Library +extern decltype(&cuLibraryLoadFromFile) p_cuLibraryLoadFromFile; +extern decltype(&cuLibraryLoadData) p_cuLibraryLoadData; +extern decltype(&cuLibraryUnload) p_cuLibraryUnload; +extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel; + // ============================================================================ // Handle type aliases - expose only the raw CUDA resource // ============================================================================ @@ -69,6 +75,8 @@ using ContextHandle = std::shared_ptr; using StreamHandle = std::shared_ptr; using EventHandle = std::shared_ptr; using MemoryPoolHandle = std::shared_ptr; +using LibraryHandle = std::shared_ptr; +using KernelHandle = std::shared_ptr; // ============================================================================ // Context handle functions @@ -218,6 +226,40 @@ StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept; // Set the deallocation stream for a device pointer handle. void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept; +// ============================================================================ +// Library handle functions +// ============================================================================ + +// Create an owning library handle by loading from a file path. +// When the last reference is released, cuLibraryUnload is called automatically. +// Returns empty handle on error (caller must check). +LibraryHandle create_library_handle_from_file(const char* path); + +// Create an owning library handle by loading from memory data. +// The driver makes an internal copy of the data; caller can free it after return. +// When the last reference is released, cuLibraryUnload is called automatically. +// Returns empty handle on error (caller must check). +LibraryHandle create_library_handle_from_data(const void* data); + +// Create a non-owning library handle (references existing library). +// Use for borrowed libraries (e.g., from foreign code). +// The library will NOT be unloaded when the handle is released. +LibraryHandle create_library_handle_ref(CUlibrary library); + +// ============================================================================ +// Kernel handle functions +// ============================================================================ + +// Get a kernel from a library by name. +// The kernel structurally depends on the provided library handle. +// Kernels have no explicit destroy - their lifetime is tied to the library. +// Returns empty handle on error (caller must check). 
+KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name);
+
+// Create a non-owning kernel handle with library dependency.
+// Use for borrowed kernels. The library handle keeps the library alive.
+KernelHandle create_kernel_handle_ref(CUkernel kernel, LibraryHandle h_library);
+
 // ============================================================================
 // Overloaded helper functions to extract raw resources from handles
 // ============================================================================
@@ -243,6 +285,14 @@ inline CUdeviceptr as_cu(const DevicePtrHandle& h) noexcept {
     return h ? *h : 0;
 }
 
+inline CUlibrary as_cu(const LibraryHandle& h) noexcept {
+    return h ? *h : nullptr;
+}
+
+inline CUkernel as_cu(const KernelHandle& h) noexcept {
+    return h ? *h : nullptr;
+}
+
 // as_intptr() - extract handle as intptr_t for Python interop
 // Using signed intptr_t per C standard convention and issue #1342
 inline std::intptr_t as_intptr(const ContextHandle& h) noexcept {
@@ -265,6 +315,14 @@ inline std::intptr_t as_intptr(const DevicePtrHandle& h) noexcept {
     return static_cast<std::intptr_t>(as_cu(h));
 }
 
+inline std::intptr_t as_intptr(const LibraryHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(as_cu(h));
+}
+
+inline std::intptr_t as_intptr(const KernelHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(as_cu(h));
+}
+
 // as_py() - convert handle to Python driver wrapper object (returns new reference)
 namespace detail {
 // n.b. class lookup is not cached to avoid deadlock hazard, see DESIGN.md
@@ -300,4 +358,12 @@ inline PyObject* as_py(const DevicePtrHandle& h) noexcept {
     return detail::make_py("CUdeviceptr", as_intptr(h));
 }
 
+inline PyObject* as_py(const LibraryHandle& h) noexcept {
+    return detail::make_py("CUlibrary", as_intptr(h));
+}
+
+inline PyObject* as_py(const KernelHandle& h) noexcept {
+    return detail::make_py("CUkernel", as_intptr(h));
+}
+
 } // namespace cuda_core
diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx
index 9066e72732..48eb2038b2 100644
--- a/cuda_core/cuda/core/_launcher.pyx
+++ b/cuda_core/cuda/core/_launcher.pyx
@@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver
 
 from cuda.core._launch_config cimport LaunchConfig
 from cuda.core._kernel_arg_handler cimport ParamHolder
+from cuda.core._module cimport Kernel
 from cuda.core._resource_handles cimport as_cu
 from cuda.core._stream cimport Stream_accept, Stream
 from cuda.core._utils.cuda_utils cimport (
@@ -80,7 +81,8 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern
     # Note: We now use CUkernel handles exclusively (CUDA 12+), but they can be cast to
     # CUfunction for use with cuLaunchKernel, as both handle types are interchangeable
     # for kernel launch purposes.
-    cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>(<intptr_t>(kernel.handle))
+    cdef Kernel ker = kernel
+    cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>as_cu(ker._h_kernel)
 
     # Note: CUkernel can still be launched via cuLaunchKernel (not just cuLaunchKernelEx).
# We check both binding & driver versions here mainly to see if the "Ex" API is diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index 5f0fb9b572..c17c7163bd 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -2,28 +2,31 @@ # # SPDX-License-Identifier: Apache-2.0 +from cuda.core._resource_handles cimport LibraryHandle, KernelHandle + cdef class ObjectCode cdef class Kernel cdef class KernelOccupancy +cdef class KernelAttributes cdef class Kernel: cdef: - object _handle # CUkernel (will become KernelHandle in phase 2c) + KernelHandle _h_kernel ObjectCode _module # ObjectCode reference - object _attributes # KernelAttributes (regular Python class, lazy) - KernelOccupancy _occupancy # KernelOccupancy (lazy) + KernelAttributes _attributes # lazy + KernelOccupancy _occupancy # lazy object __weakref__ # Enable weak references @staticmethod - cdef Kernel _from_obj(object obj, ObjectCode mod) + cdef Kernel _from_obj(KernelHandle h_kernel, ObjectCode mod) cdef tuple _get_arguments_info(self, bint param_info=*) cdef class ObjectCode: cdef: - object _handle # CUlibrary (will become LibraryHandle in phase 2c) + LibraryHandle _h_library str _code_type object _module # bytes/str source dict _sym_map @@ -37,7 +40,16 @@ cdef class ObjectCode: cdef class KernelOccupancy: cdef: - object _handle # CUkernel reference + KernelHandle _h_kernel @staticmethod - cdef KernelOccupancy _init(object handle) + cdef KernelOccupancy _init(KernelHandle h_kernel) + + +cdef class KernelAttributes: + cdef: + object _kernel_weakref + dict _cache + + cdef int _get_cached_attribute(self, int device_id, object attribute) except? -1 + cdef int _resolve_device_id(self, device_id) except? -1 diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 354dcbceb0..c48550241d 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -11,13 +11,25 @@ from collections import namedtuple from cuda.core._device import Device from cuda.core._launch_config import LaunchConfig, _to_native_launch_config +from cuda.core._resource_handles cimport ( + LibraryHandle, + KernelHandle, + create_library_handle_from_file, + create_library_handle_from_data, + create_library_handle_ref, + get_kernel_from_library, + create_kernel_handle_ref, + get_last_error, + as_py, +) from cuda.core._stream import Stream from cuda.core._utils.clear_error_support import ( - assert_type, assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) -from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.bindings cimport cydriver __all__ = ["Kernel", "ObjectCode"] @@ -113,18 +125,14 @@ def _is_cukernel_get_library_supported() -> bool: ) -cdef inline object _make_dummy_library_handle(): - """Create a non-null placeholder CUlibrary handle to disable lazy loading.""" - return driver.CUlibrary(1) if hasattr(driver, "CUlibrary") else 1 +cdef inline LibraryHandle _make_empty_library_handle(): + """Create an empty LibraryHandle to indicate no library loaded.""" + return LibraryHandle() # Empty shared_ptr cdef class KernelAttributes: """Provides access to kernel attributes. 
Uses weakref to avoid preventing Kernel GC.""" - cdef: - object _kernel_weakref - dict _cache - def __init__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.") @@ -145,7 +153,7 @@ cdef class KernelAttributes: cdef Kernel kernel = (self._kernel_weakref()) if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - result = handle_return(driver.cuKernelGetAttribute(attribute, kernel._handle, device_id)) + result = handle_return(driver.cuKernelGetAttribute(attribute, as_py(kernel._h_kernel), device_id)) self._cache[cache_key] = result return result @@ -280,9 +288,9 @@ cdef class KernelOccupancy: raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.") @staticmethod - cdef KernelOccupancy _init(object handle): + cdef KernelOccupancy _init(KernelHandle h_kernel): cdef KernelOccupancy self = KernelOccupancy.__new__(KernelOccupancy) - self._handle = handle + self._h_kernel = h_kernel return self def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int: @@ -311,7 +319,9 @@ cdef class KernelOccupancy: """ return handle_return( - driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(self._handle, block_size, dynamic_shared_memory_size) + driver.cuOccupancyMaxActiveBlocksPerMultiprocessor( + as_py(self._h_kernel), block_size, dynamic_shared_memory_size + ) ) def max_potential_block_size( @@ -346,16 +356,17 @@ cdef class KernelOccupancy: Interpreter Lock may lead to deadlocks. """ + cdef object kernel_py = as_py(self._h_kernel) if isinstance(dynamic_shared_memory_needed, int): min_grid_size, max_block_size = handle_return( driver.cuOccupancyMaxPotentialBlockSize( - self._handle, None, dynamic_shared_memory_needed, block_size_limit + kernel_py, None, dynamic_shared_memory_needed, block_size_limit ) ) elif isinstance(dynamic_shared_memory_needed, driver.CUoccupancyB2DSize): min_grid_size, max_block_size = handle_return( driver.cuOccupancyMaxPotentialBlockSize( - self._handle, dynamic_shared_memory_needed.getPtr(), 0, block_size_limit + kernel_py, dynamic_shared_memory_needed.getPtr(), 0, block_size_limit ) ) else: @@ -383,7 +394,9 @@ cdef class KernelOccupancy: Dynamic shared memory available per block for given launch configuration. """ return handle_return( - driver.cuOccupancyAvailableDynamicSMemPerBlock(self._handle, num_blocks_per_multiprocessor, block_size) + driver.cuOccupancyAvailableDynamicSMemPerBlock( + as_py(self._h_kernel), num_blocks_per_multiprocessor, block_size + ) ) def max_potential_cluster_size(self, config: LaunchConfig, stream: Stream | None = None) -> int: @@ -406,7 +419,7 @@ cdef class KernelOccupancy: drv_cfg = _to_native_launch_config(config) if stream is not None: drv_cfg.hStream = stream.handle - return handle_return(driver.cuOccupancyMaxPotentialClusterSize(self._handle, drv_cfg)) + return handle_return(driver.cuOccupancyMaxPotentialClusterSize(as_py(self._h_kernel), drv_cfg)) def max_active_clusters(self, config: LaunchConfig, stream: Stream | None = None) -> int: """Maximum number of active clusters on the target device. 
@@ -428,7 +441,7 @@ cdef class KernelOccupancy: drv_cfg = _to_native_launch_config(config) if stream is not None: drv_cfg.hStream = stream.handle - return handle_return(driver.cuOccupancyMaxActiveClusters(self._handle, drv_cfg)) + return handle_return(driver.cuOccupancyMaxActiveClusters(as_py(self._h_kernel), drv_cfg)) ParamInfo = namedtuple("ParamInfo", ["offset", "size"]) @@ -449,10 +462,9 @@ cdef class Kernel: raise RuntimeError("Kernel objects cannot be instantiated directly. Please use ObjectCode APIs.") @staticmethod - cdef Kernel _from_obj(object obj, ObjectCode mod): - assert_type(obj, _get_kernel_ctypes()) + cdef Kernel _from_obj(KernelHandle h_kernel, ObjectCode mod): cdef Kernel ker = Kernel.__new__(Kernel) - ker._handle = obj + ker._h_kernel = h_kernel ker._module = mod ker._attributes = None ker._occupancy = None @@ -474,8 +486,9 @@ cdef class Kernel: ) cdef int arg_pos = 0 cdef list param_info_data = [] + cdef object kernel_py = as_py(self._h_kernel) while True: - result = driver.cuKernelGetParamInfo(self._handle, arg_pos) + result = driver.cuKernelGetParamInfo(kernel_py, arg_pos) if result[0] != driver.CUresult.CUDA_SUCCESS: break if param_info: @@ -502,7 +515,7 @@ cdef class Kernel: def occupancy(self) -> KernelOccupancy: """Get the occupancy information for launching this kernel.""" if self._occupancy is None: - self._occupancy = KernelOccupancy._init(self._handle) + self._occupancy = KernelOccupancy._init(self._h_kernel) return self._occupancy @property @@ -514,7 +527,7 @@ cdef class Kernel: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Kernel.handle)``. """ - return self._handle + return as_py(self._h_kernel) @staticmethod def from_handle(handle: int, mod: ObjectCode = None) -> Kernel: @@ -537,25 +550,28 @@ cdef class Kernel: if not isinstance(handle, int): raise TypeError(f"handle must be an integer, got {type(handle).__name__}") - # Convert the integer handle to CUkernel driver type - kernel_obj = driver.CUkernel(handle) + # Convert the integer handle to CUkernel + cdef cydriver.CUkernel cu_kernel = handle + cdef KernelHandle h_kernel - # If no module provided, create a placeholder + # If no module provided, create a placeholder and try to get the library if mod is None: - # For CUkernel, we can (optionally) inverse-lookup the owning CUlibrary via - # cuKernelGetLibrary (added in CUDA 12.5). If the API is not available, we fall - # back to a non-null dummy handle purely to disable lazy loading. mod = ObjectCode._init(b"", "cubin", "", None) if _is_cukernel_get_library_supported(): + # Try to get the owning library via cuKernelGetLibrary try: - mod._handle = handle_return(driver.cuKernelGetLibrary(kernel_obj)) - except (CUDAError, RuntimeError): - # Best-effort: don't fail construction if inverse lookup fails. 
- mod._handle = _make_dummy_library_handle() - else: - mod._handle = _make_dummy_library_handle() + cu_library = handle_return(driver.cuKernelGetLibrary(driver.CUkernel(handle))) + mod._h_library = create_library_handle_ref(int(cu_library)) + except Exception: + # Best-effort: don't fail construction if inverse lookup fails + pass + + # Create kernel handle with library dependency + h_kernel = create_kernel_handle_ref(cu_kernel, mod._h_library) + if not h_kernel: + HANDLE_RETURN(get_last_error()) - return Kernel._from_obj(kernel_obj, mod) + return Kernel._from_obj(h_kernel, mod) CodeTypeT = bytes | bytearray | str @@ -588,8 +604,8 @@ cdef class ObjectCode: assert code_type in _supported_code_type, f"{code_type=} is not supported" cdef ObjectCode self = ObjectCode.__new__(ObjectCode) - # handle is assigned during _lazy_load - self._handle = None + # _h_library is assigned during _lazy_load_module + self._h_library = LibraryHandle() # Empty handle _lazy_init() self._code_type = code_type @@ -723,15 +739,21 @@ cdef class ObjectCode: # TODO: do we want to unload in a finalizer? Probably not.. cdef int _lazy_load_module(self) except -1: - if self._handle is not None: + if self._h_library: return 0 module = self._module assert_type_str_or_bytes_like(module) + cdef bytes path_bytes if isinstance(module, str): - self._handle = handle_return(driver.cuLibraryLoadFromFile(module.encode(), [], [], 0, [], [], 0)) + path_bytes = module.encode() + self._h_library = create_library_handle_from_file(path_bytes) + if not self._h_library: + HANDLE_RETURN(get_last_error()) return 0 if isinstance(module, (bytes, bytearray)): - self._handle = handle_return(driver.cuLibraryLoadData(module, [], [], 0, [], [], 0)) + self._h_library = create_library_handle_from_data(module) + if not self._h_library: + HANDLE_RETURN(get_last_error()) return 0 raise_code_path_meant_to_be_unreachable() return -1 @@ -759,8 +781,10 @@ cdef class ObjectCode: except KeyError: name = name.encode() - data = handle_return(driver.cuLibraryGetKernel(self._handle, name)) - return Kernel._from_obj(data, self) + cdef KernelHandle h_kernel = get_kernel_from_library(self._h_library, name) + if not h_kernel: + HANDLE_RETURN(get_last_error()) + return Kernel._from_obj(h_kernel, self) @property def code(self) -> CodeTypeT: @@ -792,4 +816,4 @@ cdef class ObjectCode: handle, call ``int(ObjectCode.handle)``. 
""" self._lazy_load_module() - return self._handle + return as_py(self._h_library) diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 7a634f3a82..10816481b0 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -21,6 +21,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUevent] EventHandle ctypedef shared_ptr[const cydriver.CUmemoryPool] MemoryPoolHandle ctypedef shared_ptr[const cydriver.CUdeviceptr] DevicePtrHandle + ctypedef shared_ptr[const cydriver.CUlibrary] LibraryHandle + ctypedef shared_ptr[const cydriver.CUkernel] KernelHandle # as_cu() - extract the raw CUDA handle (inline C++) cydriver.CUcontext as_cu(ContextHandle h) noexcept nogil @@ -28,6 +30,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUevent as_cu(EventHandle h) noexcept nogil cydriver.CUmemoryPool as_cu(MemoryPoolHandle h) noexcept nogil cydriver.CUdeviceptr as_cu(DevicePtrHandle h) noexcept nogil + cydriver.CUlibrary as_cu(LibraryHandle h) noexcept nogil + cydriver.CUkernel as_cu(KernelHandle h) noexcept nogil # as_intptr() - extract handle as intptr_t for Python interop (inline C++) intptr_t as_intptr(ContextHandle h) noexcept nogil @@ -35,6 +39,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": intptr_t as_intptr(EventHandle h) noexcept nogil intptr_t as_intptr(MemoryPoolHandle h) noexcept nogil intptr_t as_intptr(DevicePtrHandle h) noexcept nogil + intptr_t as_intptr(LibraryHandle h) noexcept nogil + intptr_t as_intptr(KernelHandle h) noexcept nogil # as_py() - convert handle to Python driver wrapper object (inline C++; requires GIL) object as_py(ContextHandle h) @@ -42,6 +48,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": object as_py(EventHandle h) object as_py(MemoryPoolHandle h) object as_py(DevicePtrHandle h) + object as_py(LibraryHandle h) + object as_py(KernelHandle h) # ============================================================================= @@ -94,3 +102,13 @@ cdef DevicePtrHandle deviceptr_import_ipc( MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) nogil except+ cdef StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil cdef void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept nogil + +# Library handles +cdef LibraryHandle create_library_handle_from_file(const char* path) nogil except+ +cdef LibraryHandle create_library_handle_from_data(const void* data) nogil except+ +cdef LibraryHandle create_library_handle_ref(cydriver.CUlibrary library) nogil except+ + +# Kernel handles +cdef KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name) nogil except+ +cdef KernelHandle create_kernel_handle_ref( + cydriver.CUkernel kernel, LibraryHandle h_library) nogil except+ diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 7989cd1bb0..a65836b864 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -21,6 +21,8 @@ from ._resource_handles cimport ( EventHandle, MemoryPoolHandle, DevicePtrHandle, + LibraryHandle, + KernelHandle, ) import cuda.bindings.cydriver as cydriver @@ -91,6 +93,20 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": void set_deallocation_stream "cuda_core::set_deallocation_stream" ( const DevicePtrHandle& h, StreamHandle 
h_stream) noexcept nogil + # Library handles + LibraryHandle create_library_handle_from_file "cuda_core::create_library_handle_from_file" ( + const char* path) nogil except+ + LibraryHandle create_library_handle_from_data "cuda_core::create_library_handle_from_data" ( + const void* data) nogil except+ + LibraryHandle create_library_handle_ref "cuda_core::create_library_handle_ref" ( + cydriver.CUlibrary library) nogil except+ + + # Kernel handles + KernelHandle get_kernel_from_library "cuda_core::get_kernel_from_library" ( + LibraryHandle h_library, const char* name) nogil except+ + KernelHandle create_kernel_handle_ref "cuda_core::create_kernel_handle_ref" ( + cydriver.CUkernel kernel, LibraryHandle h_library) nogil except+ + # ============================================================================= # CUDA Driver API capsule @@ -152,6 +168,12 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # IPC void* p_cuMemPoolImportPointer "reinterpret_cast(cuda_core::p_cuMemPoolImportPointer)" + # Library + void* p_cuLibraryLoadFromFile "reinterpret_cast(cuda_core::p_cuLibraryLoadFromFile)" + void* p_cuLibraryLoadData "reinterpret_cast(cuda_core::p_cuLibraryLoadData)" + void* p_cuLibraryUnload "reinterpret_cast(cuda_core::p_cuLibraryUnload)" + void* p_cuLibraryGetKernel "reinterpret_cast(cuda_core::p_cuLibraryGetKernel)" + # Initialize driver function pointers from cydriver.__pyx_capi__ at module load cdef void* _get_driver_fn(str name): @@ -195,3 +217,9 @@ p_cuMemFreeHost = _get_driver_fn("cuMemFreeHost") # IPC p_cuMemPoolImportPointer = _get_driver_fn("cuMemPoolImportPointer") + +# Library +p_cuLibraryLoadFromFile = _get_driver_fn("cuLibraryLoadFromFile") +p_cuLibraryLoadData = _get_driver_fn("cuLibraryLoadData") +p_cuLibraryUnload = _get_driver_fn("cuLibraryUnload") +p_cuLibraryGetKernel = _get_driver_fn("cuLibraryGetKernel") From 82d92c9a3fe0aef33ee39627e11c048c84f35b55 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 17:36:04 -0800 Subject: [PATCH 06/10] Convert _module.pyx driver calls to cydriver with nogil Replaces Python-level driver API calls with low-level cydriver calls wrapped in nogil blocks for improved performance. This allows the GIL to be released during CUDA driver operations. Changes: - cuDriverGetVersion, cuKernelGetAttribute, cuKernelGetParamInfo - cuOccupancy* functions (with appropriate GIL handling for callbacks) - cuKernelGetLibrary - Update KernelAttributes._get_cached_attribute to use cydriver types --- cuda_core/cuda/core/_module.pxd | 3 +- cuda_core/cuda/core/_module.pyx | 168 +++++++++++++++++++------------- 2 files changed, 104 insertions(+), 67 deletions(-) diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index c17c7163bd..b49f064d52 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport LibraryHandle, KernelHandle cdef class ObjectCode @@ -51,5 +52,5 @@ cdef class KernelAttributes: object _kernel_weakref dict _cache - cdef int _get_cached_attribute(self, int device_id, object attribute) except? -1 + cdef int _get_cached_attribute(self, int device_id, cydriver.CUfunction_attribute attribute) except? -1 cdef int _resolve_device_id(self, device_id) except? 
-1 diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index c48550241d..a1d3dadb49 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -4,13 +4,17 @@ from __future__ import annotations +from libc.stddef cimport size_t + import functools import threading import weakref from collections import namedtuple from cuda.core._device import Device -from cuda.core._launch_config import LaunchConfig, _to_native_launch_config +from cuda.core._launch_config cimport LaunchConfig +from cuda.core._launch_config import LaunchConfig +from cuda.core._stream cimport Stream from cuda.core._resource_handles cimport ( LibraryHandle, KernelHandle, @@ -20,6 +24,7 @@ from cuda.core._resource_handles cimport ( get_kernel_from_library, create_kernel_handle_ref, get_last_error, + as_cu, as_py, ) from cuda.core._stream import Stream @@ -28,7 +33,7 @@ from cuda.core._utils.clear_error_support import ( raise_code_path_meant_to_be_unreachable, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.core._utils.cuda_utils import driver, get_binding_version from cuda.bindings cimport cydriver __all__ = ["Kernel", "ObjectCode"] @@ -62,6 +67,7 @@ cdef int _lazy_init() except -1: if _inited: return 0 + cdef int drv_ver # Slow path: acquire lock and initialize with _init_lock: # Double-check: another thread might have initialized while we waited @@ -72,7 +78,9 @@ cdef int _lazy_init() except -1: # binding availability depends on cuda-python version _py_major_ver, _py_minor_ver = get_binding_version() _kernel_ctypes = (driver.CUkernel,) - _driver_ver = handle_return(driver.cuDriverGetVersion()) + with nogil: + HANDLE_RETURN(cydriver.cuDriverGetVersion(&drv_ver)) + _driver_ver = drv_ver _paraminfo_supported = _driver_ver >= 12040 # Mark as initialized (must be last to ensure all state is set) @@ -144,16 +152,18 @@ cdef class KernelAttributes: _lazy_init() return self - cdef int _get_cached_attribute(self, int device_id, object attribute) except? -1: + cdef int _get_cached_attribute(self, int device_id, cydriver.CUfunction_attribute attribute) except? -1: """Helper function to get a cached attribute or fetch and cache it if not present.""" - cdef tuple cache_key = (device_id, attribute) - result = self._cache.get(cache_key, cache_key) - if result is not cache_key: - return result + cdef tuple cache_key = (device_id, attribute) + cached = self._cache.get(cache_key, cache_key) + if cached is not cache_key: + return cached cdef Kernel kernel = (self._kernel_weakref()) if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - result = handle_return(driver.cuKernelGetAttribute(attribute, as_py(kernel._h_kernel), device_id)) + cdef int result + with nogil: + HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(kernel._h_kernel), device_id)) self._cache[cache_key] = result return result @@ -165,49 +175,49 @@ cdef class KernelAttributes: """int : The maximum number of threads per block. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK ) def shared_size_bytes(self, device_id: Device | int = None) -> int: """int : The size in bytes of statically-allocated shared memory required by this function. 
This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES ) def const_size_bytes(self, device_id: Device | int = None) -> int: """int : The size in bytes of user-allocated constant memory required by this function. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES ) def local_size_bytes(self, device_id: Device | int = None) -> int: """int : The size in bytes of local memory used by each thread of this function. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES ) def num_regs(self, device_id: Device | int = None) -> int: """int : The number of registers used by each thread of this function. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_NUM_REGS ) def ptx_version(self, device_id: Device | int = None) -> int: """int : The PTX virtual architecture version for which the function was compiled. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PTX_VERSION + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_PTX_VERSION ) def binary_version(self, device_id: Device | int = None) -> int: """int : The binary architecture version for which the function was compiled. 
This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_BINARY_VERSION + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_BINARY_VERSION ) def cache_mode_ca(self, device_id: Device | int = None) -> bool: @@ -215,7 +225,7 @@ cdef class KernelAttributes: This attribute is read-only.""" return bool( self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA ) ) @@ -223,13 +233,13 @@ cdef class KernelAttributes: """int : The maximum size in bytes of dynamically-allocated shared memory that can be used by this function.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES ) def preferred_shared_memory_carveout(self, device_id: Device | int = None) -> int: """int : The shared memory carveout preference, in percent of the total shared memory.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT ) def cluster_size_must_be_set(self, device_id: Device | int = None) -> bool: @@ -237,26 +247,26 @@ cdef class KernelAttributes: This attribute is read-only.""" return bool( self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET ) ) def required_cluster_width(self, device_id: Device | int = None) -> int: """int : The required cluster width in blocks.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH ) def required_cluster_height(self, device_id: Device | int = None) -> int: """int : The required cluster height in blocks.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT ) def required_cluster_depth(self, device_id: Device | int = None) -> int: """int : The required cluster depth in blocks.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH ) def non_portable_cluster_size_allowed(self, device_id: Device | int = None) -> bool: @@ -264,7 +274,7 @@ cdef class KernelAttributes: return bool( self._get_cached_attribute( self._resolve_device_id(device_id), - driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, + cydriver.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, ) ) @@ -272,7 +282,7 @@ cdef class KernelAttributes: """int : The block scheduling policy of a function.""" return self._get_cached_attribute( self._resolve_device_id(device_id), - driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE, + 
cydriver.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE, ) @@ -318,11 +328,15 @@ cdef class KernelOccupancy: theoretical multiprocessor utilization (occupancy). """ - return handle_return( - driver.cuOccupancyMaxActiveBlocksPerMultiprocessor( - as_py(self._h_kernel), block_size, dynamic_shared_memory_size - ) - ) + cdef int num_blocks + cdef int c_block_size = block_size + cdef size_t c_shmem_size = dynamic_shared_memory_size + cdef cydriver.CUfunction func = as_cu(self._h_kernel) + with nogil: + HANDLE_RETURN(cydriver.cuOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, func, c_block_size, c_shmem_size + )) + return num_blocks def max_potential_block_size( self, dynamic_shared_memory_needed: int | driver.CUoccupancyB2DSize, block_size_limit: int @@ -356,19 +370,23 @@ cdef class KernelOccupancy: Interpreter Lock may lead to deadlocks. """ - cdef object kernel_py = as_py(self._h_kernel) + cdef int min_grid_size, max_block_size + cdef cydriver.CUfunction func = as_cu(self._h_kernel) + cdef cydriver.CUoccupancyB2DSize callback + cdef size_t c_shmem_size + cdef int c_block_size_limit = block_size_limit if isinstance(dynamic_shared_memory_needed, int): - min_grid_size, max_block_size = handle_return( - driver.cuOccupancyMaxPotentialBlockSize( - kernel_py, None, dynamic_shared_memory_needed, block_size_limit - ) - ) + c_shmem_size = dynamic_shared_memory_needed + with nogil: + HANDLE_RETURN(cydriver.cuOccupancyMaxPotentialBlockSize( + &min_grid_size, &max_block_size, func, NULL, c_shmem_size, c_block_size_limit + )) elif isinstance(dynamic_shared_memory_needed, driver.CUoccupancyB2DSize): - min_grid_size, max_block_size = handle_return( - driver.cuOccupancyMaxPotentialBlockSize( - kernel_py, dynamic_shared_memory_needed.getPtr(), 0, block_size_limit - ) - ) + # Callback may require GIL, so don't use nogil here + callback = dynamic_shared_memory_needed.getPtr() + HANDLE_RETURN(cydriver.cuOccupancyMaxPotentialBlockSize( + &min_grid_size, &max_block_size, func, callback, 0, c_block_size_limit + )) else: raise TypeError( "dynamic_shared_memory_needed expected to have type int, or CUoccupancyB2DSize, " @@ -393,11 +411,15 @@ cdef class KernelOccupancy: int Dynamic shared memory available per block for given launch configuration. """ - return handle_return( - driver.cuOccupancyAvailableDynamicSMemPerBlock( - as_py(self._h_kernel), num_blocks_per_multiprocessor, block_size - ) - ) + cdef size_t dynamic_smem_size + cdef int c_num_blocks = num_blocks_per_multiprocessor + cdef int c_block_size = block_size + cdef cydriver.CUfunction func = as_cu(self._h_kernel) + with nogil: + HANDLE_RETURN(cydriver.cuOccupancyAvailableDynamicSMemPerBlock( + &dynamic_smem_size, func, c_num_blocks, c_block_size + )) + return dynamic_smem_size def max_potential_cluster_size(self, config: LaunchConfig, stream: Stream | None = None) -> int: """Maximum potential cluster size. @@ -416,10 +438,16 @@ cdef class KernelOccupancy: int The maximum cluster size that can be launched for this kernel and launch configuration. 
""" - drv_cfg = _to_native_launch_config(config) + cdef cydriver.CUlaunchConfig drv_cfg = (config)._to_native_launch_config() + cdef Stream s if stream is not None: - drv_cfg.hStream = stream.handle - return handle_return(driver.cuOccupancyMaxPotentialClusterSize(as_py(self._h_kernel), drv_cfg)) + s = stream + drv_cfg.hStream = as_cu(s._h_stream) + cdef int cluster_size + cdef cydriver.CUfunction func = as_cu(self._h_kernel) + with nogil: + HANDLE_RETURN(cydriver.cuOccupancyMaxPotentialClusterSize(&cluster_size, func, &drv_cfg)) + return cluster_size def max_active_clusters(self, config: LaunchConfig, stream: Stream | None = None) -> int: """Maximum number of active clusters on the target device. @@ -438,10 +466,16 @@ cdef class KernelOccupancy: int The maximum number of clusters that could co-exist on the target device. """ - drv_cfg = _to_native_launch_config(config) + cdef cydriver.CUlaunchConfig drv_cfg = (config)._to_native_launch_config() + cdef Stream s if stream is not None: - drv_cfg.hStream = stream.handle - return handle_return(driver.cuOccupancyMaxActiveClusters(as_py(self._h_kernel), drv_cfg)) + s = stream + drv_cfg.hStream = as_cu(s._h_stream) + cdef int num_clusters + cdef cydriver.CUfunction func = as_cu(self._h_kernel) + with nogil: + HANDLE_RETURN(cydriver.cuOccupancyMaxActiveClusters(&num_clusters, func, &drv_cfg)) + return num_clusters ParamInfo = namedtuple("ParamInfo", ["offset", "size"]) @@ -484,19 +518,21 @@ cdef class Kernel: "Driver version 12.4 or newer is required for this function. " f"Using driver version {driver_ver // 1000}.{(driver_ver % 1000) // 10}" ) - cdef int arg_pos = 0 + cdef size_t arg_pos = 0 cdef list param_info_data = [] - cdef object kernel_py = as_py(self._h_kernel) + cdef cydriver.CUkernel cu_kernel = as_cu(self._h_kernel) + cdef size_t param_offset, param_size + cdef cydriver.CUresult err while True: - result = driver.cuKernelGetParamInfo(kernel_py, arg_pos) - if result[0] != driver.CUresult.CUDA_SUCCESS: + with nogil: + err = cydriver.cuKernelGetParamInfo(cu_kernel, arg_pos, ¶m_offset, ¶m_size) + if err != cydriver.CUDA_SUCCESS: break if param_info: - p_info = ParamInfo(offset=result[1], size=result[2]) - param_info_data.append(p_info) + param_info_data.append(ParamInfo(offset=param_offset, size=param_size)) arg_pos = arg_pos + 1 - if result[0] != driver.CUresult.CUDA_ERROR_INVALID_VALUE: - handle_return(result) + if err != cydriver.CUDA_ERROR_INVALID_VALUE: + HANDLE_RETURN(err) return arg_pos, param_info_data @property @@ -553,18 +589,18 @@ cdef class Kernel: # Convert the integer handle to CUkernel cdef cydriver.CUkernel cu_kernel = handle cdef KernelHandle h_kernel + cdef cydriver.CUlibrary cu_library + cdef cydriver.CUresult err # If no module provided, create a placeholder and try to get the library if mod is None: mod = ObjectCode._init(b"", "cubin", "", None) if _is_cukernel_get_library_supported(): # Try to get the owning library via cuKernelGetLibrary - try: - cu_library = handle_return(driver.cuKernelGetLibrary(driver.CUkernel(handle))) - mod._h_library = create_library_handle_ref(int(cu_library)) - except Exception: - # Best-effort: don't fail construction if inverse lookup fails - pass + with nogil: + err = cydriver.cuKernelGetLibrary(&cu_library, cu_kernel) + if err == cydriver.CUDA_SUCCESS: + mod._h_library = create_library_handle_ref(cu_library) # Create kernel handle with library dependency h_kernel = create_kernel_handle_ref(cu_kernel, mod._h_library) From eeb84c37ca983e92a80487eaa082bdee4a652b60 Mon Sep 17 00:00:00 2001 
From: Andy Jost Date: Thu, 22 Jan 2026 07:57:46 -0800 Subject: [PATCH 07/10] Fix SEGV in Kernel.from_handle with non-int types Remove type annotation from handle parameter to prevent Cython's automatic float-to-int coercion, which caused a segmentation fault. The manual isinstance check properly validates all non-int types. --- cuda_core/cuda/core/_module.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index a1d3dadb49..dbdfb189bb 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -566,7 +566,7 @@ cdef class Kernel: return as_py(self._h_kernel) @staticmethod - def from_handle(handle: int, mod: ObjectCode = None) -> Kernel: + def from_handle(handle, mod: ObjectCode = None) -> Kernel: """Creates a new :obj:`Kernel` object from a foreign kernel handle. Uses a CUkernel pointer address to create a new :obj:`Kernel` object. From a9c0a37fee65c93c0a5e3b91c9df149223c8b4b0 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 22 Jan 2026 08:26:18 -0800 Subject: [PATCH 08/10] fix: add init_cuda fixture to tests requiring CUDA context Four tests in test_utils.py relied on CuPy implicitly creating a CUDA context but failed when pytest-randomly ordered them after tests using the init_cuda fixture, which pops the context on cleanup. --- cuda_core/tests/test_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index a3c62a6aee..dd9c52e817 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -345,7 +345,7 @@ def _get_ptr(array): for view_as in ["dlpack", "cai"] ], ) -def test_view_sliced_external(shape, slices, stride_order, view_as): +def test_view_sliced_external(init_cuda, shape, slices, stride_order, view_as): if view_as == "dlpack": if np is None: pytest.skip("NumPy is not installed") @@ -380,7 +380,7 @@ def test_view_sliced_external(shape, slices, stride_order, view_as): ("stride_order", "view_as"), [(stride_order, view_as) for stride_order in ["C", "F"] for view_as in ["dlpack", "cai"]], ) -def test_view_sliced_external_negative_offset(stride_order, view_as): +def test_view_sliced_external_negative_offset(init_cuda, stride_order, view_as): shape = (5,) if view_as == "dlpack": if np is None: @@ -422,7 +422,7 @@ def test_view_sliced_external_negative_offset(stride_order, view_as): ) @pytest.mark.parametrize("shape", [(0,), (0, 0), (0, 0, 0)]) @pytest.mark.parametrize("dtype", [np.int64, np.uint8, np.float64]) -def test_view_zero_size_array(api, shape, dtype): +def test_view_zero_size_array(init_cuda, api, shape, dtype): cp = pytest.importorskip("cupy") x = cp.empty(shape, dtype=dtype) @@ -446,7 +446,7 @@ def test_from_buffer_with_non_power_of_two_itemsize(): assert view.dtype == dtype -def test_struct_array(): +def test_struct_array(init_cuda): cp = pytest.importorskip("cupy") x = np.array([(1.0, 2), (2.0, 3)], dtype=[("array1", np.float64), ("array2", np.int64)]) From b262fa0615b312c0c35233b6f7b804f6275ad9ff Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 22 Jan 2026 10:21:02 -0800 Subject: [PATCH 09/10] Refactor ObjectCode._init and add kernel lifetime test - Change ObjectCode._init from cdef to @classmethod def, matching the pattern used by Buffer, Stream, Graph, and other classes - Remove _init_py wrapper (no longer needed) - Update callers in _program.py and _linker.py - Add test_kernel_keeps_library_alive to verify that a Kernel keeps its underlying 
library alive after ObjectCode goes out of scope --- cuda_core/cuda/core/_linker.py | 2 +- cuda_core/cuda/core/_module.pxd | 3 -- cuda_core/cuda/core/_module.pyx | 27 +++++++---------- cuda_core/cuda/core/_program.py | 6 ++-- cuda_core/tests/test_module.py | 53 +++++++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 23 deletions(-) diff --git a/cuda_core/cuda/core/_linker.py b/cuda_core/cuda/core/_linker.py index e621fe2c44..0257655164 100644 --- a/cuda_core/cuda/core/_linker.py +++ b/cuda_core/cuda/core/_linker.py @@ -529,7 +529,7 @@ def link(self, target_type) -> ObjectCode: addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) code = (ctypes.c_char * size).from_address(addr) - return ObjectCode._init_py(bytes(code), target_type, name=self._options.name) + return ObjectCode._init(bytes(code), target_type, name=self._options.name) def get_error_log(self) -> str: """Get the error log generated by the linker. diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index b49f064d52..ac512b39c8 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -33,9 +33,6 @@ cdef class ObjectCode: dict _sym_map str _name - @staticmethod - cdef ObjectCode _init(object module, str code_type, str name=*, dict symbol_mapping=*) - cdef int _lazy_load_module(self) except -1 diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index dbdfb189bb..4c46d79dcb 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -594,7 +594,7 @@ cdef class Kernel: # If no module provided, create a placeholder and try to get the library if mod is None: - mod = ObjectCode._init(b"", "cubin", "", None) + mod = ObjectCode._init(b"", "cubin") if _is_cukernel_get_library_supported(): # Try to get the owning library via cuKernelGetLibrary with nogil: @@ -635,8 +635,8 @@ cdef class ObjectCode: "Please use ObjectCode APIs (from_cubin, from_ptx) or Program APIs (compile)." ) - @staticmethod - cdef ObjectCode _init(object module, str code_type, str name = "", dict symbol_mapping = None): + @classmethod + def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): assert code_type in _supported_code_type, f"{code_type=} is not supported" cdef ObjectCode self = ObjectCode.__new__(ObjectCode) @@ -647,19 +647,14 @@ cdef class ObjectCode: self._code_type = code_type self._module = module self._sym_map = {} if symbol_mapping is None else symbol_mapping - self._name = name + self._name = name if name else "" return self - @classmethod - def _init_py(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): - """Python-accessible factory method for use by _program.py and _linker.py.""" - return ObjectCode._init(module, code_type, name if name else "", symbol_mapping) - @classmethod def _reduce_helper(cls, module, code_type, name, symbol_mapping): # just for forwarding kwargs - return ObjectCode._init(module, code_type, name if name else "", symbol_mapping) + return cls._init(module, code_type, name=name if name else "", symbol_mapping=symbol_mapping) def __reduce__(self): return ObjectCode._reduce_helper, (self._module, self._code_type, self._name, self._sym_map) @@ -680,7 +675,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). 
""" - return ObjectCode._init(module, "cubin", name, symbol_mapping) + return ObjectCode._init(module, "cubin", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_ptx(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -698,7 +693,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "ptx", name, symbol_mapping) + return ObjectCode._init(module, "ptx", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_ltoir(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -716,7 +711,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "ltoir", name, symbol_mapping) + return ObjectCode._init(module, "ltoir", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_fatbin(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -734,7 +729,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "fatbin", name, symbol_mapping) + return ObjectCode._init(module, "fatbin", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_object(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -752,7 +747,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "object", name, symbol_mapping) + return ObjectCode._init(module, "object", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_library(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -770,7 +765,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "library", name, symbol_mapping) + return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping) # TODO: do we want to unload in a finalizer? Probably not.. 
diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py index f903a4e781..121dd13963 100644 --- a/cuda_core/cuda/core/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -690,7 +690,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): elif code_type == "ptx": assert_type(code, str) self._linker = Linker( - ObjectCode._init_py(code.encode(), code_type), options=self._translate_program_options(options) + ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options) ) self._backend = self._linker.backend @@ -808,7 +808,7 @@ def compile(self, target_type, name_expressions=(), logs=None): handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle) logs.write(log.decode("utf-8", errors="backslashreplace")) - return ObjectCode._init_py(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name) + return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name) elif self._backend == "NVVM": if target_type not in ("ptx", "ltoir"): @@ -834,7 +834,7 @@ def compile(self, target_type, name_expressions=(), logs=None): nvvm.get_program_log(self._mnff.handle, log) logs.write(log.decode("utf-8", errors="backslashreplace")) - return ObjectCode._init_py(data, target_type, name=self._options.name) + return ObjectCode._init(data, target_type, name=self._options.name) supported_backends = ("nvJitLink", "driver") if self._backend not in supported_backends: diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index d5a35a1ea5..8b1667d2f7 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -511,3 +511,56 @@ def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): # All should reference the same underlying CUDA kernel handle assert int(kernel1.handle) == int(kernel2.handle) == int(kernel3.handle) == handle + + +def test_kernel_keeps_library_alive(init_cuda): + """Test that a Kernel keeps its underlying library alive after ObjectCode goes out of scope.""" + import gc + + import numpy as np + + def get_kernel_only(): + """Return a kernel, letting ObjectCode go out of scope.""" + code = """ + extern "C" __global__ void write_value(int* out) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + *out = 42; + } + } + """ + program = Program(code, "c++") + object_code = program.compile("cubin") + kernel = object_code.get_kernel("write_value") + # ObjectCode goes out of scope here + return kernel + + kernel = get_kernel_only() + + # Force GC to ensure ObjectCode destructor runs + gc.collect() + + # Kernel should still be valid + assert kernel.handle is not None + assert kernel.num_arguments == 1 + + # Actually launch the kernel to prove library is still loaded + device = Device() + stream = device.create_stream() + + # Allocate pinned host buffer and device buffer + pinned_mr = cuda.core.LegacyPinnedMemoryResource() + host_buf = pinned_mr.allocate(4) # sizeof(int) + result = np.from_dlpack(host_buf).view(np.int32) + result[:] = 0 + + dev_buf = device.memory_resource.allocate(4) + + # Launch kernel + config = cuda.core.LaunchConfig(grid=1, block=1) + cuda.core.launch(stream, config, kernel, dev_buf) + + # Copy result back to host + dev_buf.copy_to(host_buf, stream=stream) + stream.sync() + + assert result[0] == 42, f"Expected 42, got {result[0]}" From 46056e6830563e63454d8fb242dc3e5abbc90149 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 22 Jan 2026 11:20:37 -0800 Subject: [PATCH 10/10] Simplify 
resource handle patterns and clean up tests - Remove Kernel._module (ObjectCode reference no longer needed since KernelHandle keeps library alive via LibraryHandle dependency) - Simplify Kernel._from_obj signature (remove unused ObjectCode param) - Replace weakref patterns with direct handle storage: - KernelAttributes: store KernelHandle instead of weakref to Kernel - _MemPoolAttributes: store MemoryPoolHandle instead of weakref to _MemPool - Rename get_kernel_from_library to create_kernel_handle for consistency - Remove fragile annotation introspection from test_saxpy_arguments - Update test_mempool_attributes_ownership to reflect new ownership semantics --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 2 +- cuda_core/cuda/core/_cpp/resource_handles.hpp | 2 +- cuda_core/cuda/core/_memory/_memory_pool.pxd | 11 ++++++- cuda_core/cuda/core/_memory/_memory_pool.pyx | 21 +++++--------- cuda_core/cuda/core/_module.pxd | 9 +++--- cuda_core/cuda/core/_module.pyx | 29 ++++++++----------- cuda_core/cuda/core/_resource_handles.pxd | 2 +- cuda_core/cuda/core/_resource_handles.pyx | 2 +- cuda_core/tests/test_memory.py | 20 +++---------- cuda_core/tests/test_module.py | 6 ---- 10 files changed, 42 insertions(+), 62 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 7e6e388579..c4bb47261a 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -749,7 +749,7 @@ struct KernelBox { }; } // namespace -KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name) { +KernelHandle create_kernel_handle(LibraryHandle h_library, const char* name) { GILReleaseGuard gil; CUkernel kernel; if (CUDA_SUCCESS != (err = p_cuLibraryGetKernel(&kernel, *h_library, name))) { diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index ba76a6c054..1df181ee56 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -254,7 +254,7 @@ LibraryHandle create_library_handle_ref(CUlibrary library); // The kernel structurally depends on the provided library handle. // Kernels have no explicit destroy - their lifetime is tied to the library. // Returns empty handle on error (caller must check). -KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name); +KernelHandle create_kernel_handle(LibraryHandle h_library, const char* name); // Create a non-owning kernel handle with library dependency. // Use for borrowed kernels. The library handle keeps the library alive. diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pxd b/cuda_core/cuda/core/_memory/_memory_pool.pxd index eaff8e4bab..434e6b07c1 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pxd +++ b/cuda_core/cuda/core/_memory/_memory_pool.pxd @@ -16,7 +16,16 @@ cdef class _MemPool(MemoryResource): IPCDataForMR _ipc_data object _attributes object _peer_accessible_by - object __weakref__ + + +cdef class _MemPoolAttributes: + cdef: + MemoryPoolHandle _h_pool + + @staticmethod + cdef _MemPoolAttributes _init(MemoryPoolHandle h_pool) + + cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except? 
-1 cdef class _MemPoolOptions: diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index 563f556015..b5823048e1 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -30,7 +30,6 @@ from cuda.core._utils.cuda_utils cimport ( from typing import TYPE_CHECKING import platform # no-cython-lint -import weakref from cuda.core._utils.cuda_utils import driver @@ -50,16 +49,15 @@ cdef class _MemPoolOptions: cdef class _MemPoolAttributes: - cdef: - object _mr_weakref + """Provides access to memory pool attributes.""" def __init__(self, *args, **kwargs): raise RuntimeError("_MemPoolAttributes cannot be instantiated directly. Please use MemoryResource APIs.") - @classmethod - def _init(cls, mr): - cdef _MemPoolAttributes self = _MemPoolAttributes.__new__(cls) - self._mr_weakref = mr + @staticmethod + cdef _MemPoolAttributes _init(MemoryPoolHandle h_pool): + cdef _MemPoolAttributes self = _MemPoolAttributes.__new__(_MemPoolAttributes) + self._h_pool = h_pool return self def __repr__(self): @@ -69,12 +67,8 @@ cdef class _MemPoolAttributes: ) cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except?-1: - cdef _MemPool mr = <_MemPool>(self._mr_weakref()) - if mr is None: - raise RuntimeError("_MemPool is expired") - cdef cydriver.CUmemoryPool pool_handle = as_cu(mr._h_pool) with nogil: - HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, value)) + HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(as_cu(self._h_pool), attr_enum, value)) return 0 @property @@ -202,8 +196,7 @@ cdef class _MemPool(MemoryResource): def attributes(self) -> _MemPoolAttributes: """Memory pool attributes.""" if self._attributes is None: - ref = weakref.ref(self) - self._attributes = _MemPoolAttributes._init(ref) + self._attributes = _MemPoolAttributes._init(self._h_pool) return self._attributes @property diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index ac512b39c8..9333703175 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -14,13 +14,11 @@ cdef class KernelAttributes cdef class Kernel: cdef: KernelHandle _h_kernel - ObjectCode _module # ObjectCode reference KernelAttributes _attributes # lazy KernelOccupancy _occupancy # lazy - object __weakref__ # Enable weak references @staticmethod - cdef Kernel _from_obj(KernelHandle h_kernel, ObjectCode mod) + cdef Kernel _from_obj(KernelHandle h_kernel) cdef tuple _get_arguments_info(self, bint param_info=*) @@ -46,8 +44,11 @@ cdef class KernelOccupancy: cdef class KernelAttributes: cdef: - object _kernel_weakref + KernelHandle _h_kernel dict _cache + @staticmethod + cdef KernelAttributes _init(KernelHandle h_kernel) + cdef int _get_cached_attribute(self, int device_id, cydriver.CUfunction_attribute attribute) except? -1 cdef int _resolve_device_id(self, device_id) except? 
-1 diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 4c46d79dcb..49a564b6b9 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -8,7 +8,6 @@ from libc.stddef cimport size_t import functools import threading -import weakref from collections import namedtuple from cuda.core._device import Device @@ -21,7 +20,7 @@ from cuda.core._resource_handles cimport ( create_library_handle_from_file, create_library_handle_from_data, create_library_handle_ref, - get_kernel_from_library, + create_kernel_handle, create_kernel_handle_ref, get_last_error, as_cu, @@ -139,15 +138,15 @@ cdef inline LibraryHandle _make_empty_library_handle(): cdef class KernelAttributes: - """Provides access to kernel attributes. Uses weakref to avoid preventing Kernel GC.""" + """Provides access to kernel attributes.""" def __init__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.") - @classmethod - def _init(cls, kernel): - cdef KernelAttributes self = KernelAttributes.__new__(cls) - self._kernel_weakref = weakref.ref(kernel) + @staticmethod + cdef KernelAttributes _init(KernelHandle h_kernel): + cdef KernelAttributes self = KernelAttributes.__new__(KernelAttributes) + self._h_kernel = h_kernel self._cache = {} _lazy_init() return self @@ -158,12 +157,9 @@ cdef class KernelAttributes: cached = self._cache.get(cache_key, cache_key) if cached is not cache_key: return cached - cdef Kernel kernel = (self._kernel_weakref()) - if kernel is None: - raise RuntimeError("Cannot access kernel attributes for expired Kernel object") cdef int result with nogil: - HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(kernel._h_kernel), device_id)) + HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(self._h_kernel), device_id)) self._cache[cache_key] = result return result @@ -496,10 +492,9 @@ cdef class Kernel: raise RuntimeError("Kernel objects cannot be instantiated directly. 

     @staticmethod
-    cdef Kernel _from_obj(KernelHandle h_kernel, ObjectCode mod):
+    cdef Kernel _from_obj(KernelHandle h_kernel):
         cdef Kernel ker = Kernel.__new__(Kernel)
         ker._h_kernel = h_kernel
-        ker._module = mod
         ker._attributes = None
         ker._occupancy = None
         return ker
@@ -508,7 +503,7 @@ cdef class Kernel:
     def attributes(self) -> KernelAttributes:
         """Get the read-only attributes of this kernel."""
         if self._attributes is None:
-            self._attributes = KernelAttributes._init(self)
+            self._attributes = KernelAttributes._init(self._h_kernel)
         return self._attributes

     cdef tuple _get_arguments_info(self, bint param_info=False):
@@ -607,7 +602,7 @@ cdef class Kernel:
         if not h_kernel:
             HANDLE_RETURN(get_last_error())
-        return Kernel._from_obj(h_kernel, mod)
+        return Kernel._from_obj(h_kernel)


 CodeTypeT = bytes | bytearray | str
@@ -812,10 +807,10 @@ cdef class ObjectCode:
         except KeyError:
             name = name.encode()

-        cdef KernelHandle h_kernel = get_kernel_from_library(self._h_library, name)
+        cdef KernelHandle h_kernel = create_kernel_handle(self._h_library, name)
         if not h_kernel:
             HANDLE_RETURN(get_last_error())
-        return Kernel._from_obj(h_kernel, self)
+        return Kernel._from_obj(h_kernel)

     @property
     def code(self) -> CodeTypeT:
diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd
index 10816481b0..b146d93aa7 100644
--- a/cuda_core/cuda/core/_resource_handles.pxd
+++ b/cuda_core/cuda/core/_resource_handles.pxd
@@ -109,6 +109,6 @@ cdef LibraryHandle create_library_handle_from_data(const void* data) nogil except+
 cdef LibraryHandle create_library_handle_ref(cydriver.CUlibrary library) nogil except+

 # Kernel handles
-cdef KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name) nogil except+
+cdef KernelHandle create_kernel_handle(LibraryHandle h_library, const char* name) nogil except+
 cdef KernelHandle create_kernel_handle_ref(
     cydriver.CUkernel kernel, LibraryHandle h_library) nogil except+
diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx
index a65836b864..022929f7e3 100644
--- a/cuda_core/cuda/core/_resource_handles.pyx
+++ b/cuda_core/cuda/core/_resource_handles.pyx
@@ -102,7 +102,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
         cydriver.CUlibrary library) nogil except+

     # Kernel handles
-    KernelHandle get_kernel_from_library "cuda_core::get_kernel_from_library" (
+    KernelHandle create_kernel_handle "cuda_core::create_kernel_handle" (
         LibraryHandle h_library, const char* name) nogil except+
     KernelHandle create_kernel_handle_ref "cuda_core::create_kernel_handle_ref" (
         cydriver.CUkernel kernel, LibraryHandle h_library) nogil except+
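Both create_kernel_handle() and create_kernel_handle_ref() take the LibraryHandle, so the returned KernelHandle is expected to keep the library alive on its own; that is why Kernel._from_obj no longer stores an ObjectCode back-reference. A rough usage sketch of that guarantee, not part of this patch: ObjectCode.from_cubin(), the public import path, the cubin file, and the kernel name are all assumptions, and num_arguments needs driver >= 12.4 as in the test further below.

# Sketch only, assuming a public ObjectCode export and a pre-built cubin on disk.
import gc

from cuda.core import ObjectCode

cubin_bytes = open("saxpy.cubin", "rb").read()   # hypothetical path
mod = ObjectCode.from_cubin(cubin_bytes)
krn = mod.get_kernel("saxpy")                    # kernel name is assumed

del mod
gc.collect()

# The kernel stays usable: the handle chain, not a Python back-reference to the
# ObjectCode, keeps the CUlibrary loaded.
print(krn.num_arguments)                         # requires driver >= 12.4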
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 8851a4600a..47091995e7 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1162,7 +1162,7 @@ def test_mempool_attributes_repr(memory_resource_factory):


 def test_mempool_attributes_ownership(memory_resource_factory):
-    """Ensure the attributes bundle handles references correctly for all memory resource types."""
+    """Ensure the attributes bundle keeps the pool alive via the handle."""
     MR, MRops = memory_resource_factory

     device = Device()
@@ -1190,21 +1190,9 @@ def test_mempool_attributes_ownership(memory_resource_factory):
     mr.close()
     del mr

-    # After deleting the memory resource, the attributes suite is disconnected.
-    with pytest.raises(RuntimeError, match="is expired"):
-        _ = attributes.used_mem_high
-
-    # Even when a new object is created (we found a case where the same
-    # mempool handle was really reused).
-    if MR is DeviceMemoryResource:
-        mr = MR(device, dict(max_size=POOL_SIZE))  # noqa: F841
-    elif MR is PinnedMemoryResource:
-        mr = MR(dict(max_size=POOL_SIZE))  # noqa: F841
-    elif MR is ManagedMemoryResource:
-        mr = create_managed_memory_resource_or_skip(dict())  # noqa: F841
-
-    with pytest.raises(RuntimeError, match="is expired"):
-        _ = attributes.used_mem_high
+    # The attributes bundle keeps the pool alive via MemoryPoolHandle,
+    # so accessing attributes still works even after the MR is deleted.
+    _ = attributes.used_mem_high  # Should not raise


 # Ensure that memory views dellocate their reference to dlpack tensors
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 8b1667d2f7..72591b54d5 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -229,12 +229,6 @@ def test_saxpy_arguments(get_saxpy_kernel_cubin, cuda12_4_prerequisite_check):
         _ = krn.num_arguments
         return

-    # Check that arguments_info returns ParamInfo objects (works for both Python and Cython classes)
-    # For Python classes: type(krn).arguments_info.fget.__annotations__ contains ParamInfo
-    # For Cython cdef classes: property descriptors don't have .fget, so we check the actual values
-    prop = type(krn).arguments_info
-    if hasattr(prop, "fget") and hasattr(prop.fget, "__annotations__"):
-        assert "ParamInfo" in str(prop.fget.__annotations__)
     arg_info = krn.arguments_info
     n_args = len(arg_info)
     assert n_args == krn.num_arguments
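For context on the deleted assertion: a property on a cdef class is exposed as a getset_descriptor, which has no .fget and therefore no __annotations__ to inspect, so the test now validates the returned values instead. A small pure-Python illustration of the distinction, not part of this patch; the PyKernel class is hypothetical.

# Illustration only: contrasts a pure-Python property with a cdef-class property.
class PyKernel:                            # hypothetical pure-Python stand-in
    @property
    def arguments_info(self) -> "list[ParamInfo]":
        return []

prop = PyKernel.arguments_info
assert hasattr(prop, "fget")               # pure-Python property exposes .fget
assert "ParamInfo" in str(prop.fget.__annotations__)

# On a cdef class, type(krn).arguments_info is a getset_descriptor with no .fget,
# so the annotation-based check cannot work there.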