From 750ca5a7716bfc99e3d31b7c64c8f1add5b18ca2 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 18 Dec 2025 12:16:00 -0800 Subject: [PATCH 01/56] first version --- .../include/cuda/__mdspan/mdspan_to_dlpack.h | 535 ++++++++++++++++++ 1 file changed, 535 insertions(+) create mode 100644 libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h new file mode 100644 index 00000000000..d0f3064d7b6 --- /dev/null +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -0,0 +1,535 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H +#define _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H + +#include + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +# if _CCCL_HAS_EXCEPTIONS() +# include +# endif // _CCCL_HAS_EXCEPTIONS() + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA + +static_assert(DLPACK_MAJOR_VERSION == 1, "DLPACK_MAJOR_VERSION must be 1"); + +template +[[nodiscard]] _CCCL_HOST_API inline ::DLDataType __data_type_to_dlpack() noexcept +{ + if constexpr (::cuda::std::is_same_v<_ElementType, bool>) + { + return ::DLDataType{::kDLBool, 8, 1}; + } + //-------------------------------------------------------------------------------------------------------------------- + // Signed integer types + else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::int8_t>) + { + return ::DLDataType{::kDLInt, 8, 1}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::int16_t>) + { + return ::DLDataType{::kDLInt, 16, 1}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::int32_t>) + { + return ::DLDataType{::kDLInt, 32, 1}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, long>) + { + return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 1}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, long long>) + { + return ::DLDataType{::kDLInt, 64, 1}; + } +# if _CCCL_HAS_INT128() + else if constexpr (::cuda::std::is_same_v<_ElementType, __int128_t>) + { + return ::DLDataType{::kDLInt, 128, 1}; + } +# endif // _CCCL_HAS_INT128() + //-------------------------------------------------------------------------------------------------------------------- + // Unsigned integer types + else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::uint8_t>) + { + return ::DLDataType{::kDLUInt, 8, 1}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::uint16_t>) + { + return ::DLDataType{::kDLUInt, 16, 1}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::uint32_t>) + { + return 
::DLDataType{::kDLUInt, 32, 1}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, unsigned long>) + { + return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 1}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, unsigned long long>) + { + return ::DLDataType{::kDLUInt, 64, 1}; + } +# if _CCCL_HAS_INT128() + else if constexpr (::cuda::std::is_same_v<_ElementType, __uint128_t>) + { + return ::DLDataType{::kDLUInt, 128, 1}; + } +# endif // _CCCL_HAS_INT128() + //-------------------------------------------------------------------------------------------------------------------- + // Floating-point types +# if _CCCL_HAS_NVFP16() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__half>) + { + return ::DLDataType{::kDLFloat, 16, 1}; + } +# endif // _CCCL_HAS_NVFP16() +# if _CCCL_HAS_NVBF16() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_bfloat16>) + { + return ::DLDataType{::kDLBfloat, 16, 1}; + } +# endif // _CCCL_HAS_NVBF16() + else if constexpr (::cuda::std::is_same_v<_ElementType, float>) + { + return ::DLDataType{::kDLFloat, 32, 1}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, double>) + { + return ::DLDataType{::kDLFloat, 64, 1}; + } +# if _CCCL_HAS_FLOAT128() + else if constexpr (::cuda::std::is_same_v<_ElementType, __float128>) + { + return ::DLDataType{::kDLFloat, 128, 1}; + } +# endif // _CCCL_HAS_FLOAT128() + //-------------------------------------------------------------------------------------------------------------------- + // Low-precision Floating-point types +# if _CCCL_HAS_NVFP8_E4M3() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e4m3>) + { + return ::DLDataType{::kDLFloat8_e4m3, 8, 1}; + } +# endif // _CCCL_HAS_NVFP8_E4M3() +# if _CCCL_HAS_NVFP8_E5M2() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e5m2>) + { + return ::DLDataType{::kDLFloat8_e5m2, 8, 1}; + } +# endif // _CCCL_HAS_NVFP8_E5M2() +# if _CCCL_HAS_NVFP8_E8M0() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e8m0>) + { + return ::DLDataType{::kDLFloat8_e8m0fnu, 8, 1}; + } +# endif // _CCCL_HAS_NVFP8_E8M0() +# if _CCCL_HAS_NVFP6_E2M3() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp6_e2m3>) + { + return ::DLDataType{::kDLFloat6_e2m3fn, 6, 1}; + } +# endif // _CCCL_HAS_NVFP6_E2M3() +# if _CCCL_HAS_NVFP6_E3M2() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp6_e3m2>) + { + return ::DLDataType{::kDLFloat6_e3m2fn, 6, 1}; + } +# endif // _CCCL_HAS_NVFP6_E3M2() +# if _CCCL_HAS_NVFP4_E2M1() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp4_e2m1>) + { + return ::DLDataType{::kDLFloat4_e2m1fn, 4, 1}; + } +# endif // _CCCL_HAS_NVFP4_E2M1() + //-------------------------------------------------------------------------------------------------------------------- + // Complex types +# if _CCCL_HAS_NVFP16() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::complex<::__half>>) + { + return ::DLDataType{::kDLComplex, 32, 1}; + } +# endif // _CCCL_HAS_NVFP16() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::complex>) + { + return ::DLDataType{::kDLComplex, 64, 1}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::complex>) + { + return ::DLDataType{::kDLComplex, 128, 1}; + } +# if _CCCL_HAS_FLOAT128() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::complex<__float128>>) + { + return ::DLDataType{::kDLComplex, 256, 1}; + } +# endif // 
_CCCL_HAS_FLOAT128() + //-------------------------------------------------------------------------------------------------------------------- + // Vector types (CUDA built-in vector types) +# if _CCCL_HAS_CTK() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::char2>) + { + return ::DLDataType{::kDLInt, 8, 2}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::char4>) + { + return ::DLDataType{::kDLInt, 8, 4}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::uchar2>) + { + return ::DLDataType{::kDLUInt, 8, 2}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::uchar4>) + { + return ::DLDataType{::kDLUInt, 8, 4}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::short2>) + { + return ::DLDataType{::kDLInt, 16, 2}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::short4>) + { + return ::DLDataType{::kDLInt, 16, 4}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::ushort2>) + { + return ::DLDataType{::kDLUInt, 16, 2}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::ushort4>) + { + return ::DLDataType{::kDLUInt, 16, 4}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::int2>) + { + return ::DLDataType{::kDLInt, 32, 2}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::int4>) + { + return ::DLDataType{::kDLInt, 32, 4}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::uint2>) + { + return ::DLDataType{::kDLUInt, 32, 2}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::uint4>) + { + return ::DLDataType{::kDLUInt, 32, 4}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::long2>) + { + return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 2}; + } +# if _CCCL_CTK_AT_LEAST(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::long4_32a>) + { + return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 4}; + } +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + else if constexpr (::cuda::std::is_same_v<_ElementType, ::long4>) + { + return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 4}; + } +# endif // _CCCL_CTK_BELOW(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong2>) + { + return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 2}; + } +# if _CCCL_CTK_AT_LEAST(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong4_32a>) + { + return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 4}; + } +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong4>) + { + return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 4}; + } +# endif // _CCCL_CTK_BELOW(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::long2>) + { + return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 2}; + } +# if _CCCL_CTK_AT_LEAST(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::long4_32a>) + { + return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 4}; + } +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + else if constexpr (::cuda::std::is_same_v<_ElementType, ::long4>) + { + return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 4}; + } +# endif // _CCCL_CTK_BELOW(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong2>) + { + return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 2}; + } +# if _CCCL_CTK_AT_LEAST(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, 
::ulong4_32a>) + { + return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 4}; + } +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong4>) + { + return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 4}; + } +# endif // _CCCL_CTK_BELOW(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::longlong2>) + { + return ::DLDataType{::kDLInt, 64, 2}; + } +# if _CCCL_CTK_AT_LEAST(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::longlong4_32a>) + { + return ::DLDataType{::kDLInt, 64, 4}; + } +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + else if constexpr (::cuda::std::is_same_v<_ElementType, ::longlong4>) + { + return ::DLDataType{::kDLInt, 64, 4}; + } +# endif // _CCCL_CTK_BELOW(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulonglong2>) + { + return ::DLDataType{::kDLUInt, 64, 2}; + } +# if _CCCL_CTK_AT_LEAST(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulonglong4_32a>) + { + return ::DLDataType{::kDLUInt, 64, 4}; + } +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulonglong4>) + { + return ::DLDataType{::kDLUInt, 64, 4}; + } +# endif // _CCCL_CTK_BELOW(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::float2>) + { + return ::DLDataType{::kDLFloat, 32, 2}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::float4>) + { + return ::DLDataType{::kDLFloat, 32, 4}; + } + else if constexpr (::cuda::std::is_same_v<_ElementType, ::double2>) + { + return ::DLDataType{::kDLFloat, 64, 2}; + } +# if _CCCL_CTK_AT_LEAST(13, 0) + else if constexpr (::cuda::std::is_same_v<_ElementType, ::double4_32a>) + { + return ::DLDataType{::kDLFloat, 64, 4}; + } +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + else if constexpr (::cuda::std::is_same_v<_ElementType, ::double4>) + { + return ::DLDataType{::kDLFloat, 64, 4}; + } +# endif // _CCCL_CTK_BELOW(13, 0) +# endif // _CCCL_HAS_CTK() + //-------------------------------------------------------------------------------------------------------------------- + // Unsupported types + else + { + static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported type"); + } +} + +template <::cuda::std::size_t _Rank> +class DLPackWrapper +{ + ::cuda::std::array<::cuda::std::int64_t, _Rank> __shape{}; + ::cuda::std::array<::cuda::std::int64_t, _Rank> __strides{}; + ::DLTensor __tensor{}; + + _CCCL_HOST_API void __update_tensor() noexcept + { + __tensor.shape = _Rank > 0 ? __shape.data() : nullptr; + __tensor.strides = _Rank > 0 ? 
__strides.data() : nullptr; + } + +public: + _CCCL_HOST_API explicit DLPackWrapper() noexcept + { + __update_tensor(); + } + + _CCCL_HOST_API DLPackWrapper(const DLPackWrapper& __other) noexcept + : __shape{__other.__shape} + , __strides{__other.__strides} + , __tensor{__other.__tensor} + { + __update_tensor(); + } + + _CCCL_HOST_API DLPackWrapper(DLPackWrapper&& __other) noexcept + : __shape{::cuda::std::move(__other.__shape)} + , __strides{::cuda::std::move(__other.__strides)} + , __tensor{__other.__tensor} + { + __other.__tensor = ::DLTensor{}; + __update_tensor(); + } + + _CCCL_HOST_API DLPackWrapper& operator=(const DLPackWrapper& __other) noexcept + { + if (this == &__other) + { + return *this; + } + __shape = __other.__shape; + __strides = __other.__strides; + __tensor = __other.__tensor; + __update_tensor(); + return *this; + } + + _CCCL_HOST_API DLPackWrapper& operator=(DLPackWrapper&& __other) noexcept + { + if (this == &__other) + { + return *this; + } + __shape = ::cuda::std::move(__other.__shape); + __strides = ::cuda::std::move(__other.__strides); + __tensor = __other.__tensor; + __other.__tensor = ::DLTensor{}; + __update_tensor(); + return *this; + } + + _CCCL_HIDE_FROM_ABI ~DLPackWrapper() noexcept = default; + + _CCCL_HOST_API ::DLTensor* operator->() noexcept + { + return &__tensor; + } + + _CCCL_HOST_API const ::DLTensor* operator->() const noexcept + { + return &__tensor; + } + + _CCCL_HOST_API ::DLTensor& get() noexcept + { + return __tensor; + } + + _CCCL_HOST_API const ::DLTensor& get() const noexcept + { + return __tensor; + } +}; + +template +[[nodiscard]] _CCCL_HOST_API DLPackWrapper<_Extents::rank()> __mdspan_to_dlpack( + const ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, + ::DLDeviceType __device_type, + int __device_id) +{ + static_assert(::cuda::std::is_pointer_v, "data_handle_type must be a pointer"); + using __element_type = ::cuda::std::remove_cv_t<_ElementType>; + DLPackWrapper<_Extents::rank()> __wrapper{}; + auto& __tensor = __wrapper.get(); + __tensor.data = __mdspan.size() > 0 ? 
const_cast<__element_type*>(__mdspan.data_handle()) : nullptr; + __tensor.device = ::DLDevice{__device_type, __device_id}; + __tensor.ndim = __mdspan.rank(); + __tensor.dtype = ::cuda::__data_type_to_dlpack<::cuda::std::remove_cv_t<_ElementType>>(); + if constexpr (_Extents::rank() > 0) + { + constexpr auto __max_extent = ::cuda::std::numeric_limits<::cuda::std::int64_t>::max(); + for (::cuda::std::size_t __i = 0; __i < __mdspan.rank(); ++__i) + { + if (::cuda::std::cmp_greater(__mdspan.extent(__i), __max_extent)) + { + _CCCL_THROW(::std::invalid_argument{"Extent is too large"}); + } + if (::cuda::std::cmp_greater(__mdspan.stride(__i), __max_extent)) + { + _CCCL_THROW(::std::invalid_argument{"Stride is too large"}); + } + __tensor.shape[__i] = static_cast<::cuda::std::int64_t>(__mdspan.extent(__i)); + __tensor.strides[__i] = static_cast<::cuda::std::int64_t>(__mdspan.stride(__i)); + } + } + __tensor.byte_offset = 0; + return __wrapper; +} + +/*********************************************************************************************************************** + * Public API + **********************************************************************************************************************/ + +template +[[nodiscard]] _CCCL_HOST_API DLPackWrapper<_Extents::rank()> +mdspan_to_dlpack(const ::cuda::host_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) +{ + using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; + return ::cuda::__mdspan_to_dlpack(__mdspan_type{__mdspan}, ::kDLCPU, 0); +} + +template +[[nodiscard]] _CCCL_HOST_API DLPackWrapper<_Extents::rank()> +mdspan_to_dlpack(const ::cuda::device_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, + ::cuda::device_ref __device = ::cuda::device_ref{0}) +{ + using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; + return ::cuda::__mdspan_to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDA, __device.get()); +} + +template +[[nodiscard]] _CCCL_HOST_API DLPackWrapper<_Extents::rank()> +mdspan_to_dlpack(const ::cuda::managed_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) +{ + using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; + return ::cuda::__mdspan_to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDAManaged, 0); +} + +_CCCL_END_NAMESPACE_CUDA + +# include + +#endif // !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() +#endif // _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H From f040c101b73eab17f02022b4eb61965c5e97a70a Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 18 Dec 2025 16:50:02 -0800 Subject: [PATCH 02/56] add unit test --- .../include/cuda/__mdspan/mdspan_to_dlpack.h | 55 +-- libcudacxx/include/cuda/mdspan | 1 + .../mdspan_to_dlpack.pass.cpp | 408 ++++++++++++++++++ .../mdspan_to_dlpack.wrapper.pass.cpp | 209 +++++++++ 4 files changed, 626 insertions(+), 47 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h index d0f3064d7b6..30138cbaee6 100644 --- a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -12,8 +12,6 @@ #include -#include - #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif 
defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -24,8 +22,8 @@ #if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() -# include # include +# include # include # include # include @@ -41,12 +39,10 @@ # include # include -# include - -# if _CCCL_HAS_EXCEPTIONS() -# include -# endif // _CCCL_HAS_EXCEPTIONS() +# include +# include +// # include _CCCL_BEGIN_NAMESPACE_CUDA @@ -149,7 +145,7 @@ template # if _CCCL_HAS_NVFP8_E4M3() else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e4m3>) { - return ::DLDataType{::kDLFloat8_e4m3, 8, 1}; + return ::DLDataType{::kDLFloat8_e4m3fn, 8, 1}; } # endif // _CCCL_HAS_NVFP8_E4M3() # if _CCCL_HAS_NVFP8_E5M2() @@ -198,12 +194,7 @@ template { return ::DLDataType{::kDLComplex, 128, 1}; } -# if _CCCL_HAS_FLOAT128() - else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::complex<__float128>>) - { - return ::DLDataType{::kDLComplex, 256, 1}; - } -# endif // _CCCL_HAS_FLOAT128() + // 256-bit data types are not supported in DLPack, e.g. cuda::std::complex<__float128> //-------------------------------------------------------------------------------------------------------------------- // Vector types (CUDA built-in vector types) # if _CCCL_HAS_CTK() @@ -284,36 +275,6 @@ template { return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 4}; } -# endif // _CCCL_CTK_BELOW(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::long2>) - { - return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 2}; - } -# if _CCCL_CTK_AT_LEAST(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::long4_32a>) - { - return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 4}; - } -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv - else if constexpr (::cuda::std::is_same_v<_ElementType, ::long4>) - { - return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 4}; - } -# endif // _CCCL_CTK_BELOW(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong2>) - { - return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 2}; - } -# if _CCCL_CTK_AT_LEAST(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong4_32a>) - { - return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 4}; - } -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong4>) - { - return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 4}; - } # endif // _CCCL_CTK_BELOW(13, 0) else if constexpr (::cuda::std::is_same_v<_ElementType, ::longlong2>) { @@ -452,12 +413,12 @@ class DLPackWrapper return &__tensor; } - _CCCL_HOST_API ::DLTensor& get() noexcept + [[nodiscard]] _CCCL_HOST_API ::DLTensor& get() noexcept { return __tensor; } - _CCCL_HOST_API const ::DLTensor& get() const noexcept + [[nodiscard]] _CCCL_HOST_API const ::DLTensor& get() const noexcept { return __tensor; } diff --git a/libcudacxx/include/cuda/mdspan b/libcudacxx/include/cuda/mdspan index ae81a30219a..3129198d02a 100644 --- a/libcudacxx/include/cuda/mdspan +++ b/libcudacxx/include/cuda/mdspan @@ -22,6 +22,7 @@ #endif // no system header #include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp new file mode 100644 index 00000000000..6139d99817c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp @@ 
-0,0 +1,408 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +void check_datatype(const DLDataType& dt, uint8_t code, uint8_t bits, uint16_t lanes) +{ + assert(dt.code == code); + assert(dt.bits == bits); + assert(dt.lanes == lanes); +} + +bool test_mdspan_to_dlpack_host_layout_right() +{ + using extents_t = cuda::std::extents; + int data[6] = {0, 1, 2, 3, 4, 5}; + cuda::host_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + + assert(dlpack_wrapper->device.device_type == kDLCPU); + assert(dlpack_wrapper->device.device_id == 0); + assert(dlpack_wrapper->ndim == 2); + check_datatype(dlpack_wrapper->dtype, kDLInt, 32, 1); + assert(dlpack_wrapper->shape != nullptr); + assert(dlpack_wrapper->strides != nullptr); + assert(dlpack_wrapper->shape[0] == 2); + assert(dlpack_wrapper->shape[1] == 3); + assert(dlpack_wrapper->strides[0] == 3); + assert(dlpack_wrapper->strides[1] == 1); + assert(dlpack_wrapper->byte_offset == 0); + assert(dlpack_wrapper->data == data); + return true; +} + +bool test_mdspan_to_dlpack_host_layout_left() +{ + using extents_t = cuda::std::extents; + int data[6] = {0, 1, 2, 3, 4, 5}; + cuda::host_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + + assert(dlpack_wrapper->device.device_type == kDLCPU); + assert(dlpack_wrapper->device.device_id == 0); + check_datatype(dlpack_wrapper->dtype, kDLInt, 32, 1); + assert(dlpack_wrapper->ndim == 2); + assert(dlpack_wrapper->shape != nullptr); + assert(dlpack_wrapper->strides != nullptr); + assert(dlpack_wrapper->shape[0] == 2); + assert(dlpack_wrapper->shape[1] == 3); + assert(dlpack_wrapper->strides[0] == 1); + assert(dlpack_wrapper->strides[1] == 2); + assert(dlpack_wrapper->byte_offset == 0); + assert(dlpack_wrapper->data == data); + return true; +} + +bool test_mdspan_to_dlpack_empty_size() +{ + using extents_t = cuda::std::dims<2>; + int data[1] = {42}; + cuda::host_mdspan m{data, extents_t{0, 3}}; + auto dlpack_wrapper = cuda::mdspan_to_dlpack(m); + + assert(dlpack_wrapper->device.device_type == kDLCPU); + assert(dlpack_wrapper->device.device_id == 0); + check_datatype(dlpack_wrapper->dtype, kDLInt, 32, 1); + assert(dlpack_wrapper->ndim == 2); + assert(dlpack_wrapper->shape[0] == 0); + assert(dlpack_wrapper->shape[1] == 3); + assert(dlpack_wrapper->strides[0] == 3); + assert(dlpack_wrapper->strides[1] == 1); + assert(dlpack_wrapper->byte_offset == 0); + assert(dlpack_wrapper->data == nullptr); // size() == 0 => nullptr + return true; +} + +bool test_mdspan_to_dlpack_rank_0() +{ + using extents_t = cuda::std::extents; + int data[1] = {7}; + cuda::host_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + + assert(dlpack_wrapper->device.device_type == kDLCPU); + assert(dlpack_wrapper->device.device_id == 0); + check_datatype(dlpack_wrapper->dtype, kDLInt, 32, 1); + assert(dlpack_wrapper->ndim == 0); + assert(dlpack_wrapper->shape == nullptr); + assert(dlpack_wrapper->strides == nullptr); + 
assert(dlpack_wrapper->byte_offset == 0); + assert(dlpack_wrapper->data == data); // rank-0 mdspan has size() == 1 + return true; +} + +bool test_mdspan_to_dlpack_const_pointer() +{ + using extents_t = cuda::std::dims<3>; + const int data[6] = {0, 1, 2, 3, 4, 5}; + cuda::host_mdspan md{data, extents_t{2, 3, 4}}; + auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + + assert(dlpack_wrapper->device.device_type == kDLCPU); + assert(dlpack_wrapper->device.device_id == 0); + check_datatype(dlpack_wrapper->dtype, kDLInt, 32, 1); + assert(dlpack_wrapper->ndim == 3); + assert(dlpack_wrapper->shape[0] == 2); + assert(dlpack_wrapper->shape[1] == 3); + assert(dlpack_wrapper->shape[2] == 4); + assert(dlpack_wrapper->strides[0] == 12); + assert(dlpack_wrapper->strides[1] == 4); + assert(dlpack_wrapper->strides[2] == 1); + assert(dlpack_wrapper->byte_offset == 0); + assert(dlpack_wrapper->data == data); // rank-0 mdspan has size() == 1 + return true; +} + +bool test_mdspan_to_dlpack_device() +{ + using extents_t = cuda::std::extents; + float* data = nullptr; + assert(cudaMalloc(&data, 6 * sizeof(float)) == cudaSuccess); + cuda::device_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::mdspan_to_dlpack(md, cuda::device_ref{0}); + + assert(dlpack_wrapper->device.device_type == kDLCUDA); + assert(dlpack_wrapper->device.device_id == 0); + assert(dlpack_wrapper->ndim == 2); + check_datatype(dlpack_wrapper->dtype, kDLFloat, 32, 1); + assert(dlpack_wrapper->shape[0] == 2); + assert(dlpack_wrapper->shape[1] == 3); + assert(dlpack_wrapper->strides[0] == 3); + assert(dlpack_wrapper->strides[1] == 1); + assert(dlpack_wrapper->byte_offset == 0); + assert(dlpack_wrapper->data == data); + return true; +} + +bool test_mdspan_to_dlpack_managed() +{ + using extents_t = cuda::std::extents; + float* data = nullptr; + assert(cudaMallocManaged(&data, 6 * sizeof(float)) == cudaSuccess); + cuda::managed_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + + assert(dlpack_wrapper->device.device_type == kDLCUDAManaged); + assert(dlpack_wrapper->device.device_id == 0); + assert(dlpack_wrapper->ndim == 2); + check_datatype(dlpack_wrapper->dtype, kDLFloat, 32, 1); + assert(dlpack_wrapper->shape[0] == 2); + assert(dlpack_wrapper->shape[1] == 3); + assert(dlpack_wrapper->strides[0] == 3); + assert(dlpack_wrapper->strides[1] == 1); + assert(dlpack_wrapper->byte_offset == 0); + assert(dlpack_wrapper->data == data); + return true; +} + +template +struct test_mdspan_to_dlpack_types_fn +{ + using list_t = ListT; + + cuda::std::array expected_types; + + template + void call_impl() const + { + using T = cuda::std::__type_at_c; + using extents_t = cuda::std::extents; + T* data = nullptr; + cuda::host_mdspan md{data, extents_t{}}; + auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + + auto type = expected_types[index]; + check_datatype(dlpack_wrapper->dtype, type.code, type.bits, type.lanes); + } + + template + void call(cuda::std::index_sequence) const + { + (call_impl(), ...); + } +}; + +bool test_mdspan_to_dlpack_types() +{ + using list_t = cuda::std::__type_list< + bool, + signed char, + short, + int, + long, + long long, +#if _CCCL_HAS_INT128() + __int128_t, +#endif + // Unsigned integer types + unsigned char, + unsigned short, + unsigned int, + unsigned long, + unsigned long long, +#if _CCCL_HAS_INT128() + __uint128_t, +#endif + // Floating-point types + float, + double, +#if _CCCL_HAS_NVFP16() + ::__half, +#endif +#if _CCCL_HAS_NVBF16() + ::__nv_bfloat16, +#endif +#if _CCCL_HAS_FLOAT128() + 
__float128, +#endif + // Low-precision floating-point types +#if _CCCL_HAS_NVFP8_E4M3() + ::__nv_fp8_e4m3, +#endif +#if _CCCL_HAS_NVFP8_E5M2() + ::__nv_fp8_e5m2, +#endif +#if _CCCL_HAS_NVFP8_E8M0() + ::__nv_fp8_e8m0, +#endif +#if _CCCL_HAS_NVFP6_E2M3() + ::__nv_fp6_e2m3, +#endif +#if _CCCL_HAS_NVFP6_E3M2() + ::__nv_fp6_e3m2, +#endif +#if _CCCL_HAS_NVFP4_E2M1() + ::__nv_fp4_e2m1, +#endif + // Complex types +#if _CCCL_HAS_NVFP16() + cuda::std::complex<::__half>, +#endif + cuda::std::complex, + cuda::std::complex, + // Vector types (CUDA built-in vector types) +#if _CCCL_HAS_CTK() + ::char2, + ::char4, + ::uchar2, + ::uchar4, + ::short2, + ::short4, + ::ushort2, + ::ushort4, + ::int2, + ::int4, + ::uint2, + ::uint4, + ::long2, +# if _CCCL_CTK_AT_LEAST(13, 0) + ::long4_32a, +# else + ::long4, +# endif + ::ulong2, +# if _CCCL_CTK_AT_LEAST(13, 0) + ::ulong4_32a, +# else + ::ulong4, +# endif + ::longlong2, +# if _CCCL_CTK_AT_LEAST(13, 0) + ::longlong4_32a, +# else + ::longlong4, +# endif + ::ulonglong2, +# if _CCCL_CTK_AT_LEAST(13, 0) + ::ulonglong4_32a, +# else + ::ulonglong4, +# endif + ::float2, + ::float4, + ::double2, +# if _CCCL_CTK_AT_LEAST(13, 0) + ::double4_32a +# else + ::double4 +# endif +#endif // _CCCL_HAS_CTK() + >; + cuda::std::array expected_types = { + DLDataType{kDLBool, 8, 1}, + // Signed integer types + DLDataType{kDLInt, 8, 1}, + DLDataType{kDLInt, 16, 1}, + DLDataType{kDLInt, 32, 1}, + + DLDataType{kDLInt, sizeof(long) * 8, 1}, + DLDataType{kDLInt, 64, 1}, +#if _CCCL_HAS_INT128() + DLDataType{kDLInt, 128, 1}, +#endif + // Unsigned integer types + DLDataType{kDLUInt, 8, 1}, + DLDataType{kDLUInt, 16, 1}, + DLDataType{kDLUInt, 32, 1}, + DLDataType{kDLUInt, sizeof(unsigned long) * 8, 1}, + DLDataType{kDLUInt, 64, 1}, +#if _CCCL_HAS_INT128() + DLDataType{kDLUInt, 128, 1}, +#endif + // Floating-point types + DLDataType{kDLFloat, 32, 1}, + DLDataType{kDLFloat, 64, 1}, +#if _CCCL_HAS_NVFP16() + DLDataType{kDLFloat, 16, 1}, +#endif +#if _CCCL_HAS_NVBF16() + DLDataType{kDLBfloat, 16, 1}, +#endif +#if _CCCL_HAS_FLOAT128() + DLDataType{kDLFloat, 128, 1}, +#endif + // Low-precision floating-point types +#if _CCCL_HAS_NVFP8_E4M3() + DLDataType{kDLFloat8_e4m3fn, 8, 1}, +#endif +#if _CCCL_HAS_NVFP8_E5M2() + DLDataType{kDLFloat8_e5m2, 8, 1}, +#endif +#if _CCCL_HAS_NVFP8_E8M0() + DLDataType{kDLFloat8_e8m0fnu, 8, 1}, +#endif +#if _CCCL_HAS_NVFP6_E2M3() + DLDataType{kDLFloat6_e2m3fn, 6, 1}, +#endif +#if _CCCL_HAS_NVFP6_E3M2() + DLDataType{kDLFloat6_e3m2fn, 6, 1}, +#endif +#if _CCCL_HAS_NVFP4_E2M1() + DLDataType{kDLFloat4_e2m1fn, 4, 1}, +#endif + // Complex types +#if _CCCL_HAS_NVFP16() + DLDataType{kDLComplex, 32, 1}, +#endif + DLDataType{kDLComplex, 64, 1}, + DLDataType{kDLComplex, 128, 1}, + // Vector types (CUDA built-in vector types) +#if _CCCL_HAS_CTK() + DLDataType{kDLInt, 8, 2}, + DLDataType{kDLInt, 8, 4}, + DLDataType{kDLUInt, 8, 2}, + DLDataType{kDLUInt, 8, 4}, + DLDataType{kDLInt, 16, 2}, + DLDataType{kDLInt, 16, 4}, + DLDataType{kDLUInt, 16, 2}, + DLDataType{kDLUInt, 16, 4}, + DLDataType{kDLInt, 32, 2}, + DLDataType{kDLInt, 32, 4}, + DLDataType{kDLUInt, 32, 2}, + DLDataType{kDLUInt, 32, 4}, + DLDataType{kDLInt, sizeof(long) * 8, 2}, + DLDataType{kDLInt, sizeof(long) * 8, 4}, + DLDataType{kDLUInt, sizeof(unsigned long) * 8, 2}, + DLDataType{kDLUInt, sizeof(unsigned long) * 8, 4}, + DLDataType{kDLInt, 64, 2}, + DLDataType{kDLInt, 64, 4}, + DLDataType{kDLUInt, 64, 2}, + DLDataType{kDLUInt, 64, 4}, + DLDataType{kDLFloat, 32, 2}, + DLDataType{kDLFloat, 32, 4}, + DLDataType{kDLFloat, 64, 
2}, + DLDataType{kDLFloat, 64, 4}, +#endif // _CCCL_HAS_CTK() + }; + test_mdspan_to_dlpack_types_fn test_fn{expected_types}; + test_fn.call(cuda::std::make_index_sequence{}); + return true; +} + +int main(int, char**) +{ + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_host_layout_right());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_host_layout_left());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_empty_size());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_rank_0());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_const_pointer());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_device());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_managed());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_types());)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp new file mode 100644 index 00000000000..0d17e582fe5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp @@ -0,0 +1,209 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +void check_datatype(const DLDataType& dt, uint8_t code, uint8_t bits, uint16_t lanes) +{ + assert(dt.code == code); + assert(dt.bits == bits); + assert(dt.lanes == lanes); +} + +bool test_mdspan_to_dlpack_wrapper_default_ctor() +{ + cuda::DLPackWrapper<3> dlpack_wrapper{}; + DLDataType default_dtype = {}; + DLDevice default_device = {}; + auto& tensor = dlpack_wrapper.get(); + assert(tensor.device.device_type == default_device.device_type); + assert(tensor.device.device_id == default_device.device_id); + check_datatype(tensor.dtype, default_dtype.code, default_dtype.bits, default_dtype.lanes); + assert(tensor.shape != nullptr); + assert(tensor.strides != nullptr); + return true; +} + +bool test_dlpack_wrapper_copy_ctor() +{ + using extents_t = cuda::std::extents; + int data[6] = {0, 1, 2, 3, 4, 5}; + cuda::host_mdspan md{data, extents_t{}}; + auto w = cuda::mdspan_to_dlpack(md); + auto* shape_ptr = w->shape; + auto* strides_ptr = w->strides; + + auto w2 = w; // copy construct + // Copy must not alias the source wrapper's shape/stride storage. + assert(w2->shape != nullptr); + assert(w2->strides != nullptr); + assert(w2->shape != shape_ptr); + assert(w2->strides != strides_ptr); + + // Source wrapper must remain intact. + assert(w->shape == shape_ptr); + assert(w->strides == strides_ptr); + + // Sanity-check copied tensor metadata and values. 
+ assert(w2->device.device_type == kDLCPU); + assert(w2->device.device_id == 0); + assert(w2->ndim == 2); + check_datatype(w2->dtype, kDLInt, 32, 1); + assert(w2->shape[0] == 2); + assert(w2->shape[1] == 3); + assert(w2->strides[0] == 3); + assert(w2->strides[1] == 1); + assert(w2->byte_offset == 0); + assert(w2->data == data); + return true; +} + +bool test_dlpack_wrapper_move_ctor() +{ + using extents_t = cuda::std::extents; + int data[6] = {0, 1, 2, 3, 4, 5}; + cuda::host_mdspan md{data, extents_t{}}; + auto w = cuda::mdspan_to_dlpack(md); + auto* shape_ptr = w->shape; + auto* strides_ptr = w->strides; + auto moved = cuda::std::move(w); // move construct + + // Moved-to wrapper must not keep pointers to moved-from storage. + assert(moved->shape != nullptr); + assert(moved->strides != nullptr); + assert(moved->shape != shape_ptr); + assert(moved->strides != strides_ptr); + + // Moved-from wrapper is explicitly reset to a default/empty DLTensor. + assert(w->shape == nullptr); + assert(w->strides == nullptr); + assert(w->data == nullptr); + assert(w->ndim == 0); + + // Sanity-check moved-to tensor metadata and values. + assert(moved->device.device_type == kDLCPU); + assert(moved->device.device_id == 0); + assert(moved->ndim == 2); + check_datatype(moved->dtype, kDLInt, 32, 1); + assert(moved->shape[0] == 2); + assert(moved->shape[1] == 3); + assert(moved->strides[0] == 3); + assert(moved->strides[1] == 1); + assert(moved->byte_offset == 0); + assert(moved->data == data); + return true; +} + +bool test_dlpack_wrapper_copy_assignment() +{ + using extents_t = cuda::std::extents; + int data_a[6] = {0, 1, 2, 3, 4, 5}; + int data_b[6] = {6, 7, 8, 9, 10, 11}; + cuda::host_mdspan md_a{data_a, extents_t{}}; + cuda::host_mdspan md_b{data_b, extents_t{}}; + auto a = cuda::mdspan_to_dlpack(md_a); + auto b = cuda::mdspan_to_dlpack(md_b); + auto* b_shape_ptr = b->shape; + auto* b_strides_ptr = b->strides; + + b = a; // copy assign + // Destination must keep pointing to its own member arrays (not to `a`). + assert(b->shape == b_shape_ptr); + assert(b->strides == b_strides_ptr); + assert(b->shape != a->shape); + assert(b->strides != a->strides); + + // Values must be copied correctly. + assert(b->data == data_a); + assert(b->ndim == 2); + assert(b->shape[0] == 2); + assert(b->shape[1] == 3); + assert(b->strides[0] == 3); + assert(b->strides[1] == 1); + return true; +} + +bool test_dlpack_wrapper_move_assignment() +{ + using extents_t = cuda::std::extents; + int data_a[6] = {0, 1, 2, 3, 4, 5}; + int data_b[6] = {6, 7, 8, 9, 10, 11}; + cuda::host_mdspan md_a{data_a, extents_t{}}; + cuda::host_mdspan md_b{data_b, extents_t{}}; + auto a = cuda::mdspan_to_dlpack(md_a); + auto b = cuda::mdspan_to_dlpack(md_b); + auto* a_shape_ptr = a->shape; + auto* a_strides_ptr = a->strides; + auto* b_shape_ptr = b->shape; + auto* b_strides_ptr = b->strides; + + b = cuda::std::move(a); // move assign + // Destination must keep pointing to its own member arrays, not the source's. + assert(b->shape == b_shape_ptr); + assert(b->strides == b_strides_ptr); + assert(b->shape != a_shape_ptr); + assert(b->strides != a_strides_ptr); + + // Source must be reset. + assert(a->shape == nullptr); + assert(a->strides == nullptr); + assert(a->data == nullptr); + assert(a->ndim == 0); + + // Values must be moved correctly. 
+  assert(b->data == data_a);
+  assert(b->ndim == 2);
+  assert(b->shape[0] == 2);
+  assert(b->shape[1] == 3);
+  assert(b->strides[0] == 3);
+  assert(b->strides[1] == 1);
+  return true;
+}
+
+bool test_dlpack_wrapper_get()
+{
+  using wrapper_t = cuda::DLPackWrapper<2>;
+  static_assert(cuda::std::is_same_v<decltype(cuda::std::declval<wrapper_t&>().get()), ::DLTensor&>);
+  static_assert(cuda::std::is_same_v<decltype(cuda::std::declval<const wrapper_t&>().get()), const ::DLTensor&>);
+
+  wrapper_t w{};
+  // `get()` must return a reference to the same underlying `DLTensor` as `operator->()`.
+  assert(&w.get() == w.operator->());
+
+  // Mutating through the reference returned by `get()` must be observable through `operator->()`.
+  auto& t = w.get();
+  t.ndim = 123;
+  assert(w->ndim == 123);
+
+  // Const overload should also alias the same underlying object.
+  const wrapper_t& cw = w;
+  assert(&cw.get() == cw.operator->());
+  return true;
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(
+    NV_IS_HOST,
+    (assert(test_mdspan_to_dlpack_wrapper_default_ctor()); assert(test_dlpack_wrapper_copy_ctor());
+     assert(test_dlpack_wrapper_move_ctor());
+     assert(test_dlpack_wrapper_copy_assignment());
+     assert(test_dlpack_wrapper_move_assignment());
+     assert(test_dlpack_wrapper_get());))
+  return 0;
+}

From 464ccc28b05d4d9b212f713a5f15de1aa09faffd Mon Sep 17 00:00:00 2001
From: fbusato
Date: Fri, 19 Dec 2025 08:59:51 -0800
Subject: [PATCH 03/56] documentation

---
 docs/libcudacxx/extended_api/mdspan.rst       |   6 +
 .../extended_api/mdspan/mdspan_to_dlpack.rst  | 136 ++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst

diff --git a/docs/libcudacxx/extended_api/mdspan.rst b/docs/libcudacxx/extended_api/mdspan.rst
index 028ac918aec..ca0582fa0d5 100644
--- a/docs/libcudacxx/extended_api/mdspan.rst
+++ b/docs/libcudacxx/extended_api/mdspan.rst
@@ -10,6 +10,7 @@ Mdspan
    mdspan/host_device_accessor
    mdspan/restrict_accessor
    mdspan/shared_memory_accessor
+   mdspan/mdspan_to_dlpack

 .. list-table::
    :widths: 25 45 30 30
@@ -34,3 +35,8 @@ Mdspan
      - ``mdspan`` and accessor for CUDA shared memory
      - CCCL 3.2.0
      - CUDA 13.2
+
+   * - :ref:`mdspan to dlpack <libcudacxx-extended-api-mdspan-mdspan-to-dlpack>`
+     - Convert an ``mdspan`` to a ``DLTensor``
+     - CCCL 3.2.0
+     - CUDA 13.2

diff --git a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst
new file mode 100644
index 00000000000..d758ff84f12
--- /dev/null
+++ b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst
@@ -0,0 +1,136 @@
+.. _libcudacxx-extended-api-mdspan-mdspan-to-dlpack:
+
+``mdspan`` to DLPack
+====================
+
+This functionality provides a conversion from ``cuda::host_mdspan``, ``cuda::device_mdspan``, and ``cuda::managed_mdspan`` to a `DLPack <https://github.com/dmlc/dlpack>`__ ``DLTensor`` view.
+
+Defined in the ``<cuda/mdspan>`` header.
+
+Conversion functions
+--------------------
+
+.. code:: cuda
+
+   namespace cuda {
+
+   template <typename T, typename Extents, typename Layout, typename Accessor>
+   [[nodiscard]] DLPackWrapper<Extents::rank()>
+   mdspan_to_dlpack(const cuda::host_mdspan<T, Extents, Layout, Accessor>& mdspan);
+
+   template <typename T, typename Extents, typename Layout, typename Accessor>
+   [[nodiscard]] DLPackWrapper<Extents::rank()>
+   mdspan_to_dlpack(const cuda::device_mdspan<T, Extents, Layout, Accessor>& mdspan,
+                    cuda::device_ref device = cuda::device_ref{0});
+
+   template <typename T, typename Extents, typename Layout, typename Accessor>
+   [[nodiscard]] DLPackWrapper<Extents::rank()>
+   mdspan_to_dlpack(const cuda::managed_mdspan<T, Extents, Layout, Accessor>& mdspan);
+
+   } // namespace cuda
+
+Types
+-----
+
+.. 
code:: cuda
+
+   namespace cuda {
+
+   template <cuda::std::size_t Rank>
+   class DLPackWrapper {
+   public:
+     DLPackWrapper() noexcept;
+     DLPackWrapper(const DLPackWrapper&) noexcept;
+     DLPackWrapper(DLPackWrapper&&) noexcept;
+     DLPackWrapper& operator=(const DLPackWrapper&) noexcept;
+     DLPackWrapper& operator=(DLPackWrapper&&) noexcept;
+     ~DLPackWrapper() noexcept = default;
+
+     DLTensor* operator->() noexcept;
+     const DLTensor* operator->() const noexcept;
+     DLTensor& get() noexcept;
+     const DLTensor& get() const noexcept;
+   };
+
+   } // namespace cuda
+
+``cuda::DLPackWrapper`` stores a ``DLTensor`` and owns the backing storage for its ``shape`` and ``strides`` pointers. The class does not use any heap allocation.
+
+.. note:: Lifetime
+
+   The ``DLTensor`` associated with a ``cuda::DLPackWrapper`` must not outlive the wrapper. If the wrapper is destroyed, the returned ``DLTensor::shape`` and ``DLTensor::strides`` pointers will dangle.
+
+.. note:: Const-correctness
+
+   ``DLTensor::data`` points at ``mdspan.data_handle()`` (or is ``nullptr`` if ``mdspan.size() == 0``). If ``T`` is ``const``, the ``const`` qualifier is cast away because ``DLTensor::data`` is an unqualified ``void*``.
+
+Semantics
+---------
+
+The conversion produces a non-owning DLPack view of the ``mdspan`` data and metadata:
+
+- ``DLTensor::ndim`` is ``mdspan.rank()``.
+- For rank > 0, ``DLTensor::shape[i]`` is ``mdspan.extent(i)``.
+- For rank > 0, ``DLTensor::strides[i]`` is ``mdspan.stride(i)``.
+- ``DLTensor::byte_offset`` is always ``0``.
+- ``DLTensor::device`` is:
+
+  - ``{kDLCPU, 0}`` for ``cuda::host_mdspan``
+  - ``{kDLCUDA, device.get()}`` for ``cuda::device_mdspan``
+  - ``{kDLCUDAManaged, 0}`` for ``cuda::managed_mdspan``
+
+Element types are mapped to ``DLDataType`` according to the DLPack conventions, including:
+
+- Signed and unsigned integers.
+- IEEE-754 floating-point and extended floating-point types, including ``__half``, ``__nv_bfloat16``, FP8, FP6, and FP4 when available.
+- Complex: ``cuda::std::complex<__half>``, ``cuda::std::complex<float>``, and ``cuda::std::complex<double>``.
+- `CUDA built-in vector types <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#built-in-vector-types>`__, such as ``int2``, ``float4``, etc.
+
+Constraints and errors
+----------------------
+
+**Constraints**
+
+- The accessor ``data_handle_type`` must be a pointer type.
+
+**Runtime errors**
+
+- If any ``extent(i)`` or ``stride(i)`` cannot be represented in ``int64_t``, the conversion throws ``std::invalid_argument``.
+
+Availability notes
+------------------
+
+- This API is available only when the DLPack header ``dlpack/dlpack.h`` (`DLPack v1 <https://github.com/dmlc/dlpack>`__) is found in the include path at compile time.
+
+References
+----------
+
+- `DLPack C API <https://dmlc.github.io/dlpack/latest/c_api.html>`__ documentation.
+
+Example
+-------
+
+.. code:: cuda
+
+   #include <cuda/mdspan>
+
+   #include <cassert>
+
+   int main() {
+     using extents_t = cuda::std::extents<int, 2, 3>;
+
+     int data[6] = {0, 1, 2, 3, 4, 5};
+     cuda::host_mdspan<int, extents_t> md{data, extents_t{}};
+
+     auto dl = cuda::mdspan_to_dlpack(md);
+
+     // `dl` owns the shape/stride storage; `dl->data` is a non-owning pointer to `data`.
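+     // The view stays valid only while `dl` is in scope (see the Lifetime note above);
+     // a consumer that needs a `DLTensor*` can pass `&dl.get()` or `dl.operator->()`.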
+ assert(dl->device.device_type == kDLCPU); + assert(dl->ndim == 2); + assert(dl->shape[0] == 2 && dl->shape[1] == 3); + assert(dl->strides[0] == 3 && dl->strides[1] == 1); + assert(dl->data == data); + } From 6f32ae9d54be96448d0d4c580b73b9e8582477eb Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Fri, 19 Dec 2025 10:19:09 -0800 Subject: [PATCH 04/56] Update libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h Co-authored-by: David Bayer <48736217+davebayer@users.noreply.github.com> --- .../include/cuda/__mdspan/mdspan_to_dlpack.h | 54 +------------------ 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h index 30138cbaee6..0bfa7e28560 100644 --- a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -58,60 +58,10 @@ template } //-------------------------------------------------------------------------------------------------------------------- // Signed integer types - else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::int8_t>) + else if constexpr (::cuda::std::__cccl_is_integer_v<_ElementType>) { - return ::DLDataType{::kDLInt, 8, 1}; + return ::DLDataType{(::cuda::std::is_signed_v<_ElementType>) ? ::kDLInt : ::kDLUInt, ::cuda::std::__num_bits_v<_ElementType>, 1}; } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::int16_t>) - { - return ::DLDataType{::kDLInt, 16, 1}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::int32_t>) - { - return ::DLDataType{::kDLInt, 32, 1}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, long>) - { - return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 1}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, long long>) - { - return ::DLDataType{::kDLInt, 64, 1}; - } -# if _CCCL_HAS_INT128() - else if constexpr (::cuda::std::is_same_v<_ElementType, __int128_t>) - { - return ::DLDataType{::kDLInt, 128, 1}; - } -# endif // _CCCL_HAS_INT128() - //-------------------------------------------------------------------------------------------------------------------- - // Unsigned integer types - else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::uint8_t>) - { - return ::DLDataType{::kDLUInt, 8, 1}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::uint16_t>) - { - return ::DLDataType{::kDLUInt, 16, 1}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::uint32_t>) - { - return ::DLDataType{::kDLUInt, 32, 1}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, unsigned long>) - { - return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 1}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, unsigned long long>) - { - return ::DLDataType{::kDLUInt, 64, 1}; - } -# if _CCCL_HAS_INT128() - else if constexpr (::cuda::std::is_same_v<_ElementType, __uint128_t>) - { - return ::DLDataType{::kDLUInt, 128, 1}; - } -# endif // _CCCL_HAS_INT128() //-------------------------------------------------------------------------------------------------------------------- // Floating-point types # if _CCCL_HAS_NVFP16() From ee05eda0e18e3b649f57ec523e537cdae37202aa Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 19 Dec 2025 12:20:18 -0800 Subject: [PATCH 05/56] add many types --- .../include/cuda/__mdspan/mdspan_to_dlpack.h | 198 +++---------- .../include/cuda/__type_traits/vector_type.h | 
193 ++++++++++++- .../cuda/std/__tuple_dir/vector_types.h | 40 +++ .../mdspan_to_dlpack.pass.cpp | 269 +++++++++++++----- .../type_traits/vector_type.compile.pass.cpp | 90 ++++++ 5 files changed, 543 insertions(+), 247 deletions(-) diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h index 0bfa7e28560..2794788e70f 100644 --- a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -23,9 +23,12 @@ #if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() # include +# include +# include # include # include # include +# include # include # include # include @@ -35,7 +38,6 @@ # include # include # include -# include # include # include @@ -60,38 +62,19 @@ template // Signed integer types else if constexpr (::cuda::std::__cccl_is_integer_v<_ElementType>) { - return ::DLDataType{(::cuda::std::is_signed_v<_ElementType>) ? ::kDLInt : ::kDLUInt, ::cuda::std::__num_bits_v<_ElementType>, 1}; + return ::DLDataType{ + (::cuda::std::is_signed_v<_ElementType>) ? ::kDLInt : ::kDLUInt, ::cuda::std::__num_bits_v<_ElementType>, 1}; } //-------------------------------------------------------------------------------------------------------------------- - // Floating-point types -# if _CCCL_HAS_NVFP16() - else if constexpr (::cuda::std::is_same_v<_ElementType, ::__half>) - { - return ::DLDataType{::kDLFloat, 16, 1}; - } -# endif // _CCCL_HAS_NVFP16() + // bfloat16 (must come before general floating-point) # if _CCCL_HAS_NVBF16() else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_bfloat16>) { return ::DLDataType{::kDLBfloat, 16, 1}; } # endif // _CCCL_HAS_NVBF16() - else if constexpr (::cuda::std::is_same_v<_ElementType, float>) - { - return ::DLDataType{::kDLFloat, 32, 1}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, double>) - { - return ::DLDataType{::kDLFloat, 64, 1}; - } -# if _CCCL_HAS_FLOAT128() - else if constexpr (::cuda::std::is_same_v<_ElementType, __float128>) - { - return ::DLDataType{::kDLFloat, 128, 1}; - } -# endif // _CCCL_HAS_FLOAT128() //-------------------------------------------------------------------------------------------------------------------- - // Low-precision Floating-point types + // Low-precision Floating-point types (must come before general floating-point) # if _CCCL_HAS_NVFP8_E4M3() else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e4m3>) { @@ -129,156 +112,38 @@ template } # endif // _CCCL_HAS_NVFP4_E2M1() //-------------------------------------------------------------------------------------------------------------------- - // Complex types -# if _CCCL_HAS_NVFP16() - else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::complex<::__half>>) + // Floating-point types (after specific types) + else if constexpr (::cuda::is_floating_point_v<_ElementType>) { - return ::DLDataType{::kDLComplex, 32, 1}; + return ::DLDataType{::kDLFloat, ::cuda::std::__num_bits_v<_ElementType>, 1}; } -# endif // _CCCL_HAS_NVFP16() - else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::complex>) - { - return ::DLDataType{::kDLComplex, 64, 1}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::cuda::std::complex>) + //-------------------------------------------------------------------------------------------------------------------- + // Complex types + // 256-bit data types are not supported in DLPack, e.g. 
cuda::std::complex<__float128> + else if constexpr (::cuda::std::__is_cuda_std_complex_v<_ElementType> && sizeof(_ElementType) <= sizeof(double) * 2) { - return ::DLDataType{::kDLComplex, 128, 1}; + // DLPack encodes complex numbers as a compact struct of two scalar values, and `bits` stores + // the size of the full complex number (e.g. std::complex => bits=64). + return ::DLDataType{::kDLComplex, sizeof(_ElementType) * CHAR_BIT, 1}; } - // 256-bit data types are not supported in DLPack, e.g. cuda::std::complex<__float128> //-------------------------------------------------------------------------------------------------------------------- - // Vector types (CUDA built-in vector types) + // CUDA built-in vector types # if _CCCL_HAS_CTK() - else if constexpr (::cuda::std::is_same_v<_ElementType, ::char2>) - { - return ::DLDataType{::kDLInt, 8, 2}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::char4>) - { - return ::DLDataType{::kDLInt, 8, 4}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::uchar2>) + else if constexpr (::cuda::__is_vector_type_v<_ElementType> || ::cuda::__is_extended_fp_vector_type_v<_ElementType>) { - return ::DLDataType{::kDLUInt, 8, 2}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::uchar4>) - { - return ::DLDataType{::kDLUInt, 8, 4}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::short2>) - { - return ::DLDataType{::kDLInt, 16, 2}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::short4>) - { - return ::DLDataType{::kDLInt, 16, 4}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ushort2>) - { - return ::DLDataType{::kDLUInt, 16, 2}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ushort4>) - { - return ::DLDataType{::kDLUInt, 16, 4}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::int2>) - { - return ::DLDataType{::kDLInt, 32, 2}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::int4>) - { - return ::DLDataType{::kDLInt, 32, 4}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::uint2>) - { - return ::DLDataType{::kDLUInt, 32, 2}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::uint4>) - { - return ::DLDataType{::kDLUInt, 32, 4}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::long2>) - { - return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 2}; - } -# if _CCCL_CTK_AT_LEAST(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::long4_32a>) - { - return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 4}; - } -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv - else if constexpr (::cuda::std::is_same_v<_ElementType, ::long4>) - { - return ::DLDataType{::kDLInt, ::cuda::std::__num_bits_v, 4}; - } -# endif // _CCCL_CTK_BELOW(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong2>) - { - return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 2}; - } -# if _CCCL_CTK_AT_LEAST(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong4_32a>) - { - return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 4}; - } -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulong4>) - { - return ::DLDataType{::kDLUInt, ::cuda::std::__num_bits_v, 4}; - } -# endif // _CCCL_CTK_BELOW(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::longlong2>) - { - return ::DLDataType{::kDLInt, 64, 2}; - } -# if _CCCL_CTK_AT_LEAST(13, 
0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::longlong4_32a>) - { - return ::DLDataType{::kDLInt, 64, 4}; - } -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv - else if constexpr (::cuda::std::is_same_v<_ElementType, ::longlong4>) - { - return ::DLDataType{::kDLInt, 64, 4}; - } -# endif // _CCCL_CTK_BELOW(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulonglong2>) - { - return ::DLDataType{::kDLUInt, 64, 2}; - } -# if _CCCL_CTK_AT_LEAST(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulonglong4_32a>) - { - return ::DLDataType{::kDLUInt, 64, 4}; - } -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv - else if constexpr (::cuda::std::is_same_v<_ElementType, ::ulonglong4>) - { - return ::DLDataType{::kDLUInt, 64, 4}; - } -# endif // _CCCL_CTK_BELOW(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::float2>) - { - return ::DLDataType{::kDLFloat, 32, 2}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::float4>) - { - return ::DLDataType{::kDLFloat, 32, 4}; - } - else if constexpr (::cuda::std::is_same_v<_ElementType, ::double2>) - { - return ::DLDataType{::kDLFloat, 64, 2}; - } -# if _CCCL_CTK_AT_LEAST(13, 0) - else if constexpr (::cuda::std::is_same_v<_ElementType, ::double4_32a>) - { - return ::DLDataType{::kDLFloat, 64, 4}; - } -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv - else if constexpr (::cuda::std::is_same_v<_ElementType, ::double4>) - { - return ::DLDataType{::kDLFloat, 64, 4}; + constexpr ::cuda::std::uint16_t __lanes = ::cuda::std::tuple_size_v<_ElementType>; + if constexpr (__lanes == 2 || __lanes == 4) + { + using __scalar_t = ::cuda::std::remove_cv_t<::cuda::std::tuple_element_t<0, _ElementType>>; + auto __scalar = ::cuda::__data_type_to_dlpack<__scalar_t>(); + __scalar.lanes = __lanes; + return __scalar; + } + else + { + static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported vector type"); + } } -# endif // _CCCL_CTK_BELOW(13, 0) # endif // _CCCL_HAS_CTK() //-------------------------------------------------------------------------------------------------------------------- // Unsupported types @@ -286,6 +151,7 @@ template { static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported type"); } + return ::DLDataType{}; } template <::cuda::std::size_t _Rank> diff --git a/libcudacxx/include/cuda/__type_traits/vector_type.h b/libcudacxx/include/cuda/__type_traits/vector_type.h index 7e21b8a806e..18139f75325 100644 --- a/libcudacxx/include/cuda/__type_traits/vector_type.h +++ b/libcudacxx/include/cuda/__type_traits/vector_type.h @@ -24,7 +24,6 @@ #if _CCCL_HAS_CTK() # include -# include # include # if !_CCCL_CUDA_COMPILATION() @@ -346,10 +345,200 @@ using __vector_type_t = decltype(::cuda::__cccl_vector_type_t_impl<_Tp, _Size>() template inline constexpr bool __has_vector_type_v = !::cuda::std::is_same_v<__vector_type_t<_Tp, _Size>, void>; +template +inline constexpr bool __is_vector_type_v = false; + +template <> +inline constexpr bool __is_vector_type_v<::char1> = true; +template <> +inline constexpr bool __is_vector_type_v<::char2> = true; +template <> +inline constexpr bool __is_vector_type_v<::char3> = true; +template <> +inline constexpr bool __is_vector_type_v<::char4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::uchar1> = true; +template <> +inline constexpr bool __is_vector_type_v<::uchar2> = true; +template <> +inline constexpr bool 
__is_vector_type_v<::uchar3> = true; +template <> +inline constexpr bool __is_vector_type_v<::uchar4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::short1> = true; +template <> +inline constexpr bool __is_vector_type_v<::short2> = true; +template <> +inline constexpr bool __is_vector_type_v<::short3> = true; +template <> +inline constexpr bool __is_vector_type_v<::short4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::ushort1> = true; +template <> +inline constexpr bool __is_vector_type_v<::ushort2> = true; +template <> +inline constexpr bool __is_vector_type_v<::ushort3> = true; +template <> +inline constexpr bool __is_vector_type_v<::ushort4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::int1> = true; +template <> +inline constexpr bool __is_vector_type_v<::int2> = true; +template <> +inline constexpr bool __is_vector_type_v<::int3> = true; +template <> +inline constexpr bool __is_vector_type_v<::int4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::uint1> = true; +template <> +inline constexpr bool __is_vector_type_v<::uint2> = true; +template <> +inline constexpr bool __is_vector_type_v<::uint3> = true; +template <> +inline constexpr bool __is_vector_type_v<::uint4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::long1> = true; +template <> +inline constexpr bool __is_vector_type_v<::long2> = true; +template <> +inline constexpr bool __is_vector_type_v<::long3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::long4_16a> = true; +template <> +inline constexpr bool __is_vector_type_v<::long4_32a> = true; +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv +template <> +inline constexpr bool __is_vector_type_v<::long4> = true; +# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + +template <> +inline constexpr bool __is_vector_type_v<::ulong1> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulong2> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulong3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::ulong4_16a> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulong4_32a> = true; +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv +template <> +inline constexpr bool __is_vector_type_v<::ulong4> = true; +# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + +template <> +inline constexpr bool __is_vector_type_v<::longlong1> = true; +template <> +inline constexpr bool __is_vector_type_v<::longlong2> = true; +template <> +inline constexpr bool __is_vector_type_v<::longlong3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::longlong4_16a> = true; +template <> +inline constexpr bool __is_vector_type_v<::longlong4_32a> = true; +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv +template <> +inline constexpr bool __is_vector_type_v<::longlong4> = true; +# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + +template <> +inline constexpr bool __is_vector_type_v<::ulonglong1> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulonglong2> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulonglong3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::ulonglong4_16a> = true; +template <> +inline constexpr bool __is_vector_type_v<::ulonglong4_32a> = true; +# else // ^^^ 
_CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv +template <> +inline constexpr bool __is_vector_type_v<::ulonglong4> = true; +# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + +template <> +inline constexpr bool __is_vector_type_v<::float1> = true; +template <> +inline constexpr bool __is_vector_type_v<::float2> = true; +template <> +inline constexpr bool __is_vector_type_v<::float3> = true; +template <> +inline constexpr bool __is_vector_type_v<::float4> = true; + +template <> +inline constexpr bool __is_vector_type_v<::double1> = true; +template <> +inline constexpr bool __is_vector_type_v<::double2> = true; +template <> +inline constexpr bool __is_vector_type_v<::double3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) +template <> +inline constexpr bool __is_vector_type_v<::double4_16a> = true; +template <> +inline constexpr bool __is_vector_type_v<::double4_32a> = true; +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv +template <> +inline constexpr bool __is_vector_type_v<::double4> = true; +# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + +template <> +inline constexpr bool __is_vector_type_v<::dim3> = true; + +template +inline constexpr bool __is_extended_fp_vector_type_v = false; + +# if _CCCL_HAS_NVFP8() +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_bfloat162> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__half2> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x2_e4m3> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x2_e5m2> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x4_e4m3> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x4_e5m2> = true; +# if _CCCL_CTK_AT_LEAST(12, 8) +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x2_e8m0> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp8x4_e8m0> = true; +# endif // _CCCL_CTK_AT_LEAST(12, 8) +# endif // _CCCL_HAS_NVFP8() + +# if _CCCL_HAS_NVFP6() +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x2_e2m3> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x2_e3m2> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x4_e2m3> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp6x4_e3m2> = true; +# endif // _CCCL_HAS_NVFP6() + +# if _CCCL_HAS_NVFP4() +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp4x2_e2m1> = true; +template <> +inline constexpr bool __is_extended_fp_vector_type_v<::__nv_fp4x4_e2m1> = true; +# endif // _CCCL_HAS_NVFP4() + _CCCL_END_NAMESPACE_CUDA # include #endif // !_CCCL_HAS_CTK() - #endif // _CUDA__TYPE_TRAITS_VECTOR_TYPE_H diff --git a/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h b/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h index a44443ed091..125b6b87492 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/vector_types.h @@ -121,6 +121,26 @@ _LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__half, __half, 2) # if _CCCL_HAS_NVBF16() _LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_bfloat16, __nv_bfloat16, 2) # endif // _CCCL_HAS_NVBF16() +# if _CCCL_HAS_NVFP8() +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e5m2, 2, _e5m2) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e5m2, 4, _e5m2) 
+_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e4m3, 2, _e4m3) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e4m3, 4, _e4m3) +# if _CCCL_CTK_AT_LEAST(12, 8) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e8m0, 2, _e8m0) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp8x, __nv_fp8_e8m0, 4, _e8m0) +# endif // _CCCL_CTK_AT_LEAST(12, 8) +# endif // _CCCL_HAS_NVFP8() +# if _CCCL_HAS_NVFP6() +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e3m2, 2, _e3m2) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e3m2, 4, _e3m2) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e2m3, 2, _e2m3) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp6x, __nv_fp6_e2m3, 4, _e2m3) +# endif // _CCCL_HAS_NVFP6() +# if _CCCL_HAS_NVFP4() +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp4x, __nv_fp4_e2m1, 2, _e2m1) +_LIBCUDACXX_SPECIALIZE_TUPLE_INTERFACE(__nv_fp4x, __nv_fp4_e2m1, 4, _e2m1) +# endif // _CCCL_HAS_NVFP4() template struct __get_element; @@ -273,6 +293,26 @@ _LIBCUDACXX_SPECIALIZE_GET(__half2, __half) # if _CCCL_HAS_NVBF16() _LIBCUDACXX_SPECIALIZE_GET(__nv_bfloat162, __nv_bfloat16) # endif // _CCCL_HAS_NVBF16() +# if _CCCL_HAS_NVFP8() +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x2_e5m2, __nv_fp8_e5m2) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x4_e5m2, __nv_fp8_e5m2) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x2_e4m3, __nv_fp8_e4m3) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x4_e4m3, __nv_fp8_e4m3) +# if _CCCL_CTK_AT_LEAST(12, 8) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x2_e8m0, __nv_fp8_e8m0) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp8x4_e8m0, __nv_fp8_e8m0) +# endif // _CCCL_CTK_AT_LEAST(12, 8) +# endif // _CCCL_HAS_NVFP8() +# if _CCCL_HAS_NVFP6() +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x2_e3m2, __nv_fp6_e3m2) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x4_e3m2, __nv_fp6_e3m2) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x2_e2m3, __nv_fp6_e2m3) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp6x4_e2m3, __nv_fp6_e2m3) +# endif // _CCCL_HAS_NVFP6() +# if _CCCL_HAS_NVFP4() +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp4x2_e2m1, __nv_fp4_e2m1) +_LIBCUDACXX_SPECIALIZE_GET(__nv_fp4x4_e2m1, __nv_fp4_e2m1) +# endif // _CCCL_HAS_NVFP4() _CCCL_END_NAMESPACE_CUDA_STD diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp index 6139d99817c..6b418301ac5 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp @@ -197,7 +197,7 @@ struct test_mdspan_to_dlpack_types_fn } }; -bool test_mdspan_to_dlpack_types() +bool test_mdspan_to_dlpack_basic_types() { using list_t = cuda::std::__type_list< bool, @@ -220,7 +220,47 @@ bool test_mdspan_to_dlpack_types() #endif // Floating-point types float, - double, + double +#if _CCCL_HAS_FLOAT128() + , + __float128 +#endif + >; + cuda::std::array expected_types = { + DLDataType{kDLBool, 8, 1}, + // Signed integer types + DLDataType{kDLInt, 8, 1}, + DLDataType{kDLInt, 16, 1}, + DLDataType{kDLInt, 32, 1}, + DLDataType{kDLInt, sizeof(long) * 8, 1}, + DLDataType{kDLInt, 64, 1}, +#if _CCCL_HAS_INT128() + DLDataType{kDLInt, 128, 1}, +#endif + // Unsigned integer types + DLDataType{kDLUInt, 8, 1}, + DLDataType{kDLUInt, 16, 1}, + DLDataType{kDLUInt, 32, 1}, + DLDataType{kDLUInt, sizeof(unsigned long) * 8, 1}, + DLDataType{kDLUInt, 64, 1}, +#if _CCCL_HAS_INT128() + DLDataType{kDLUInt, 128, 1}, +#endif + 
// Floating-point types + DLDataType{kDLFloat, 32, 1}, + DLDataType{kDLFloat, 64, 1}, +#if _CCCL_HAS_FLOAT128() + DLDataType{kDLFloat, 128, 1}, +#endif + }; + test_mdspan_to_dlpack_types_fn test_fn{expected_types}; + test_fn.call(cuda::std::make_index_sequence{}); + return true; +} + +bool test_mdspan_to_dlpack_extended_fp_and_complex_types() +{ + using list_t = cuda::std::__type_list< #if _CCCL_HAS_NVFP16() ::__half, #endif @@ -254,79 +294,8 @@ bool test_mdspan_to_dlpack_types() cuda::std::complex<::__half>, #endif cuda::std::complex, - cuda::std::complex, - // Vector types (CUDA built-in vector types) -#if _CCCL_HAS_CTK() - ::char2, - ::char4, - ::uchar2, - ::uchar4, - ::short2, - ::short4, - ::ushort2, - ::ushort4, - ::int2, - ::int4, - ::uint2, - ::uint4, - ::long2, -# if _CCCL_CTK_AT_LEAST(13, 0) - ::long4_32a, -# else - ::long4, -# endif - ::ulong2, -# if _CCCL_CTK_AT_LEAST(13, 0) - ::ulong4_32a, -# else - ::ulong4, -# endif - ::longlong2, -# if _CCCL_CTK_AT_LEAST(13, 0) - ::longlong4_32a, -# else - ::longlong4, -# endif - ::ulonglong2, -# if _CCCL_CTK_AT_LEAST(13, 0) - ::ulonglong4_32a, -# else - ::ulonglong4, -# endif - ::float2, - ::float4, - ::double2, -# if _CCCL_CTK_AT_LEAST(13, 0) - ::double4_32a -# else - ::double4 -# endif -#endif // _CCCL_HAS_CTK() - >; + cuda::std::complex>; cuda::std::array expected_types = { - DLDataType{kDLBool, 8, 1}, - // Signed integer types - DLDataType{kDLInt, 8, 1}, - DLDataType{kDLInt, 16, 1}, - DLDataType{kDLInt, 32, 1}, - - DLDataType{kDLInt, sizeof(long) * 8, 1}, - DLDataType{kDLInt, 64, 1}, -#if _CCCL_HAS_INT128() - DLDataType{kDLInt, 128, 1}, -#endif - // Unsigned integer types - DLDataType{kDLUInt, 8, 1}, - DLDataType{kDLUInt, 16, 1}, - DLDataType{kDLUInt, 32, 1}, - DLDataType{kDLUInt, sizeof(unsigned long) * 8, 1}, - DLDataType{kDLUInt, 64, 1}, -#if _CCCL_HAS_INT128() - DLDataType{kDLUInt, 128, 1}, -#endif - // Floating-point types - DLDataType{kDLFloat, 32, 1}, - DLDataType{kDLFloat, 64, 1}, #if _CCCL_HAS_NVFP16() DLDataType{kDLFloat, 16, 1}, #endif @@ -360,9 +329,76 @@ bool test_mdspan_to_dlpack_types() DLDataType{kDLComplex, 32, 1}, #endif DLDataType{kDLComplex, 64, 1}, - DLDataType{kDLComplex, 128, 1}, - // Vector types (CUDA built-in vector types) + DLDataType{kDLComplex, 128, 1}}; + test_mdspan_to_dlpack_types_fn test_fn{expected_types}; + test_fn.call(cuda::std::make_index_sequence{}); + return true; +} + #if _CCCL_HAS_CTK() +bool test_mdspan_to_dlpack_vector_types() +{ + using list_t = cuda::std::__type_list< + ::char2, + ::char4, + ::uchar2, + ::uchar4, + ::short2, + ::short4, + ::ushort2, + ::ushort4, + ::int2, + ::int4, + ::uint2, + ::uint4, + ::long2 +# if _CCCL_CTK_AT_LEAST(13, 0) + , + ::long4_32a +# else + , + ::long4 +# endif + , + ::ulong2 +# if _CCCL_CTK_AT_LEAST(13, 0) + , + ::ulong4_32a +# else + , + ::ulong4 +# endif + , + ::longlong2 +# if _CCCL_CTK_AT_LEAST(13, 0) + , + ::longlong4_32a +# else + , + ::longlong4 +# endif + , + ::ulonglong2 +# if _CCCL_CTK_AT_LEAST(13, 0) + , + ::ulonglong4_32a +# else + , + ::ulonglong4 +# endif + , + ::float2, + ::float4, + ::double2 +# if _CCCL_CTK_AT_LEAST(13, 0) + , + ::double4_32a +# else + , + ::double4 +# endif + >; + cuda::std::array expected_types = { DLDataType{kDLInt, 8, 2}, DLDataType{kDLInt, 8, 4}, DLDataType{kDLUInt, 8, 2}, @@ -386,11 +422,81 @@ bool test_mdspan_to_dlpack_types() DLDataType{kDLFloat, 32, 2}, DLDataType{kDLFloat, 32, 4}, DLDataType{kDLFloat, 64, 2}, - DLDataType{kDLFloat, 64, 4}, + DLDataType{kDLFloat, 64, 4}}; + test_mdspan_to_dlpack_types_fn 
test_fn{expected_types}; + test_fn.call(cuda::std::make_index_sequence{}); + return true; +} #endif // _CCCL_HAS_CTK() + +bool test_mdspan_to_dlpack_extended_fp_vector_types() +{ + using list_t = cuda::std::__type_list< +#if _CCCL_HAS_NVFP16() + ::__half2, +#endif +#if _CCCL_HAS_NVBF16() + ::__nv_bfloat162, +#endif +#if _CCCL_HAS_NVFP8_E4M3() + ::__nv_fp8x2_e4m3, + ::__nv_fp8x4_e4m3, +#endif +#if _CCCL_HAS_NVFP8_E5M2() + ::__nv_fp8x2_e5m2, + ::__nv_fp8x4_e5m2, +#endif +#if _CCCL_HAS_NVFP8_E8M0() + ::__nv_fp8x2_e8m0, + ::__nv_fp8x4_e8m0, +#endif +#if _CCCL_HAS_NVFP6_E2M3() + ::__nv_fp6x2_e2m3, + ::__nv_fp6x4_e2m3, +#endif +#if _CCCL_HAS_NVFP6_E3M2() + ::__nv_fp6x2_e3m2, + ::__nv_fp6x4_e3m2, +#endif +#if _CCCL_HAS_NVFP4_E2M1() + ::__nv_fp4x2_e2m1, + ::__nv_fp4x4_e2m1, +#endif + void* /* dummy to allow trailing commas */>; + cuda::std::array expected_types = { +#if _CCCL_HAS_NVFP16() + DLDataType{kDLFloat, 16, 2}, +#endif +#if _CCCL_HAS_NVBF16() + DLDataType{kDLBfloat, 16, 2}, +#endif +#if _CCCL_HAS_NVFP8_E4M3() + DLDataType{kDLFloat8_e4m3fn, 8, 2}, + DLDataType{kDLFloat8_e4m3fn, 8, 4}, +#endif +#if _CCCL_HAS_NVFP8_E5M2() + DLDataType{kDLFloat8_e5m2, 8, 2}, + DLDataType{kDLFloat8_e5m2, 8, 4}, +#endif +#if _CCCL_HAS_NVFP8_E8M0() + DLDataType{kDLFloat8_e8m0fnu, 8, 2}, + DLDataType{kDLFloat8_e8m0fnu, 8, 4}, +#endif +#if _CCCL_HAS_NVFP6_E2M3() + DLDataType{kDLFloat6_e2m3fn, 6, 2}, + DLDataType{kDLFloat6_e2m3fn, 6, 4}, +#endif +#if _CCCL_HAS_NVFP6_E3M2() + DLDataType{kDLFloat6_e3m2fn, 6, 2}, + DLDataType{kDLFloat6_e3m2fn, 6, 4}, +#endif +#if _CCCL_HAS_NVFP4_E2M1() + DLDataType{kDLFloat4_e2m1fn, 4, 2}, + DLDataType{kDLFloat4_e2m1fn, 4, 4}, +#endif }; test_mdspan_to_dlpack_types_fn test_fn{expected_types}; - test_fn.call(cuda::std::make_index_sequence{}); + test_fn.call(cuda::std::make_index_sequence{}); return true; } @@ -403,6 +509,11 @@ int main(int, char**) NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_const_pointer());)) NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_device());)) NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_managed());)) - NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_types());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_basic_types());)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_extended_fp_and_complex_types());)) +#if _CCCL_HAS_CTK() + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_vector_types());)) +#endif // _CCCL_HAS_CTK() + NV_IF_TARGET(NV_IS_HOST, (assert(test_mdspan_to_dlpack_extended_fp_vector_types());)) return 0; } diff --git a/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp index 406b6157cbf..1561857be02 100644 --- a/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/libcxx/type_traits/vector_type.compile.pass.cpp @@ -29,31 +29,61 @@ __host__ __device__ void test() test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + 
static_assert(cuda::__is_vector_type_v); + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); @@ -63,6 +93,16 @@ __host__ __device__ void test() test(); #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#if _CCCL_CTK_AT_LEAST(13, 0) + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + static_assert(cuda::__is_vector_type_v); +#endif // _CCCL_CTK_AT_LEAST(13, 0) + test(); test(); test(); @@ -72,6 +112,16 @@ __host__ __device__ void test() test(); #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#if _CCCL_CTK_AT_LEAST(13, 0) + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + static_assert(cuda::__is_vector_type_v); +#endif // _CCCL_CTK_AT_LEAST(13, 0) + test(); test(); test(); @@ -81,6 +131,16 @@ __host__ __device__ void test() test(); #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#if _CCCL_CTK_AT_LEAST(13, 0) + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + static_assert(cuda::__is_vector_type_v); +#endif // _CCCL_CTK_AT_LEAST(13, 0) + test(); test(); test(); @@ -90,11 +150,26 @@ __host__ __device__ void test() test(); #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#if _CCCL_CTK_AT_LEAST(13, 0) + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + static_assert(cuda::__is_vector_type_v); +#endif // _CCCL_CTK_AT_LEAST(13, 0) + test(); test(); test(); test(); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + test(); test(); test(); @@ -104,11 +179,26 @@ __host__ __device__ void test() test(); #endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#if _CCCL_CTK_AT_LEAST(13, 0) + static_assert(cuda::__is_vector_type_v); + static_assert(cuda::__is_vector_type_v); +#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv + static_assert(cuda::__is_vector_type_v); +#endif // _CCCL_CTK_AT_LEAST(13, 0) + + 
static_assert(cuda::__is_vector_type_v); + // 2. Test invalid combinations test(); test(); test(); + + static_assert(!cuda::__is_vector_type_v); + static_assert(!cuda::__is_vector_type_v); } int main(int, char**) From 4d2e0dacf75ea2b77f6148640d1741bc9b455800 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 19 Dec 2025 12:48:05 -0800 Subject: [PATCH 06/56] remove operator-> --- .../extended_api/mdspan/mdspan_to_dlpack.rst | 49 +++-- .../include/cuda/__mdspan/mdspan_to_dlpack.h | 54 +++--- .../mdspan_to_dlpack.pass.cpp | 174 +++++++++--------- .../mdspan_to_dlpack.wrapper.pass.cpp | 169 ++++++++--------- 4 files changed, 224 insertions(+), 222 deletions(-) diff --git a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst index d758ff84f12..0213aab28ab 100644 --- a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst +++ b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst @@ -15,17 +15,17 @@ Conversion functions namespace cuda { template - [[nodiscard]] DLPackWrapper - mdspan_to_dlpack(const cuda::host_mdspan& mdspan); + [[nodiscard]] dlpack_tensor + to_dlpack(const cuda::host_mdspan& mdspan); template - [[nodiscard]] DLPackWrapper - mdspan_to_dlpack(const cuda::device_mdspan& mdspan, + [[nodiscard]] dlpack_tensor + to_dlpack(const cuda::device_mdspan& mdspan, cuda::device_ref device = cuda::device_ref{0}); template - [[nodiscard]] DLPackWrapper - mdspan_to_dlpack(const cuda::managed_mdspan& mdspan); + [[nodiscard]] dlpack_tensor + to_dlpack(const cuda::managed_mdspan& mdspan); } // namespace cuda @@ -37,28 +37,26 @@ Types namespace cuda { template - class DLPackWrapper { + class dlpack_tensor { public: - DLPackWrapper(); - DLPackWrapper(const DLPackWrapper&) noexcept; - DLPackWrapper(DLPackWrapper&&) noexcept; - DLPackWrapper& operator=(const DLPackWrapper&) noexcept; - DLPackWrapper& operator=(DLPackWrapper&&) noexcept; - ~DLPackWrapper() noexcept = default; - - DLTensor* operator->() noexcept; - const DLTensor* operator->() const noexcept; + dlpack_tensor(); + dlpack_tensor(const dlpack_tensor&) noexcept; + dlpack_tensor(dlpack_tensor&&) noexcept; + dlpack_tensor& operator=(const dlpack_tensor&) noexcept; + dlpack_tensor& operator=(dlpack_tensor&&) noexcept; + ~dlpack_tensor() noexcept = default; + DLTensor& get() noexcept; const DLTensor& get() const noexcept; }; } // namespace cuda -``cuda::DLPackWrapper`` stores a ``DLTensor`` and owns the backing storage for its ``shape`` and ``strides`` pointers. The class does not use any heap allocation. +``cuda::dlpack_tensor`` stores a ``DLTensor`` and owns the backing storage for its ``shape`` and ``strides`` pointers. The class does not use any heap allocation. .. note:: Lifetime - The ``DLTensor`` associated with ``cuda::DLPackWrapper`` must not outlive the wrapper. If the wrapper is destroyed, the returned ``DLTensor::shape`` and ``DLTensor::strides`` pointers will dangle. + The ``DLTensor`` associated with ``cuda::dlpack_tensor`` must not outlive the wrapper. If the wrapper is destroyed, the returned ``DLTensor::shape`` and ``DLTensor::strides`` pointers will dangle. .. note:: Const-correctness @@ -125,12 +123,13 @@ Example int data[6] = {0, 1, 2, 3, 4, 5}; cuda::host_mdspan md{data, extents_t{}}; - auto dl = cuda::mdspan_to_dlpack(md); + auto dl = cuda::to_dlpack(md); + const auto& dltensor = dl.get(); - // `dl` owns the shape/stride storage; `dl->data` is a non-owning pointer to `data`. 
- assert(dl->device.device_type == kDLCPU); - assert(dl->ndim == 2); - assert(dl->shape[0] == 2 && dl->shape[1] == 3); - assert(dl->strides[0] == 3 && dl->strides[1] == 1); - assert(dl->data == data); + // `dl` owns the shape/stride storage; `dltensor.data` is a non-owning pointer to `data`. + assert(dltensor.device.device_type == kDLCPU); + assert(dltensor.ndim == 2); + assert(dltensor.shape[0] == 2 && dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 3 && dltensor.strides[1] == 1); + assert(dltensor.data == data); } diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h index 2794788e70f..f05631eab19 100644 --- a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -155,7 +155,7 @@ template } template <::cuda::std::size_t _Rank> -class DLPackWrapper +class dlpack_tensor { ::cuda::std::array<::cuda::std::int64_t, _Rank> __shape{}; ::cuda::std::array<::cuda::std::int64_t, _Rank> __strides{}; @@ -168,12 +168,12 @@ class DLPackWrapper } public: - _CCCL_HOST_API explicit DLPackWrapper() noexcept + _CCCL_HOST_API explicit dlpack_tensor() noexcept { __update_tensor(); } - _CCCL_HOST_API DLPackWrapper(const DLPackWrapper& __other) noexcept + _CCCL_HOST_API dlpack_tensor(const dlpack_tensor& __other) noexcept : __shape{__other.__shape} , __strides{__other.__strides} , __tensor{__other.__tensor} @@ -181,7 +181,7 @@ class DLPackWrapper __update_tensor(); } - _CCCL_HOST_API DLPackWrapper(DLPackWrapper&& __other) noexcept + _CCCL_HOST_API dlpack_tensor(dlpack_tensor&& __other) noexcept : __shape{::cuda::std::move(__other.__shape)} , __strides{::cuda::std::move(__other.__strides)} , __tensor{__other.__tensor} @@ -190,7 +190,7 @@ class DLPackWrapper __update_tensor(); } - _CCCL_HOST_API DLPackWrapper& operator=(const DLPackWrapper& __other) noexcept + _CCCL_HOST_API dlpack_tensor& operator=(const dlpack_tensor& __other) noexcept { if (this == &__other) { @@ -203,7 +203,7 @@ class DLPackWrapper return *this; } - _CCCL_HOST_API DLPackWrapper& operator=(DLPackWrapper&& __other) noexcept + _CCCL_HOST_API dlpack_tensor& operator=(dlpack_tensor&& __other) noexcept { if (this == &__other) { @@ -217,17 +217,7 @@ class DLPackWrapper return *this; } - _CCCL_HIDE_FROM_ABI ~DLPackWrapper() noexcept = default; - - _CCCL_HOST_API ::DLTensor* operator->() noexcept - { - return &__tensor; - } - - _CCCL_HOST_API const ::DLTensor* operator->() const noexcept - { - return &__tensor; - } + _CCCL_HIDE_FROM_ABI ~dlpack_tensor() noexcept = default; [[nodiscard]] _CCCL_HOST_API ::DLTensor& get() noexcept { @@ -241,14 +231,14 @@ class DLPackWrapper }; template -[[nodiscard]] _CCCL_HOST_API DLPackWrapper<_Extents::rank()> __mdspan_to_dlpack( - const ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, - ::DLDeviceType __device_type, - int __device_id) +[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +__to_dlpack(const ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, + ::DLDeviceType __device_type, + int __device_id) { static_assert(::cuda::std::is_pointer_v, "data_handle_type must be a pointer"); using __element_type = ::cuda::std::remove_cv_t<_ElementType>; - DLPackWrapper<_Extents::rank()> __wrapper{}; + dlpack_tensor<_Extents::rank()> __wrapper{}; auto& __tensor = __wrapper.get(); __tensor.data = __mdspan.size() > 0 ? 
const_cast<__element_type*>(__mdspan.data_handle()) : nullptr; __tensor.device = ::DLDevice{__device_type, __device_id}; @@ -280,28 +270,28 @@ template -[[nodiscard]] _CCCL_HOST_API DLPackWrapper<_Extents::rank()> -mdspan_to_dlpack(const ::cuda::host_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) +[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +to_dlpack(const ::cuda::host_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) { using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; - return ::cuda::__mdspan_to_dlpack(__mdspan_type{__mdspan}, ::kDLCPU, 0); + return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCPU, 0); } template -[[nodiscard]] _CCCL_HOST_API DLPackWrapper<_Extents::rank()> -mdspan_to_dlpack(const ::cuda::device_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, - ::cuda::device_ref __device = ::cuda::device_ref{0}) +[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +to_dlpack(const ::cuda::device_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, + ::cuda::device_ref __device = ::cuda::device_ref{0}) { using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; - return ::cuda::__mdspan_to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDA, __device.get()); + return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDA, __device.get()); } template -[[nodiscard]] _CCCL_HOST_API DLPackWrapper<_Extents::rank()> -mdspan_to_dlpack(const ::cuda::managed_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) +[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +to_dlpack(const ::cuda::managed_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) { using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; - return ::cuda::__mdspan_to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDAManaged, 0); + return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDAManaged, 0); } _CCCL_END_NAMESPACE_CUDA diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp index 6b418301ac5..b066a9b2569 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.pass.cpp @@ -29,20 +29,21 @@ bool test_mdspan_to_dlpack_host_layout_right() using extents_t = cuda::std::extents; int data[6] = {0, 1, 2, 3, 4, 5}; cuda::host_mdspan md{data, extents_t{}}; - auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); - assert(dlpack_wrapper->device.device_type == kDLCPU); - assert(dlpack_wrapper->device.device_id == 0); - assert(dlpack_wrapper->ndim == 2); - check_datatype(dlpack_wrapper->dtype, kDLInt, 32, 1); - assert(dlpack_wrapper->shape != nullptr); - assert(dlpack_wrapper->strides != nullptr); - assert(dlpack_wrapper->shape[0] == 2); - assert(dlpack_wrapper->shape[1] == 3); - assert(dlpack_wrapper->strides[0] == 3); - assert(dlpack_wrapper->strides[1] == 1); - assert(dlpack_wrapper->byte_offset == 0); - assert(dlpack_wrapper->data == data); + assert(dltensor.device.device_type == kDLCPU); + assert(dltensor.device.device_id == 0); + assert(dltensor.ndim == 2); + check_datatype(dltensor.dtype, kDLInt, 32, 1); + assert(dltensor.shape != nullptr); + 
assert(dltensor.strides != nullptr); + assert(dltensor.shape[0] == 2); + assert(dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 3); + assert(dltensor.strides[1] == 1); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); return true; } @@ -51,20 +52,21 @@ bool test_mdspan_to_dlpack_host_layout_left() using extents_t = cuda::std::extents; int data[6] = {0, 1, 2, 3, 4, 5}; cuda::host_mdspan md{data, extents_t{}}; - auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); - assert(dlpack_wrapper->device.device_type == kDLCPU); - assert(dlpack_wrapper->device.device_id == 0); - check_datatype(dlpack_wrapper->dtype, kDLInt, 32, 1); - assert(dlpack_wrapper->ndim == 2); - assert(dlpack_wrapper->shape != nullptr); - assert(dlpack_wrapper->strides != nullptr); - assert(dlpack_wrapper->shape[0] == 2); - assert(dlpack_wrapper->shape[1] == 3); - assert(dlpack_wrapper->strides[0] == 1); - assert(dlpack_wrapper->strides[1] == 2); - assert(dlpack_wrapper->byte_offset == 0); - assert(dlpack_wrapper->data == data); + assert(dltensor.device.device_type == kDLCPU); + assert(dltensor.device.device_id == 0); + check_datatype(dltensor.dtype, kDLInt, 32, 1); + assert(dltensor.ndim == 2); + assert(dltensor.shape != nullptr); + assert(dltensor.strides != nullptr); + assert(dltensor.shape[0] == 2); + assert(dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 1); + assert(dltensor.strides[1] == 2); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); return true; } @@ -73,18 +75,19 @@ bool test_mdspan_to_dlpack_empty_size() using extents_t = cuda::std::dims<2>; int data[1] = {42}; cuda::host_mdspan m{data, extents_t{0, 3}}; - auto dlpack_wrapper = cuda::mdspan_to_dlpack(m); + auto dlpack_wrapper = cuda::to_dlpack(m); + const auto& dltensor = dlpack_wrapper.get(); - assert(dlpack_wrapper->device.device_type == kDLCPU); - assert(dlpack_wrapper->device.device_id == 0); - check_datatype(dlpack_wrapper->dtype, kDLInt, 32, 1); - assert(dlpack_wrapper->ndim == 2); - assert(dlpack_wrapper->shape[0] == 0); - assert(dlpack_wrapper->shape[1] == 3); - assert(dlpack_wrapper->strides[0] == 3); - assert(dlpack_wrapper->strides[1] == 1); - assert(dlpack_wrapper->byte_offset == 0); - assert(dlpack_wrapper->data == nullptr); // size() == 0 => nullptr + assert(dltensor.device.device_type == kDLCPU); + assert(dltensor.device.device_id == 0); + check_datatype(dltensor.dtype, kDLInt, 32, 1); + assert(dltensor.ndim == 2); + assert(dltensor.shape[0] == 0); + assert(dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 3); + assert(dltensor.strides[1] == 1); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == nullptr); // size() == 0 => nullptr return true; } @@ -93,16 +96,17 @@ bool test_mdspan_to_dlpack_rank_0() using extents_t = cuda::std::extents; int data[1] = {7}; cuda::host_mdspan md{data, extents_t{}}; - auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); - assert(dlpack_wrapper->device.device_type == kDLCPU); - assert(dlpack_wrapper->device.device_id == 0); - check_datatype(dlpack_wrapper->dtype, kDLInt, 32, 1); - assert(dlpack_wrapper->ndim == 0); - assert(dlpack_wrapper->shape == nullptr); - assert(dlpack_wrapper->strides == nullptr); - assert(dlpack_wrapper->byte_offset == 0); - assert(dlpack_wrapper->data == data); // rank-0 mdspan has size() == 1 + assert(dltensor.device.device_type == kDLCPU); 
+ assert(dltensor.device.device_id == 0); + check_datatype(dltensor.dtype, kDLInt, 32, 1); + assert(dltensor.ndim == 0); + assert(dltensor.shape == nullptr); + assert(dltensor.strides == nullptr); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); // rank-0 mdspan has size() == 1 return true; } @@ -111,20 +115,21 @@ bool test_mdspan_to_dlpack_const_pointer() using extents_t = cuda::std::dims<3>; const int data[6] = {0, 1, 2, 3, 4, 5}; cuda::host_mdspan md{data, extents_t{2, 3, 4}}; - auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); - assert(dlpack_wrapper->device.device_type == kDLCPU); - assert(dlpack_wrapper->device.device_id == 0); - check_datatype(dlpack_wrapper->dtype, kDLInt, 32, 1); - assert(dlpack_wrapper->ndim == 3); - assert(dlpack_wrapper->shape[0] == 2); - assert(dlpack_wrapper->shape[1] == 3); - assert(dlpack_wrapper->shape[2] == 4); - assert(dlpack_wrapper->strides[0] == 12); - assert(dlpack_wrapper->strides[1] == 4); - assert(dlpack_wrapper->strides[2] == 1); - assert(dlpack_wrapper->byte_offset == 0); - assert(dlpack_wrapper->data == data); // rank-0 mdspan has size() == 1 + assert(dltensor.device.device_type == kDLCPU); + assert(dltensor.device.device_id == 0); + check_datatype(dltensor.dtype, kDLInt, 32, 1); + assert(dltensor.ndim == 3); + assert(dltensor.shape[0] == 2); + assert(dltensor.shape[1] == 3); + assert(dltensor.shape[2] == 4); + assert(dltensor.strides[0] == 12); + assert(dltensor.strides[1] == 4); + assert(dltensor.strides[2] == 1); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); // rank-0 mdspan has size() == 1 return true; } @@ -134,18 +139,19 @@ bool test_mdspan_to_dlpack_device() float* data = nullptr; assert(cudaMalloc(&data, 6 * sizeof(float)) == cudaSuccess); cuda::device_mdspan md{data, extents_t{}}; - auto dlpack_wrapper = cuda::mdspan_to_dlpack(md, cuda::device_ref{0}); + auto dlpack_wrapper = cuda::to_dlpack(md, cuda::device_ref{0}); + const auto& dltensor = dlpack_wrapper.get(); - assert(dlpack_wrapper->device.device_type == kDLCUDA); - assert(dlpack_wrapper->device.device_id == 0); - assert(dlpack_wrapper->ndim == 2); - check_datatype(dlpack_wrapper->dtype, kDLFloat, 32, 1); - assert(dlpack_wrapper->shape[0] == 2); - assert(dlpack_wrapper->shape[1] == 3); - assert(dlpack_wrapper->strides[0] == 3); - assert(dlpack_wrapper->strides[1] == 1); - assert(dlpack_wrapper->byte_offset == 0); - assert(dlpack_wrapper->data == data); + assert(dltensor.device.device_type == kDLCUDA); + assert(dltensor.device.device_id == 0); + assert(dltensor.ndim == 2); + check_datatype(dltensor.dtype, kDLFloat, 32, 1); + assert(dltensor.shape[0] == 2); + assert(dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 3); + assert(dltensor.strides[1] == 1); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); return true; } @@ -155,18 +161,19 @@ bool test_mdspan_to_dlpack_managed() float* data = nullptr; assert(cudaMallocManaged(&data, 6 * sizeof(float)) == cudaSuccess); cuda::managed_mdspan md{data, extents_t{}}; - auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); - assert(dlpack_wrapper->device.device_type == kDLCUDAManaged); - assert(dlpack_wrapper->device.device_id == 0); - assert(dlpack_wrapper->ndim == 2); - check_datatype(dlpack_wrapper->dtype, kDLFloat, 32, 1); - assert(dlpack_wrapper->shape[0] == 2); - 
assert(dlpack_wrapper->shape[1] == 3); - assert(dlpack_wrapper->strides[0] == 3); - assert(dlpack_wrapper->strides[1] == 1); - assert(dlpack_wrapper->byte_offset == 0); - assert(dlpack_wrapper->data == data); + assert(dltensor.device.device_type == kDLCUDAManaged); + assert(dltensor.device.device_id == 0); + assert(dltensor.ndim == 2); + check_datatype(dltensor.dtype, kDLFloat, 32, 1); + assert(dltensor.shape[0] == 2); + assert(dltensor.shape[1] == 3); + assert(dltensor.strides[0] == 3); + assert(dltensor.strides[1] == 1); + assert(dltensor.byte_offset == 0); + assert(dltensor.data == data); return true; } @@ -184,10 +191,11 @@ struct test_mdspan_to_dlpack_types_fn using extents_t = cuda::std::extents; T* data = nullptr; cuda::host_mdspan md{data, extents_t{}}; - auto dlpack_wrapper = cuda::mdspan_to_dlpack(md); + auto dlpack_wrapper = cuda::to_dlpack(md); + const auto& dltensor = dlpack_wrapper.get(); auto type = expected_types[index]; - check_datatype(dlpack_wrapper->dtype, type.code, type.bits, type.lanes); + check_datatype(dltensor.dtype, type.code, type.bits, type.lanes); } template diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp index 0d17e582fe5..58c6f99d9c8 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp @@ -27,7 +27,7 @@ void check_datatype(const DLDataType& dt, uint8_t code, uint8_t bits, uint16_t l bool test_mdspan_to_dlpack_wrapper_default_ctor() { - cuda::DLPackWrapper<3> dlpack_wrapper{}; + cuda::dlpack_tensor<3> dlpack_wrapper{}; DLDataType default_dtype = {}; DLDevice default_device = {}; auto& tensor = dlpack_wrapper.get(); @@ -44,32 +44,34 @@ bool test_dlpack_wrapper_copy_ctor() using extents_t = cuda::std::extents; int data[6] = {0, 1, 2, 3, 4, 5}; cuda::host_mdspan md{data, extents_t{}}; - auto w = cuda::mdspan_to_dlpack(md); - auto* shape_ptr = w->shape; - auto* strides_ptr = w->strides; + auto w = cuda::to_dlpack(md); + auto& t = w.get(); + auto* shape_ptr = t.shape; + auto* strides_ptr = t.strides; auto w2 = w; // copy construct // Copy must not alias the source wrapper's shape/stride storage. - assert(w2->shape != nullptr); - assert(w2->strides != nullptr); - assert(w2->shape != shape_ptr); - assert(w2->strides != strides_ptr); + auto& t2 = w2.get(); + assert(t2.shape != nullptr); + assert(t2.strides != nullptr); + assert(t2.shape != shape_ptr); + assert(t2.strides != strides_ptr); // Source wrapper must remain intact. - assert(w->shape == shape_ptr); - assert(w->strides == strides_ptr); + assert(t.shape == shape_ptr); + assert(t.strides == strides_ptr); // Sanity-check copied tensor metadata and values. 
- assert(w2->device.device_type == kDLCPU); - assert(w2->device.device_id == 0); - assert(w2->ndim == 2); - check_datatype(w2->dtype, kDLInt, 32, 1); - assert(w2->shape[0] == 2); - assert(w2->shape[1] == 3); - assert(w2->strides[0] == 3); - assert(w2->strides[1] == 1); - assert(w2->byte_offset == 0); - assert(w2->data == data); + assert(t2.device.device_type == kDLCPU); + assert(t2.device.device_id == 0); + assert(t2.ndim == 2); + check_datatype(t2.dtype, kDLInt, 32, 1); + assert(t2.shape[0] == 2); + assert(t2.shape[1] == 3); + assert(t2.strides[0] == 3); + assert(t2.strides[1] == 1); + assert(t2.byte_offset == 0); + assert(t2.data == data); return true; } @@ -78,34 +80,36 @@ bool test_dlpack_wrapper_move_ctor() using extents_t = cuda::std::extents; int data[6] = {0, 1, 2, 3, 4, 5}; cuda::host_mdspan md{data, extents_t{}}; - auto w = cuda::mdspan_to_dlpack(md); - auto* shape_ptr = w->shape; - auto* strides_ptr = w->strides; + auto w = cuda::to_dlpack(md); + auto& t = w.get(); + auto* shape_ptr = t.shape; + auto* strides_ptr = t.strides; auto moved = cuda::std::move(w); // move construct // Moved-to wrapper must not keep pointers to moved-from storage. - assert(moved->shape != nullptr); - assert(moved->strides != nullptr); - assert(moved->shape != shape_ptr); - assert(moved->strides != strides_ptr); + auto& tm = moved.get(); + assert(tm.shape != nullptr); + assert(tm.strides != nullptr); + assert(tm.shape != shape_ptr); + assert(tm.strides != strides_ptr); // Moved-from wrapper is explicitly reset to a default/empty DLTensor. - assert(w->shape == nullptr); - assert(w->strides == nullptr); - assert(w->data == nullptr); - assert(w->ndim == 0); + assert(t.shape == nullptr); + assert(t.strides == nullptr); + assert(t.data == nullptr); + assert(t.ndim == 0); // Sanity-check moved-to tensor metadata and values. - assert(moved->device.device_type == kDLCPU); - assert(moved->device.device_id == 0); - assert(moved->ndim == 2); - check_datatype(moved->dtype, kDLInt, 32, 1); - assert(moved->shape[0] == 2); - assert(moved->shape[1] == 3); - assert(moved->strides[0] == 3); - assert(moved->strides[1] == 1); - assert(moved->byte_offset == 0); - assert(moved->data == data); + assert(tm.device.device_type == kDLCPU); + assert(tm.device.device_id == 0); + assert(tm.ndim == 2); + check_datatype(tm.dtype, kDLInt, 32, 1); + assert(tm.shape[0] == 2); + assert(tm.shape[1] == 3); + assert(tm.strides[0] == 3); + assert(tm.strides[1] == 1); + assert(tm.byte_offset == 0); + assert(tm.data == data); return true; } @@ -116,25 +120,27 @@ bool test_dlpack_wrapper_copy_assignment() int data_b[6] = {6, 7, 8, 9, 10, 11}; cuda::host_mdspan md_a{data_a, extents_t{}}; cuda::host_mdspan md_b{data_b, extents_t{}}; - auto a = cuda::mdspan_to_dlpack(md_a); - auto b = cuda::mdspan_to_dlpack(md_b); - auto* b_shape_ptr = b->shape; - auto* b_strides_ptr = b->strides; + auto a = cuda::to_dlpack(md_a); + auto b = cuda::to_dlpack(md_b); + auto& ta = a.get(); + auto& tb = b.get(); + auto* b_shape_ptr = tb.shape; + auto* b_strides_ptr = tb.strides; b = a; // copy assign // Destination must keep pointing to its own member arrays (not to `a`). - assert(b->shape == b_shape_ptr); - assert(b->strides == b_strides_ptr); - assert(b->shape != a->shape); - assert(b->strides != a->strides); + assert(tb.shape == b_shape_ptr); + assert(tb.strides == b_strides_ptr); + assert(tb.shape != ta.shape); + assert(tb.strides != ta.strides); // Values must be copied correctly. 
- assert(b->data == data_a); - assert(b->ndim == 2); - assert(b->shape[0] == 2); - assert(b->shape[1] == 3); - assert(b->strides[0] == 3); - assert(b->strides[1] == 1); + assert(tb.data == data_a); + assert(tb.ndim == 2); + assert(tb.shape[0] == 2); + assert(tb.shape[1] == 3); + assert(tb.strides[0] == 3); + assert(tb.strides[1] == 1); return true; } @@ -145,54 +151,53 @@ bool test_dlpack_wrapper_move_assignment() int data_b[6] = {6, 7, 8, 9, 10, 11}; cuda::host_mdspan md_a{data_a, extents_t{}}; cuda::host_mdspan md_b{data_b, extents_t{}}; - auto a = cuda::mdspan_to_dlpack(md_a); - auto b = cuda::mdspan_to_dlpack(md_b); - auto* a_shape_ptr = a->shape; - auto* a_strides_ptr = a->strides; - auto* b_shape_ptr = b->shape; - auto* b_strides_ptr = b->strides; + auto a = cuda::to_dlpack(md_a); + auto b = cuda::to_dlpack(md_b); + auto& ta = a.get(); + auto& tb = b.get(); + auto* a_shape_ptr = ta.shape; + auto* a_strides_ptr = ta.strides; + auto* b_shape_ptr = tb.shape; + auto* b_strides_ptr = tb.strides; b = cuda::std::move(a); // move assign // Destination must keep pointing to its own member arrays, not the source's. - assert(b->shape == b_shape_ptr); - assert(b->strides == b_strides_ptr); - assert(b->shape != a_shape_ptr); - assert(b->strides != a_strides_ptr); + assert(tb.shape == b_shape_ptr); + assert(tb.strides == b_strides_ptr); + assert(tb.shape != a_shape_ptr); + assert(tb.strides != a_strides_ptr); // Source must be reset. - assert(a->shape == nullptr); - assert(a->strides == nullptr); - assert(a->data == nullptr); - assert(a->ndim == 0); + assert(ta.shape == nullptr); + assert(ta.strides == nullptr); + assert(ta.data == nullptr); + assert(ta.ndim == 0); // Values must be moved correctly. - assert(b->data == data_a); - assert(b->ndim == 2); - assert(b->shape[0] == 2); - assert(b->shape[1] == 3); - assert(b->strides[0] == 3); - assert(b->strides[1] == 1); + assert(tb.data == data_a); + assert(tb.ndim == 2); + assert(tb.shape[0] == 2); + assert(tb.shape[1] == 3); + assert(tb.strides[0] == 3); + assert(tb.strides[1] == 1); return true; } bool test_dlpack_wrapper_get() { - using wrapper_t = cuda::DLPackWrapper<2>; + using wrapper_t = cuda::dlpack_tensor<2>; static_assert(cuda::std::is_same_v().get()), ::DLTensor&>); static_assert(cuda::std::is_same_v().get()), const ::DLTensor&>); wrapper_t w{}; - // `get()` must return a reference to the same underlying `DLTensor` as `operator->()`. - assert(&w.get() == w.operator->()); - - // Mutating through the reference returned by `get()` must be observable through `operator->()`. + // Mutating through the reference returned by `get()` must be observable. auto& t = w.get(); t.ndim = 123; - assert(w->ndim == 123); + assert(w.get().ndim == 123); // Const overload should also alias the same underlying object. 
const wrapper_t& cw = w; - assert(&cw.get() == cw.operator->()); + assert(&cw.get() == &w.get()); return true; } From f2903200f7423c4045b565a562128b50c5a75843 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 19 Dec 2025 12:50:59 -0800 Subject: [PATCH 07/56] formatting --- docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst index 0213aab28ab..c7cb9fe2f87 100644 --- a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst +++ b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst @@ -17,16 +17,16 @@ Conversion functions template [[nodiscard]] dlpack_tensor to_dlpack(const cuda::host_mdspan& mdspan); - + template [[nodiscard]] dlpack_tensor to_dlpack(const cuda::device_mdspan& mdspan, cuda::device_ref device = cuda::device_ref{0}); - + template [[nodiscard]] dlpack_tensor to_dlpack(const cuda::managed_mdspan& mdspan); - + } // namespace cuda Types From 7a228481edf458a4e0e2a38cb585d7f892c01c13 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 19 Dec 2025 16:00:31 -0800 Subject: [PATCH 08/56] fix MSVC warning --- libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h index f05631eab19..3e206b9b6b2 100644 --- a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -242,7 +242,7 @@ __to_dlpack(const ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor auto& __tensor = __wrapper.get(); __tensor.data = __mdspan.size() > 0 ? const_cast<__element_type*>(__mdspan.data_handle()) : nullptr; __tensor.device = ::DLDevice{__device_type, __device_id}; - __tensor.ndim = __mdspan.rank(); + __tensor.ndim = static_cast(__mdspan.rank()); __tensor.dtype = ::cuda::__data_type_to_dlpack<::cuda::std::remove_cv_t<_ElementType>>(); if constexpr (_Extents::rank() > 0) { From f78db300dbf5e1ec4f755028fcbec566f5e8dac7 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 19 Dec 2025 16:06:02 -0800 Subject: [PATCH 09/56] improve documentation --- .../extended_api/mdspan/mdspan_to_dlpack.rst | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst index c7cb9fe2f87..75d089a6d13 100644 --- a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst +++ b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst @@ -3,7 +3,7 @@ ``mdspan`` to DLPack ==================== -This functionality provides a conversion from ``cuda::host_mdspan``, ``cuda::device_mdspan``, and ``cuda::managed_mdspan`` to a `DLPack `__ ``DLTensor`` view. +This functionality provides a conversion from ``cuda::host_mdspan``, ``cuda::device_mdspan``, and ``cuda::managed_mdspan`` to `DLPack `__ ``DLTensor`` view. Defined in the ```` header. @@ -48,17 +48,17 @@ Types DLTensor& get() noexcept; const DLTensor& get() const noexcept; - }; + }; } // namespace cuda ``cuda::dlpack_tensor`` stores a ``DLTensor`` and owns the backing storage for its ``shape`` and ``strides`` pointers. The class does not use any heap allocation. -.. note:: Lifetime +.. note:: **Lifetime** The ``DLTensor`` associated with ``cuda::dlpack_tensor`` must not outlive the wrapper. 
If the wrapper is destroyed, the returned ``DLTensor::shape`` and ``DLTensor::strides`` pointers will dangle. -.. note:: Const-correctness +.. note:: **Const-correctness** ``DLTensor::data`` points at ``mdspan.data_handle()`` (or is ``nullptr`` if ``mdspan.size() == 0``). If ``T`` is ``const``, the pointer is ``const_cast``'d because ``DLTensor::data`` is unqualified. @@ -79,27 +79,27 @@ The conversion produces a non-owning DLPack view of the ``mdspan`` data and meta Element types are mapped to ``DLDataType`` according to the DLPack conventions, including: +- ``bool``. - Signed and unsigned integers. -- IEEE-754 Floating-point and extended precision floating-point, including ``__half``, ``__nv_bfloat16``, FP8, FP6, FP4 when available. +- IEEE-754 Floating-point and extended precision floating-point, including ``__half``, ``__nv_bfloat16``, ``__float128``, FP8, FP6, FP4 when available. - Complex: ``cuda::std::complex<__half>``, ``cuda::std::complex``, and ``cuda::std::complex``. -- `CUDA built-in vector types `__, such as ``int2``, ``float4``, etc.. +- `CUDA built-in vector types `__, such as ``int2``, ``float4``, etc. +- Vector types for extended floating-point, such as ``__half2``, ``__nv_fp8x4_e4m3``, etc. -Constraints and errors ----------------------- - -**Constraints** +Constraints +----------- - The accessor ``data_handle_type`` must be a pointer type. -**Runtime errors** +Runtime errors +-------------- - If any ``extent(i)`` or ``stride(i)`` cannot be represented in ``int64_t``, the conversion raises an exception. Availability notes ------------------ -- This API is available only when DLPack headers are present (```` is found in the include path). -* ``dlpack/dlpack.h`` (`DLPack v1 `__) must be discoverable at compile time, namely available in the include path. +- This API is available only when DLPack header is present, namely ```` is found in the include path. References ---------- @@ -111,14 +111,13 @@ Example .. 
code:: cuda - #include - #include - #include - #include + #include + #include + #include int main() { - using extents_t = cuda::std::extents; + using extents_t = cuda::std::extents; int data[6] = {0, 1, 2, 3, 4, 5}; cuda::host_mdspan md{data, extents_t{}}; From 1467ab23a447675fa4faefc95bc63d5df0d526bd Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 19 Dec 2025 16:57:54 -0800 Subject: [PATCH 10/56] fix MSVC warning --- libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h index 3e206b9b6b2..f8ef04bf1aa 100644 --- a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -151,6 +151,7 @@ template { static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported type"); } + _CCCL_UNREACHABLE(); return ::DLDataType{}; } From d844f65254ae64e31e2663f4582b7324010fc10d Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 19 Dec 2025 16:59:26 -0800 Subject: [PATCH 11/56] first version --- .../include/cuda/__mdspan/dlpack_to_mdspan.h | 156 ++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h new file mode 100644 index 00000000000..ae9be140f26 --- /dev/null +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -0,0 +1,156 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___MDSPAN_DLPACK_TO_MDSPAN_H +#define _CUDA___MDSPAN_DLPACK_TO_MDSPAN_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() + +# include +# include +# include +# include +# include +# include + +# include + +# include +// +# include + +_CCCL_BEGIN_NAMESPACE_CUDA + +static_assert(DLPACK_MAJOR_VERSION == 1, "DLPACK_MAJOR_VERSION must be 1"); + +template +[[nodiscard]] _CCCL_HOST_API inline bool __validate_dlpack_data_type(const ::DLDataType& __dtype) noexcept +{ + const auto __expected = ::cuda::__data_type_to_dlpack<_ElementType>(); + return __dtype.code == __expected.code && __dtype.bits == __expected.bits && __dtype.lanes == __expected.lanes; +} + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::std::mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +__to_mdspan(const ::DLTensor& __tensor) +{ + using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __mdspan_type = ::cuda::std::mdspan<_ElementType, __extents_type, _LayoutPolicy>; + using __mapping_type = typename _LayoutPolicy::template mapping<__extents_type>; + using __element_type = typename __mdspan_type::element_type; + if (__tensor.ndim != int{_Rank}) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor rank does not match expected rank"}); + } + if (!::cuda::__validate_dlpack_data_type<__element_type>(__tensor.dtype)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data type does not match expected type"}); + } + auto __base_data = static_cast(__tensor.data) + __tensor.byte_offset; + auto __data = reinterpret_cast<__element_type*>(__base_data); + if constexpr (_Rank == 0) + { + return __mdspan_type{__data, __mapping_type{}}; + } + else if constexpr (::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>) + { + using ::cuda::std::int64_t; + using ::cuda::std::size_t; + ::cuda::std::array __extents_arr{}; + ::cuda::std::array __strides_arr{}; + for (size_t __i = 0; __i < _Rank; ++__i) + { + __extents_arr[__i] = __tensor.shape[__i]; + // strides == nullptr means row-major (C-contiguous) layout + if (__tensor.strides != nullptr) + { + __strides_arr[__i] = __tensor.strides[__i]; + } + else + { + __strides_arr[__i] = 1; + for (size_t __j = __i + 1; __j < _Rank; ++__j) + { + __strides_arr[__i] *= __tensor.shape[__j]; + } + } + } + __extents_type __extents{__extents_arr}; + __mapping_type __mapping{__extents, __strides_arr}; + return __mdspan_type{__data, __mapping}; + } + else + { + static_assert(::cuda::std::__always_false_v<_LayoutPolicy>, "Unsupported layout policy"); + } +} + +/*********************************************************************************************************************** + * Public API + **********************************************************************************************************************/ + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::host_mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +to_host_mdspan(const ::DLTensor& __tensor) +{ + if (__tensor.device.device_type != ::kDLCPU) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCPU for host_mdspan"}); + } + using __extents_type = 
::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __mdspan_type = ::cuda::host_mdspan<_ElementType, __extents_type, _LayoutPolicy>; + return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; +} + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::device_mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +to_device_mdspan(const ::DLTensor& __tensor) +{ + if (__tensor.device.device_type != ::kDLCUDA) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCUDA for device_mdspan"}); + } + using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __mdspan_type = ::cuda::device_mdspan<_ElementType, __extents_type, _LayoutPolicy>; + return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; +} + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::managed_mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +to_managed_mdspan(const ::DLTensor& __tensor) +{ + if (__tensor.device.device_type != ::kDLCUDAManaged) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCUDAManaged for managed_mdspan"}); + } + using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __mdspan_type = ::cuda::managed_mdspan<_ElementType, __extents_type, _LayoutPolicy>; + return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; +} + +_CCCL_END_NAMESPACE_CUDA + +# include + +#endif // !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() +#endif // _CUDA___MDSPAN_DLPACK_TO_MDSPAN_H From 3843556549317447e7d3d57ff6695eea289f7262 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 22 Dec 2025 16:27:59 -0800 Subject: [PATCH 12/56] complete the implementation --- .../include/cuda/__mdspan/dlpack_to_mdspan.h | 174 +++++++--- .../include/cuda/__mdspan/mdspan_to_dlpack.h | 303 ++++++++++++++++++ 2 files changed, 441 insertions(+), 36 deletions(-) create mode 100644 libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h index ae9be140f26..44c1ea89fa2 100644 --- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -23,10 +23,13 @@ #if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() # include -# include +// # include // __data_type_to_dlpack +# include # include # include +# include # include +# include # include # include @@ -46,60 +49,159 @@ template return __dtype.code == __expected.code && __dtype.bits == __expected.bits && __dtype.lanes == __expected.lanes; } -template [[nodiscard]] -_CCCL_HOST_API ::cuda::std::mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> -__to_mdspan(const ::DLTensor& __tensor) +_CCCL_HOST_API inline ::cuda::std::int64_t __layout_right_stride( + const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t __pos, ::cuda::std::size_t __rank) noexcept { - using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; - using __mdspan_type = ::cuda::std::mdspan<_ElementType, __extents_type, _LayoutPolicy>; - using __mapping_type = typename _LayoutPolicy::template mapping<__extents_type>; - using __element_type = typename __mdspan_type::element_type; - if (__tensor.ndim != int{_Rank}) + ::cuda::std::int64_t __stride = 1; + for (auto __i = __pos + 1; __i < __rank; ++__i) { - _CCCL_THROW(::std::invalid_argument{"DLTensor rank does not match expected 
rank"}); + __stride *= __shapes[__i]; // TODO: check for overflow } - if (!::cuda::__validate_dlpack_data_type<__element_type>(__tensor.dtype)) + return __stride; +} + +[[nodiscard]] +_CCCL_HOST_API inline ::cuda::std::int64_t +__layout_left_stride(const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t __pos) noexcept +{ + ::cuda::std::int64_t __stride = 1; + for (::cuda::std::size_t __i = 0; __i < __pos; ++__i) { - _CCCL_THROW(::std::invalid_argument{"DLTensor data type does not match expected type"}); + __stride *= __shapes[__i]; // TODO: check for overflow } - auto __base_data = static_cast(__tensor.data) + __tensor.byte_offset; - auto __data = reinterpret_cast<__element_type*>(__base_data); - if constexpr (_Rank == 0) + return __stride; +} + +template +_CCCL_HOST_API void __validate_dlpack_strides(const ::DLTensor& __tensor, [[maybe_unused]] ::cuda::std::size_t __rank) +{ + constexpr bool __is_layout_right = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_right>; + constexpr bool __is_layout_left = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_left>; + constexpr bool __is_layout_stride = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>; + const auto __strides_ptr = __tensor.strides; + if (__strides_ptr == nullptr) { - return __mdspan_type{__data, __mapping_type{}}; +# if DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) + _CCCL_THROW(::std::invalid_argument{"strides=nullptr is not supported for DLPack v1.2 and later"}); +# else + // strides == nullptr means row-major (C-contiguous) layout + if (__is_layout_left && __rank > 1) + { + _CCCL_THROW(::std::invalid_argument{"strides must be non-null for layout_left"}); + } + else + { + return; + } +# endif // DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) } - else if constexpr (::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>) + for (::cuda::std::size_t __pos = 0; __pos < __rank; ++__pos) { - using ::cuda::std::int64_t; - using ::cuda::std::size_t; - ::cuda::std::array __extents_arr{}; - ::cuda::std::array __strides_arr{}; - for (size_t __i = 0; __i < _Rank; ++__i) + if constexpr (__is_layout_right) { - __extents_arr[__i] = __tensor.shape[__i]; - // strides == nullptr means row-major (C-contiguous) layout - if (__tensor.strides != nullptr) + if (__strides_ptr[__pos] != ::cuda::__layout_right_stride(__tensor.shape, __pos, __rank)) { - __strides_arr[__i] = __tensor.strides[__i]; + _CCCL_THROW(::std::invalid_argument{"DLTensor strides are not compatible with layout_right"}); } - else + } + else if constexpr (__is_layout_left) + { + if (__strides_ptr[__pos] != ::cuda::__layout_left_stride(__tensor.shape, __pos)) { - __strides_arr[__i] = 1; - for (size_t __j = __i + 1; __j < _Rank; ++__j) - { - __strides_arr[__i] *= __tensor.shape[__j]; - } + _CCCL_THROW(::std::invalid_argument{"DLTensor strides are not compatible with layout_left"}); + } + } + else if constexpr (__is_layout_stride) + { + if (__strides_ptr[__pos] <= 0) + { + _CCCL_THROW(::std::invalid_argument{"mdspan strides must be positive"}); } } - __extents_type __extents{__extents_arr}; - __mapping_type __mapping{__extents, __strides_arr}; - return __mdspan_type{__data, __mapping}; } - else +} + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::std::mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +__to_mdspan(const ::DLTensor& __tensor) +{ + using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using 
__mdspan_type = ::cuda::std::mdspan<_ElementType, __extents_type, _LayoutPolicy>; + using __mapping_type = typename _LayoutPolicy::template mapping<__extents_type>; + using __element_type = typename __mdspan_type::element_type; + constexpr bool __is_layout_right = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_right>; + constexpr bool __is_layout_left = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_left>; + constexpr bool __is_layout_stride = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>; + // TODO: add support for layout_right_padded and layout_left_padded + if constexpr (!__is_layout_right && !__is_layout_left && !__is_layout_stride) { static_assert(::cuda::std::__always_false_v<_LayoutPolicy>, "Unsupported layout policy"); } + else + { + if (__tensor.ndim != int{_Rank}) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor rank does not match expected rank"}); + } + if (!::cuda::__validate_dlpack_data_type<__element_type>(__tensor.dtype)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data type does not match expected type"}); + } + if (__tensor.data == nullptr) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data must be non-null"}); + } + auto __base_data = static_cast(__tensor.data) + __tensor.byte_offset; + auto __data = reinterpret_cast<__element_type*>(__base_data); + const auto __datatype_size = __tensor.dtype.bits * __tensor.dtype.lanes / 8; + // this is not the exact solution because data type size != data type alignment. + // However, it always works for the supported data types. + if (__datatype_size > 0 && !::cuda::is_aligned(__data, __datatype_size)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data must be aligned to the data type"}); + } + if constexpr (_Rank == 0) + { + return __mdspan_type{__data, __mapping_type{}}; + } + else // Rank > 0 + { + if (__tensor.shape == nullptr) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor shape must be non-null"}); + } + using ::cuda::std::int64_t; + using ::cuda::std::size_t; + ::cuda::std::array __extents_array{}; + for (size_t __i = 0; __i < _Rank; ++__i) + { + if (__tensor.shape[__i] < 0) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor shape must be positive"}); + } + __extents_array[__i] = __tensor.shape[__i]; + } + ::cuda::__validate_dlpack_strides<_LayoutPolicy>(__tensor, _Rank); + if constexpr (__is_layout_stride) + { + ::cuda::std::array __strides_array{}; + for (size_t __i = 0; __i < _Rank; ++__i) + { + const bool __has_strides = __tensor.strides != nullptr; + __strides_array[__i] = + __has_strides ? __tensor.strides[__i] : ::cuda::__layout_right_stride(__tensor.shape, __i, _Rank); + } + return __mdspan_type{__data, __mapping_type{__extents_array, __strides_array}}; + } + else + { + __extents_type __extents{__extents_array}; + return __mdspan_type{__data, __extents}; + } + } + } } /*********************************************************************************************************************** diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h new file mode 100644 index 00000000000..f8ef04bf1aa --- /dev/null +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -0,0 +1,303 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H +#define _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +# include +// +# include + +_CCCL_BEGIN_NAMESPACE_CUDA + +static_assert(DLPACK_MAJOR_VERSION == 1, "DLPACK_MAJOR_VERSION must be 1"); + +template +[[nodiscard]] _CCCL_HOST_API inline ::DLDataType __data_type_to_dlpack() noexcept +{ + if constexpr (::cuda::std::is_same_v<_ElementType, bool>) + { + return ::DLDataType{::kDLBool, 8, 1}; + } + //-------------------------------------------------------------------------------------------------------------------- + // Signed integer types + else if constexpr (::cuda::std::__cccl_is_integer_v<_ElementType>) + { + return ::DLDataType{ + (::cuda::std::is_signed_v<_ElementType>) ? ::kDLInt : ::kDLUInt, ::cuda::std::__num_bits_v<_ElementType>, 1}; + } + //-------------------------------------------------------------------------------------------------------------------- + // bfloat16 (must come before general floating-point) +# if _CCCL_HAS_NVBF16() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_bfloat16>) + { + return ::DLDataType{::kDLBfloat, 16, 1}; + } +# endif // _CCCL_HAS_NVBF16() + //-------------------------------------------------------------------------------------------------------------------- + // Low-precision Floating-point types (must come before general floating-point) +# if _CCCL_HAS_NVFP8_E4M3() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e4m3>) + { + return ::DLDataType{::kDLFloat8_e4m3fn, 8, 1}; + } +# endif // _CCCL_HAS_NVFP8_E4M3() +# if _CCCL_HAS_NVFP8_E5M2() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e5m2>) + { + return ::DLDataType{::kDLFloat8_e5m2, 8, 1}; + } +# endif // _CCCL_HAS_NVFP8_E5M2() +# if _CCCL_HAS_NVFP8_E8M0() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp8_e8m0>) + { + return ::DLDataType{::kDLFloat8_e8m0fnu, 8, 1}; + } +# endif // _CCCL_HAS_NVFP8_E8M0() +# if _CCCL_HAS_NVFP6_E2M3() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp6_e2m3>) + { + return ::DLDataType{::kDLFloat6_e2m3fn, 6, 1}; + } +# endif // _CCCL_HAS_NVFP6_E2M3() +# if _CCCL_HAS_NVFP6_E3M2() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp6_e3m2>) + { + return ::DLDataType{::kDLFloat6_e3m2fn, 6, 1}; + } +# endif // _CCCL_HAS_NVFP6_E3M2() +# if _CCCL_HAS_NVFP4_E2M1() + else if constexpr (::cuda::std::is_same_v<_ElementType, ::__nv_fp4_e2m1>) + { + return ::DLDataType{::kDLFloat4_e2m1fn, 4, 1}; + } +# endif // _CCCL_HAS_NVFP4_E2M1() + //-------------------------------------------------------------------------------------------------------------------- + // Floating-point types (after specific types) + else if constexpr (::cuda::is_floating_point_v<_ElementType>) + { + return 
::DLDataType{::kDLFloat, ::cuda::std::__num_bits_v<_ElementType>, 1}; + } + //-------------------------------------------------------------------------------------------------------------------- + // Complex types + // 256-bit data types are not supported in DLPack, e.g. cuda::std::complex<__float128> + else if constexpr (::cuda::std::__is_cuda_std_complex_v<_ElementType> && sizeof(_ElementType) <= sizeof(double) * 2) + { + // DLPack encodes complex numbers as a compact struct of two scalar values, and `bits` stores + // the size of the full complex number (e.g. std::complex => bits=64). + return ::DLDataType{::kDLComplex, sizeof(_ElementType) * CHAR_BIT, 1}; + } + //-------------------------------------------------------------------------------------------------------------------- + // CUDA built-in vector types +# if _CCCL_HAS_CTK() + else if constexpr (::cuda::__is_vector_type_v<_ElementType> || ::cuda::__is_extended_fp_vector_type_v<_ElementType>) + { + constexpr ::cuda::std::uint16_t __lanes = ::cuda::std::tuple_size_v<_ElementType>; + if constexpr (__lanes == 2 || __lanes == 4) + { + using __scalar_t = ::cuda::std::remove_cv_t<::cuda::std::tuple_element_t<0, _ElementType>>; + auto __scalar = ::cuda::__data_type_to_dlpack<__scalar_t>(); + __scalar.lanes = __lanes; + return __scalar; + } + else + { + static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported vector type"); + } + } +# endif // _CCCL_HAS_CTK() + //-------------------------------------------------------------------------------------------------------------------- + // Unsupported types + else + { + static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported type"); + } + _CCCL_UNREACHABLE(); + return ::DLDataType{}; +} + +template <::cuda::std::size_t _Rank> +class dlpack_tensor +{ + ::cuda::std::array<::cuda::std::int64_t, _Rank> __shape{}; + ::cuda::std::array<::cuda::std::int64_t, _Rank> __strides{}; + ::DLTensor __tensor{}; + + _CCCL_HOST_API void __update_tensor() noexcept + { + __tensor.shape = _Rank > 0 ? __shape.data() : nullptr; + __tensor.strides = _Rank > 0 ? 
__strides.data() : nullptr; + } + +public: + _CCCL_HOST_API explicit dlpack_tensor() noexcept + { + __update_tensor(); + } + + _CCCL_HOST_API dlpack_tensor(const dlpack_tensor& __other) noexcept + : __shape{__other.__shape} + , __strides{__other.__strides} + , __tensor{__other.__tensor} + { + __update_tensor(); + } + + _CCCL_HOST_API dlpack_tensor(dlpack_tensor&& __other) noexcept + : __shape{::cuda::std::move(__other.__shape)} + , __strides{::cuda::std::move(__other.__strides)} + , __tensor{__other.__tensor} + { + __other.__tensor = ::DLTensor{}; + __update_tensor(); + } + + _CCCL_HOST_API dlpack_tensor& operator=(const dlpack_tensor& __other) noexcept + { + if (this == &__other) + { + return *this; + } + __shape = __other.__shape; + __strides = __other.__strides; + __tensor = __other.__tensor; + __update_tensor(); + return *this; + } + + _CCCL_HOST_API dlpack_tensor& operator=(dlpack_tensor&& __other) noexcept + { + if (this == &__other) + { + return *this; + } + __shape = ::cuda::std::move(__other.__shape); + __strides = ::cuda::std::move(__other.__strides); + __tensor = __other.__tensor; + __other.__tensor = ::DLTensor{}; + __update_tensor(); + return *this; + } + + _CCCL_HIDE_FROM_ABI ~dlpack_tensor() noexcept = default; + + [[nodiscard]] _CCCL_HOST_API ::DLTensor& get() noexcept + { + return __tensor; + } + + [[nodiscard]] _CCCL_HOST_API const ::DLTensor& get() const noexcept + { + return __tensor; + } +}; + +template +[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +__to_dlpack(const ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, + ::DLDeviceType __device_type, + int __device_id) +{ + static_assert(::cuda::std::is_pointer_v, "data_handle_type must be a pointer"); + using __element_type = ::cuda::std::remove_cv_t<_ElementType>; + dlpack_tensor<_Extents::rank()> __wrapper{}; + auto& __tensor = __wrapper.get(); + __tensor.data = __mdspan.size() > 0 ? 
const_cast<__element_type*>(__mdspan.data_handle()) : nullptr; + __tensor.device = ::DLDevice{__device_type, __device_id}; + __tensor.ndim = static_cast(__mdspan.rank()); + __tensor.dtype = ::cuda::__data_type_to_dlpack<::cuda::std::remove_cv_t<_ElementType>>(); + if constexpr (_Extents::rank() > 0) + { + constexpr auto __max_extent = ::cuda::std::numeric_limits<::cuda::std::int64_t>::max(); + for (::cuda::std::size_t __i = 0; __i < __mdspan.rank(); ++__i) + { + if (::cuda::std::cmp_greater(__mdspan.extent(__i), __max_extent)) + { + _CCCL_THROW(::std::invalid_argument{"Extent is too large"}); + } + if (::cuda::std::cmp_greater(__mdspan.stride(__i), __max_extent)) + { + _CCCL_THROW(::std::invalid_argument{"Stride is too large"}); + } + __tensor.shape[__i] = static_cast<::cuda::std::int64_t>(__mdspan.extent(__i)); + __tensor.strides[__i] = static_cast<::cuda::std::int64_t>(__mdspan.stride(__i)); + } + } + __tensor.byte_offset = 0; + return __wrapper; +} + +/*********************************************************************************************************************** + * Public API + **********************************************************************************************************************/ + +template +[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +to_dlpack(const ::cuda::host_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) +{ + using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; + return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCPU, 0); +} + +template +[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +to_dlpack(const ::cuda::device_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, + ::cuda::device_ref __device = ::cuda::device_ref{0}) +{ + using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; + return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDA, __device.get()); +} + +template +[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +to_dlpack(const ::cuda::managed_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) +{ + using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; + return ::cuda::__to_dlpack(__mdspan_type{__mdspan}, ::kDLCUDAManaged, 0); +} + +_CCCL_END_NAMESPACE_CUDA + +# include + +#endif // !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() +#endif // _CUDA___MDSPAN_MDSPAN_TO_DLPACK_H From 977909fc85266d14c7d95c07b1f6b773775f42e3 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 22 Dec 2025 17:38:19 -0800 Subject: [PATCH 13/56] add unit test --- .../include/cuda/__mdspan/dlpack_to_mdspan.h | 2 +- libcudacxx/include/cuda/mdspan | 1 + .../dlpack_to_mdspan.pass.cpp | 679 ++++++++++++++++++ 3 files changed, 681 insertions(+), 1 deletion(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h index 44c1ea89fa2..98fdc8ed9b0 100644 --- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -23,7 +23,7 @@ #if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() # include -// # include // __data_type_to_dlpack +# include # include # include # include diff --git a/libcudacxx/include/cuda/mdspan b/libcudacxx/include/cuda/mdspan index ae81a30219a..5cac6773109 100644 --- a/libcudacxx/include/cuda/mdspan +++ b/libcudacxx/include/cuda/mdspan @@ -21,6 +21,7 @@ 
# pragma system_header #endif // no system header +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp new file mode 100644 index 00000000000..b56caa50e1f --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp @@ -0,0 +1,679 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: nvrtc + +#include + +#include +#include +#include +#include +#include + +#include + +#include "test_macros.h" + +template +using dlpack_array = cuda::std::array; + +//============================================================================== +// Test: Rank-0 mdspan conversion +//============================================================================== + +bool test_rank0() +{ + float data = 42.0f; + DLTensor tensor{}; + tensor.data = &data; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 0; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 0); + assert(host_mdspan.size() == 1); + assert(host_mdspan.data_handle() == &data); + assert(host_mdspan() == 42.0f); + return true; +} + +//============================================================================== +// Test: Empty tensor (zero in one dimension) +//============================================================================== + +bool test_empty_tensor() +{ + int dummy = 0; // Non-null but won't be accessed + dlpack_array<2> shape = {0, 5}; + dlpack_array<2> strides = {5, 1}; // row-major + DLTensor tensor{}; + tensor.data = &dummy; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.extent(0) == 0); + assert(host_mdspan.extent(1) == 5); + assert(host_mdspan.size() == 0); + assert(host_mdspan.empty()); + return true; +} + +//============================================================================== +// Test: Rank-1 mdspan with layout_right (row-major) +//============================================================================== + +bool test_rank1() +{ + cuda::std::array data = {1, 2, 3, 4, 5}; + dlpack_array<1> shape = {5}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = ::DLDataType{::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan_right = cuda::to_host_mdspan(tensor); + auto host_mdspan_left = cuda::to_host_mdspan(tensor); + auto host_mdspan_stride = cuda::to_host_mdspan(tensor); + + assert(host_mdspan_right.rank() == 1); + assert(host_mdspan_right.extent(0) == 5); + assert(host_mdspan_right.stride(0) == 1); + for (int i = 0; i < 5; ++i) + { + assert(host_mdspan_right(i) == data[i]); + } + 
assert(host_mdspan_left.rank() == 1); + assert(host_mdspan_left.extent(0) == 5); + assert(host_mdspan_left.stride(0) == 1); + for (int i = 0; i < 5; ++i) + { + assert(host_mdspan_left(i) == data[i]); + } + assert(host_mdspan_stride.rank() == 1); + assert(host_mdspan_stride.extent(0) == 5); + assert(host_mdspan_stride.stride(0) == 1); + for (int i = 0; i < 5; ++i) + { + assert(host_mdspan_stride(i) == data[i]); + } + return true; +} + +//============================================================================== +// Test: Rank-2 mdspan with layout_right (row-major) +//============================================================================== + +bool test_rank2_layout_right() +{ + // 2x3 matrix in row-major order + cuda::std::array data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // row-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 2); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.stride(0) == 3); // row stride + assert(host_mdspan.stride(1) == 1); // column stride + + // Check values: row-major layout + assert(host_mdspan(0, 0) == 1.0f); + assert(host_mdspan(0, 1) == 2.0f); + assert(host_mdspan(0, 2) == 3.0f); + assert(host_mdspan(1, 0) == 4.0f); + assert(host_mdspan(1, 1) == 5.0f); + assert(host_mdspan(1, 2) == 6.0f); + return true; +} + +//============================================================================== +// Test: Rank-2 mdspan with layout_left (column-major) +//============================================================================== + +bool test_rank2_layout_left() +{ + // 2x3 matrix in column-major order + cuda::std::array data = {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {1, 2}; // column-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 2); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.stride(0) == 1); // row stride + assert(host_mdspan.stride(1) == 2); // column stride + + // Check values: column-major layout + assert(host_mdspan(0, 0) == 1.0f); + assert(host_mdspan(0, 1) == 2.0f); + assert(host_mdspan(0, 2) == 3.0f); + assert(host_mdspan(1, 0) == 4.0f); + assert(host_mdspan(1, 1) == 5.0f); + assert(host_mdspan(1, 2) == 6.0f); + return true; +} + +//============================================================================== +// Test: Rank-2 mdspan with layout_stride (arbitrary strides) +//============================================================================== + +bool test_rank2_layout_stride() +{ + // 2x3 matrix with custom strides (e.g., padded) + cuda::std::array data = {1, 2, 3, 0, 4, 5, 6, 0}; // Each row padded to 4 elements + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {4, 1}; // Row stride = 4 (padded), col stride = 1 + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + 
tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 2); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.stride(0) == 4); + assert(host_mdspan.stride(1) == 1); + + assert(host_mdspan(0, 0) == 1); + assert(host_mdspan(0, 1) == 2); + assert(host_mdspan(0, 2) == 3); + assert(host_mdspan(1, 0) == 4); + assert(host_mdspan(1, 1) == 5); + assert(host_mdspan(1, 2) == 6); + return true; +} + +//============================================================================== +// Test: layout_stride with default (layout_right) strides when strides is nullptr +// Note: This tests the fallback behavior for DLPack < 1.2 +//============================================================================== +#if !(DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) +bool test_layout_stride_null_strides() +{ + cuda::std::array data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + dlpack_array<2> shape = {2, 3}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = nullptr; // null strides + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + // Should use row-major strides by default + assert(host_mdspan.stride(0) == 3); + assert(host_mdspan.stride(1) == 1); + return true; +} +#endif + +//============================================================================== +// Test: byte_offset support +//============================================================================== + +bool test_byte_offset() +{ + cuda::std::array data = {0, 0, 1, 2, 3, 4, 5, 6}; + // Skip first 2 ints (8 bytes) + dlpack_array<1> shape = {6}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + tensor.byte_offset = sizeof(int) * 2; + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.extent(0) == 6); + assert(host_mdspan(0) == 1); + assert(host_mdspan(5) == 6); + return true; +} + +//============================================================================== +// Exception tests +//============================================================================== + +void test_exception_wrong_rank() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // row-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + // Try to convert rank-2 tensor to rank-1 mdspan + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_dtype() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; // dtype is int + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + // Try to convert int tensor to float mdspan + 
unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_data() +{ + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = nullptr; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_shape() +{ + cuda::std::array data{}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = nullptr; // null shape + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_negative_shape() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {-3}; // negative shape + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_host() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{::kDLCUDA, 0}; // CUDA device, not CPU + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_device() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; // CPU device, not CUDA + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_device_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_managed() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; // CPU device, not CUDA managed + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_managed_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_stride_mismatch_layout_right() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {1, 2}; // Column-major, not row-major + DLTensor tensor{}; + tensor.data = 
data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_stride_mismatch_layout_left() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // Row-major, not column-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_zero_stride_layout_stride() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {0, 1}; // Zero stride is invalid + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_strides_dlpack_v12() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = nullptr; // null strides not allowed in DLPack v1.2+ + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_misaligned_data() +{ + // Create a buffer that allows us to get a misaligned pointer + alignas(16) cuda::std::array buffer{}; + // Get a pointer that's 1 byte into the buffer (misaligned for int) + auto misaligned_ptr = reinterpret_cast(buffer.data() + 1); + dlpack_array<1> shape = {3}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = misaligned_ptr; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +bool test_exceptions() +{ + test_exception_wrong_rank(); + test_exception_wrong_dtype(); + test_exception_null_data(); + test_exception_null_shape(); + test_exception_negative_shape(); + test_exception_wrong_device_type_host(); + test_exception_wrong_device_type_device(); + test_exception_wrong_device_type_managed(); + test_exception_stride_mismatch_layout_right(); + test_exception_stride_mismatch_layout_left(); + test_exception_zero_stride_layout_stride(); +#if DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) + test_exception_null_strides_dlpack_v12(); +#endif + test_exception_misaligned_data(); + return true; +} + 
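+
+//==============================================================================
+// Illustrative sketch (not part of the upstream test suite, and not invoked
+// from main()): round-tripping a DLTensor through cuda::to_host_mdspan and
+// back through cuda::to_dlpack. Assumes the mdspan_to_dlpack.h header added
+// earlier in this patch series is reachable from here (e.g. via <cuda/mdspan>),
+// so cuda::to_dlpack and cuda::dlpack_tensor are visible in this TU.
+//==============================================================================
+
+bool test_round_trip_sketch()
+{
+  cuda::std::array<float, 6> data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  dlpack_array<2> shape   = {2, 3};
+  dlpack_array<2> strides = {3, 1}; // row-major
+  DLTensor tensor{};
+  tensor.data    = data.data();
+  tensor.device  = DLDevice{kDLCPU, 0};
+  tensor.ndim    = 2;
+  tensor.dtype   = cuda::__data_type_to_dlpack<float>();
+  tensor.shape   = shape.data();
+  tensor.strides = strides.data();
+
+  // DLTensor -> host_mdspan with an explicit row-major layout.
+  auto md = cuda::to_host_mdspan<float, 2, cuda::std::layout_right>(tensor);
+  assert(md(1, 2) == 6.0f);
+
+  // host_mdspan -> DLTensor: the returned dlpack_tensor owns its shape/stride
+  // storage, so the DLTensor view stays valid independently of `shape`/`strides`.
+  auto wrapper        = cuda::to_dlpack(md);
+  const DLTensor& out = wrapper.get();
+  assert(out.ndim == 2);
+  assert(out.device.device_type == kDLCPU);
+  assert(out.shape[0] == 2 && out.shape[1] == 3);
+  assert(out.strides[0] == 3 && out.strides[1] == 1);
+  assert(static_cast<float*>(out.data) == data.data());
+  return true;
+}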
+//============================================================================== +// Test: Return type checking +//============================================================================== + +bool test_return_types() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + // Check return type of to_host_mdspan + auto host_ms = cuda::to_host_mdspan(tensor); + static_assert( + cuda::std::is_same_v, cuda::std::layout_stride>>); + assert(host_ms.extent(0) == 4); + + auto host_ms_right = cuda::to_host_mdspan(tensor); + static_assert( + cuda::std::is_same_v, cuda::std::layout_right>>); + assert(host_ms_right.extent(0) == 4); + + return true; +} + +int main(int, char**) +{ + NV_IF_TARGET( + NV_IS_HOST, + (assert(test_rank0()); // + assert(test_rank1()); + assert(test_rank2_layout_right()); + assert(test_rank2_layout_left()); + assert(test_rank2_layout_stride()); + assert(test_element_types()); + assert(test_byte_offset()); + assert(test_empty_tensor()); + assert(test_return_types()); + assert(test_exceptions());)) +#if !(DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_layout_stride_null_strides());)) +#endif + return 0; +} From b0e1fbc2186538c0f3fcf94a2af91346ce8a5a0e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 18 Dec 2025 10:06:23 -0500 Subject: [PATCH 14/56] cuda.coop: Use cuda.core.experimental.Linker instead of internal numba-cuda Linker to link LTO (#7011) Co-authored-by: Ashwin Srinath --- python/cuda_cccl/cuda/coop/_types.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/python/cuda_cccl/cuda/coop/_types.py b/python/cuda_cccl/cuda/coop/_types.py index e808e1fb9e7..d630a75d43d 100644 --- a/python/cuda_cccl/cuda/coop/_types.py +++ b/python/cuda_cccl/cuda/coop/_types.py @@ -14,8 +14,8 @@ from numba.core import cgutils from numba.core.extending import intrinsic, overload from numba.core.typing import signature -from numba.cuda import LTOIR -from numba.cuda.cudadrv import driver as cuda_driver + +from cuda.core.experimental import Linker, LinkerOptions, ObjectCode from . import _nvrtc as nvrtc from ._common import find_unsigned @@ -765,16 +765,15 @@ def get_lto_ir(self, threads=None): # Convert the LTO into PTX in order to extract the size and alignment # variables. 
- obj = LTOIR(name=self.c_name, data=blob) - linker = cuda_driver._Linker.new( - cc=device.compute_capability, - additional_flags=["-ptx"], - lto=obj, + ltoir_obj = ObjectCode.from_ltoir(blob, name=self.c_name) + linker_options = LinkerOptions( + arch=f"sm_{cc}", + link_time_optimization=True, + ptx=True, ) - ltoir_bytes = obj.data - linker.add_ltoir(ltoir_bytes) - ptx = linker.get_linked_ptx() - ptx = ptx.decode("utf-8") + linker = Linker(ltoir_obj, options=linker_options) + linked_ptx = linker.link("ptx") + ptx = linked_ptx.code.decode("utf-8") self._temp_storage_bytes = find_unsigned("temp_storage_bytes", ptx) self._temp_storage_alignment = find_unsigned("temp_storage_alignment", ptx) From 50da3d406255162aa31dac6e0c1c106f3b8d7eb3 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Thu, 18 Dec 2025 23:36:41 +0100 Subject: [PATCH 15/56] Make c2h vector comparisons `constexpr` (#7009) --- c2h/include/c2h/test_util_vec.h | 355 ++++++++++++++++---------------- 1 file changed, 178 insertions(+), 177 deletions(-) diff --git a/c2h/include/c2h/test_util_vec.h b/c2h/include/c2h/test_util_vec.h index 4c275d6a65a..f1cf1fb3116 100644 --- a/c2h/include/c2h/test_util_vec.h +++ b/c2h/include/c2h/test_util_vec.h @@ -49,198 +49,199 @@ inline int CoutCast(signed char val) /** * Vector1 overloads */ -# define C2H_VEC_OVERLOAD_1(T) \ - /* Ostream output */ \ - inline std::ostream& operator<<(std::ostream& os, const T& val) \ - { \ - os << '(' << CoutCast(val.x) << ')'; \ - return os; \ - } \ - /* Inequality */ \ - inline __host__ __device__ bool operator!=(const T& a, const T& b) \ - { \ - return (a.x != b.x); \ - } \ - /* Equality */ \ - inline __host__ __device__ bool operator==(const T& a, const T& b) \ - { \ - return (a.x == b.x); \ - } \ - /* Max */ \ - inline __host__ __device__ bool operator>(const T& a, const T& b) \ - { \ - return (a.x > b.x); \ - } \ - /* Min */ \ - inline __host__ __device__ bool operator<(const T& a, const T& b) \ - { \ - return (a.x < b.x); \ - } \ - /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ - inline __host__ __device__ T operator+(T a, T b) \ - { \ - T retval = make_##T(a.x + b.x); \ - return retval; \ +# define C2H_VEC_OVERLOAD_1(T) \ + /* Ostream output */ \ + inline std::ostream& operator<<(std::ostream& os, const T& val) \ + { \ + os << '(' << CoutCast(val.x) << ')'; \ + return os; \ + } \ + /* Inequality */ \ + inline __host__ __device__ constexpr bool operator!=(const T& a, const T& b) \ + { \ + return (a.x != b.x); \ + } \ + /* Equality */ \ + inline __host__ __device__ constexpr bool operator==(const T& a, const T& b) \ + { \ + return (a.x == b.x); \ + } \ + /* Max */ \ + inline __host__ __device__ constexpr bool operator>(const T& a, const T& b) \ + { \ + return (a.x > b.x); \ + } \ + /* Min */ \ + inline __host__ __device__ constexpr bool operator<(const T& a, const T& b) \ + { \ + return (a.x < b.x); \ + } \ + /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ + inline __host__ __device__ constexpr T operator+(T a, T b) \ + { \ + using V = decltype(T::x); \ + return T{static_cast(a.x + b.x)}; \ } /** * Vector2 overloads */ -# define C2H_VEC_OVERLOAD_2(T) \ - /* Ostream output */ \ - inline std::ostream& operator<<(std::ostream& os, const T& val) \ - { \ - os << '(' << CoutCast(val.x) << ',' << CoutCast(val.y) << ')'; \ - return os; \ - } \ - /* Inequality */ \ - inline __host__ __device__ bool operator!=(const T& a, const T& b) \ - { \ - return (a.x != b.x) || (a.y 
!= b.y); \ - } \ - /* Equality */ \ - inline __host__ __device__ bool operator==(const T& a, const T& b) \ - { \ - return (a.x == b.x) && (a.y == b.y); \ - } \ - /* Max */ \ - inline __host__ __device__ bool operator>(const T& a, const T& b) \ - { \ - if (a.x > b.x) \ - return true; \ - else if (b.x > a.x) \ - return false; \ - return a.y > b.y; \ - } \ - /* Min */ \ - inline __host__ __device__ bool operator<(const T& a, const T& b) \ - { \ - if (a.x < b.x) \ - return true; \ - else if (b.x < a.x) \ - return false; \ - return a.y < b.y; \ - } \ - /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ - inline __host__ __device__ T operator+(T a, T b) \ - { \ - T retval = make_##T(a.x + b.x, a.y + b.y); \ - return retval; \ +# define C2H_VEC_OVERLOAD_2(T) \ + /* Ostream output */ \ + inline std::ostream& operator<<(std::ostream& os, const T& val) \ + { \ + os << '(' << CoutCast(val.x) << ',' << CoutCast(val.y) << ')'; \ + return os; \ + } \ + /* Inequality */ \ + inline __host__ __device__ constexpr bool operator!=(const T& a, const T& b) \ + { \ + return (a.x != b.x) || (a.y != b.y); \ + } \ + /* Equality */ \ + inline __host__ __device__ constexpr bool operator==(const T& a, const T& b) \ + { \ + return (a.x == b.x) && (a.y == b.y); \ + } \ + /* Max */ \ + inline __host__ __device__ constexpr bool operator>(const T& a, const T& b) \ + { \ + if (a.x > b.x) \ + return true; \ + else if (b.x > a.x) \ + return false; \ + return a.y > b.y; \ + } \ + /* Min */ \ + inline __host__ __device__ constexpr bool operator<(const T& a, const T& b) \ + { \ + if (a.x < b.x) \ + return true; \ + else if (b.x < a.x) \ + return false; \ + return a.y < b.y; \ + } \ + /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ + inline __host__ __device__ constexpr T operator+(T a, T b) \ + { \ + using V = decltype(T::x); \ + return T{static_cast(a.x + b.x), static_cast(a.y + b.y)}; \ } /** * Vector3 overloads */ -# define C2H_VEC_OVERLOAD_3(T) \ - /* Ostream output */ \ - inline std::ostream& operator<<(std::ostream& os, const T& val) \ - { \ - os << '(' << CoutCast(val.x) << ',' << CoutCast(val.y) << ',' << CoutCast(val.z) << ')'; \ - return os; \ - } \ - /* Inequality */ \ - inline __host__ __device__ bool operator!=(const T& a, const T& b) \ - { \ - return (a.x != b.x) || (a.y != b.y) || (a.z != b.z); \ - } \ - /* Equality */ \ - inline __host__ __device__ bool operator==(const T& a, const T& b) \ - { \ - return (a.x == b.x) && (a.y == b.y) && (a.z == b.z); \ - } \ - /* Max */ \ - inline __host__ __device__ bool operator>(const T& a, const T& b) \ - { \ - if (a.x > b.x) \ - return true; \ - else if (b.x > a.x) \ - return false; \ - if (a.y > b.y) \ - return true; \ - else if (b.y > a.y) \ - return false; \ - return a.z > b.z; \ - } \ - /* Min */ \ - inline __host__ __device__ bool operator<(const T& a, const T& b) \ - { \ - if (a.x < b.x) \ - return true; \ - else if (b.x < a.x) \ - return false; \ - if (a.y < b.y) \ - return true; \ - else if (b.y < a.y) \ - return false; \ - return a.z < b.z; \ - } \ - /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ - inline __host__ __device__ T operator+(T a, T b) \ - { \ - T retval = make_##T(a.x + b.x, a.y + b.y, a.z + b.z); \ - return retval; \ +# define C2H_VEC_OVERLOAD_3(T) \ + /* Ostream output */ \ + inline std::ostream& operator<<(std::ostream& os, const T& val) \ + { \ + os << '(' << CoutCast(val.x) << ',' << CoutCast(val.y) << ',' << CoutCast(val.z) << ')'; \ + return os; \ + } \ + /* 
Inequality */ \ + inline __host__ __device__ constexpr bool operator!=(const T& a, const T& b) \ + { \ + return (a.x != b.x) || (a.y != b.y) || (a.z != b.z); \ + } \ + /* Equality */ \ + inline __host__ __device__ constexpr bool operator==(const T& a, const T& b) \ + { \ + return (a.x == b.x) && (a.y == b.y) && (a.z == b.z); \ + } \ + /* Max */ \ + inline __host__ __device__ constexpr bool operator>(const T& a, const T& b) \ + { \ + if (a.x > b.x) \ + return true; \ + else if (b.x > a.x) \ + return false; \ + if (a.y > b.y) \ + return true; \ + else if (b.y > a.y) \ + return false; \ + return a.z > b.z; \ + } \ + /* Min */ \ + inline __host__ __device__ constexpr bool operator<(const T& a, const T& b) \ + { \ + if (a.x < b.x) \ + return true; \ + else if (b.x < a.x) \ + return false; \ + if (a.y < b.y) \ + return true; \ + else if (b.y < a.y) \ + return false; \ + return a.z < b.z; \ + } \ + /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ + inline __host__ __device__ constexpr T operator+(T a, T b) \ + { \ + using V = decltype(T::x); \ + return T{static_cast(a.x + b.x), static_cast(a.y + b.y), static_cast(a.z + b.z)}; \ } /** * Vector4 overloads */ -# define C2H_VEC_OVERLOAD_4(T) \ - /* Ostream output */ \ - inline std::ostream& operator<<(std::ostream& os, const T& val) \ - { \ - os << '(' << CoutCast(val.x) << ',' << CoutCast(val.y) << ',' << CoutCast(val.z) << ',' << CoutCast(val.w) \ - << ')'; \ - return os; \ - } \ - /* Inequality */ \ - inline __host__ __device__ bool operator!=(const T& a, const T& b) \ - { \ - return (a.x != b.x) || (a.y != b.y) || (a.z != b.z) || (a.w != b.w); \ - } \ - /* Equality */ \ - inline __host__ __device__ bool operator==(const T& a, const T& b) \ - { \ - return (a.x == b.x) && (a.y == b.y) && (a.z == b.z) && (a.w == b.w); \ - } \ - /* Max */ \ - inline __host__ __device__ bool operator>(const T& a, const T& b) \ - { \ - if (a.x > b.x) \ - return true; \ - else if (b.x > a.x) \ - return false; \ - if (a.y > b.y) \ - return true; \ - else if (b.y > a.y) \ - return false; \ - if (a.z > b.z) \ - return true; \ - else if (b.z > a.z) \ - return false; \ - return a.w > b.w; \ - } \ - /* Min */ \ - inline __host__ __device__ bool operator<(const T& a, const T& b) \ - { \ - if (a.x < b.x) \ - return true; \ - else if (b.x < a.x) \ - return false; \ - if (a.y < b.y) \ - return true; \ - else if (b.y < a.y) \ - return false; \ - if (a.z < b.z) \ - return true; \ - else if (b.z < a.z) \ - return false; \ - return a.w < b.w; \ - } \ - /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ - inline __host__ __device__ T operator+(T a, T b) \ - { \ - const auto retval = make_##T(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); \ - return retval; \ +# define C2H_VEC_OVERLOAD_4(T) \ + /* Ostream output */ \ + inline std::ostream& operator<<(std::ostream& os, const T& val) \ + { \ + os << '(' << CoutCast(val.x) << ',' << CoutCast(val.y) << ',' << CoutCast(val.z) << ',' << CoutCast(val.w) \ + << ')'; \ + return os; \ + } \ + /* Inequality */ \ + inline __host__ __device__ constexpr bool operator!=(const T& a, const T& b) \ + { \ + return (a.x != b.x) || (a.y != b.y) || (a.z != b.z) || (a.w != b.w); \ + } \ + /* Equality */ \ + inline __host__ __device__ constexpr bool operator==(const T& a, const T& b) \ + { \ + return (a.x == b.x) && (a.y == b.y) && (a.z == b.z) && (a.w == b.w); \ + } \ + /* Max */ \ + inline __host__ __device__ constexpr bool operator>(const T& a, const T& b) \ + { \ + if (a.x > b.x) \ + return true; \ + else 
if (b.x > a.x) \ + return false; \ + if (a.y > b.y) \ + return true; \ + else if (b.y > a.y) \ + return false; \ + if (a.z > b.z) \ + return true; \ + else if (b.z > a.z) \ + return false; \ + return a.w > b.w; \ + } \ + /* Min */ \ + inline __host__ __device__ constexpr bool operator<(const T& a, const T& b) \ + { \ + if (a.x < b.x) \ + return true; \ + else if (b.x < a.x) \ + return false; \ + if (a.y < b.y) \ + return true; \ + else if (b.y < a.y) \ + return false; \ + if (a.z < b.z) \ + return true; \ + else if (b.z < a.z) \ + return false; \ + return a.w < b.w; \ + } \ + /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */ \ + inline __host__ __device__ constexpr T operator+(T a, T b) \ + { \ + using V = decltype(T::x); \ + return T{ \ + static_cast(a.x + b.x), static_cast(a.y + b.y), static_cast(a.z + b.z), static_cast(a.w + b.w)}; \ } /** From f8a4d06c5c4ccf00f7a621c23cb2bd5ddf54138d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 18 Dec 2025 23:49:12 +0100 Subject: [PATCH 16/56] improves comments on decoupled lookback example (#7015) --- .../device/example_device_decoupled_look_back.cu | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cub/examples/device/example_device_decoupled_look_back.cu b/cub/examples/device/example_device_decoupled_look_back.cu index fae75c7102f..035452bb8ae 100644 --- a/cub/examples/device/example_device_decoupled_look_back.cu +++ b/cub/examples/device/example_device_decoupled_look_back.cu @@ -51,8 +51,14 @@ __global__ void decoupled_look_back_kernel(cub::ScanTileState tile_sta if (warp_id == 0) { - // Perform the decoupled look-back - // Invocation of the prefix will block until the look-back is complete. + // Perform the decoupled look-back. + // 1. Publish the block's local aggregate to global memory immediately. + // This allows downstream blocks to include this tile's contribution without waiting for this block to fully + // resolve its global prefix. + // 2. Block and traverse predecessor tiles (look-back) to compute the global exclusive prefix for this tile. + // 3. Update this tile's global state to 'Prefix' (inclusive sum), creating a checkpoint that stops the look-back + // of downstream blocks. + // Note, the invocation of the prefix will block until the look-back is complete. 
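+      // For example, with per-tile aggregates a0, a1, a2, ..., the block that owns
+      // tile 3 walks back from tile 2, accumulating a2 + a1 + a0 (stopping early as
+      // soon as it reaches a tile whose state is already 'Prefix'), so the exclusive
+      // prefix returned for tile 3 is a0 + a1 + a2.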
MessageT exclusive_prefix = prefix(block_aggregate); if (tid == 0) From e9f0a13dd68e5f8931403cacf1f6346e822bc215 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 18 Dec 2025 23:57:10 +0100 Subject: [PATCH 17/56] Extract reduce_op_sync into a free function (#7004) This allows us to use it independently --- .../warp/specializations/warp_reduce_shfl.cuh | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index bd060e3715a..ee207f8f726 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -41,6 +41,43 @@ CUB_NAMESPACE_BEGIN namespace detail { +template +_CCCL_DEVICE _CCCL_FORCEINLINE T reduce_op_sync(T input, const uint32_t mask, ReductionOp) +{ + static_assert(::cuda::std::is_integral_v, "T must be an integral type"); + static_assert(sizeof(T) <= sizeof(unsigned), "T must be less than or equal to unsigned"); + using promoted_t = ::cuda::std::conditional_t<::cuda::std::is_unsigned_v, unsigned, int>; + if constexpr (is_cuda_maximum_v) + { + return static_cast(__reduce_max_sync(mask, static_cast(input))); + } + else if constexpr (is_cuda_minimum_v) + { + return static_cast(__reduce_min_sync(mask, static_cast(input))); + } + else if constexpr (is_cuda_std_plus_v) + { + return static_cast(__reduce_add_sync(mask, static_cast(input))); + } + else if constexpr (is_cuda_std_bit_and_v) + { + return static_cast(__reduce_and_sync(mask, static_cast(input))); + } + else if constexpr (is_cuda_std_bit_or_v) + { + return static_cast(__reduce_or_sync(mask, static_cast(input))); + } + else if constexpr (is_cuda_std_bit_xor_v) + { + return static_cast(__reduce_xor_sync(mask, static_cast(input))); + } + else + { + _CCCL_UNREACHABLE(); + return T{}; + } +} + /** * @brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned * across a CUDA thread warp. 
@@ -444,43 +481,6 @@ struct WarpReduceShfl // Reduction operations //--------------------------------------------------------------------- - template - _CCCL_DEVICE _CCCL_FORCEINLINE T reduce_op_sync(T input, ReductionOp) - { - static_assert(::cuda::std::is_integral_v, "T must be an integral type"); - static_assert(sizeof(T) <= sizeof(unsigned), "T must be less than or equal to unsigned"); - using promoted_t = ::cuda::std::conditional_t<::cuda::std::is_unsigned_v, unsigned, int>; - if constexpr (is_cuda_maximum_v) - { - return static_cast(__reduce_max_sync(member_mask, static_cast(input))); - } - else if constexpr (is_cuda_minimum_v) - { - return static_cast(__reduce_min_sync(member_mask, static_cast(input))); - } - else if constexpr (is_cuda_std_plus_v) - { - return static_cast(__reduce_add_sync(member_mask, static_cast(input))); - } - else if constexpr (is_cuda_std_bit_and_v) - { - return static_cast(__reduce_and_sync(member_mask, static_cast(input))); - } - else if constexpr (is_cuda_std_bit_or_v) - { - return static_cast(__reduce_or_sync(member_mask, static_cast(input))); - } - else if constexpr (is_cuda_std_bit_xor_v) - { - return static_cast(__reduce_xor_sync(member_mask, static_cast(input))); - } - else - { - _CCCL_UNREACHABLE(); - return T{}; - } - } - /** * @brief Reduction * @@ -504,7 +504,7 @@ struct WarpReduceShfl && (is_cuda_minimum_maximum_v || is_cuda_std_plus_v || is_cuda_std_bitwise_v) ) { - NV_IF_TARGET(NV_PROVIDES_SM_80, (return reduce_op_sync(input, reduction_op);)) + NV_IF_TARGET(NV_PROVIDES_SM_80, (return reduce_op_sync(input, member_mask, reduction_op);)) } T output = input; // Template-iterate reduction steps From 362d316ca5b9aa626795d732502b23adfff1d3c0 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Thu, 18 Dec 2025 17:20:52 -0600 Subject: [PATCH 18/56] Remove experimental namespace from cuda.core import (#7022) --- python/cuda_cccl/cuda/compute/_caching.py | 5 ++++- .../cuda_cccl/cuda/compute/_utils/temp_storage_buffer.py | 9 +++++++-- python/cuda_cccl/cuda/coop/_types.py | 5 ++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/python/cuda_cccl/cuda/compute/_caching.py b/python/cuda_cccl/cuda/compute/_caching.py index 394125017a9..0443f38c0ea 100644 --- a/python/cuda_cccl/cuda/compute/_caching.py +++ b/python/cuda_cccl/cuda/compute/_caching.py @@ -5,7 +5,10 @@ import functools -from cuda.core.experimental import Device +try: + from cuda.core import Device +except ImportError: + from cuda.core.experimental import Device def cache_with_key(key): diff --git a/python/cuda_cccl/cuda/compute/_utils/temp_storage_buffer.py b/python/cuda_cccl/cuda/compute/_utils/temp_storage_buffer.py index 27fdb0f9b13..82f5a729890 100644 --- a/python/cuda_cccl/cuda/compute/_utils/temp_storage_buffer.py +++ b/python/cuda_cccl/cuda/compute/_utils/temp_storage_buffer.py @@ -4,8 +4,13 @@ from typing import Optional from cuda.bindings import driver, runtime -from cuda.core.experimental import Device -from cuda.core.experimental._utils.cuda_utils import handle_return + +try: + from cuda.core import Device + from cuda.core._utils.cuda_utils import handle_return +except ImportError: + from cuda.core.experimental import Device + from cuda.core.experimental._utils.cuda_utils import handle_return from ..typing import StreamLike diff --git a/python/cuda_cccl/cuda/coop/_types.py b/python/cuda_cccl/cuda/coop/_types.py index d630a75d43d..275c365ecbe 100644 --- a/python/cuda_cccl/cuda/coop/_types.py +++ b/python/cuda_cccl/cuda/coop/_types.py @@ -15,7 +15,10 @@ from 
numba.core.extending import intrinsic, overload from numba.core.typing import signature -from cuda.core.experimental import Linker, LinkerOptions, ObjectCode +try: + from cuda.core import Linker, LinkerOptions, ObjectCode +except ImportError: + from cuda.core.experimental import Linker, LinkerOptions, ObjectCode from . import _nvrtc as nvrtc from ._common import find_unsigned From 28d22c91a2fac32f079ce71832c750f9fd1ca3f7 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Thu, 18 Dec 2025 17:03:30 -0800 Subject: [PATCH 19/56] reexpress completion signature transform alias to make clangd happy (#7026) --- .../cuda/experimental/__execution/completion_signatures.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__execution/completion_signatures.cuh b/cudax/include/cuda/experimental/__execution/completion_signatures.cuh index d89ef9b03d6..5a020bed9a2 100644 --- a/cudax/include/cuda/experimental/__execution/completion_signatures.cuh +++ b/cudax/include/cuda/experimental/__execution/completion_signatures.cuh @@ -356,7 +356,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT completion_signatures //! @tparam _Fn The callable metafunction to apply. //! @tparam _Continuation The template to collect results into. template > - using __transform _CCCL_NODEBUG_ALIAS = __transform_q<_Fn::template __call, _Continuation::template __call>; + using __transform _CCCL_NODEBUG_ALIAS = + ::cuda::std::__type_call<_Continuation, ::cuda::std::__type_apply<_Fn, _Sigs>...>; //! @brief Calls a metafunction with the signatures as arguments. //! @tparam _Fn The metafunction to call. From 1e28e8c7f0c2016ca0134248d5353a8f975ad100 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Thu, 18 Dec 2025 17:14:42 -0800 Subject: [PATCH 20/56] Qualify call to `__launch_impl` in launch.h to avoid ambiguity errors (#7024) Co-authored-by: pciolkosz --- libcudacxx/include/cuda/__launch/launch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/__launch/launch.h b/libcudacxx/include/cuda/__launch/launch.h index 8ed5350b301..e2df77229b6 100644 --- a/libcudacxx/include/cuda/__launch/launch.h +++ b/libcudacxx/include/cuda/__launch/launch.h @@ -190,7 +190,7 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, { auto __launcher = __kernel_launcher>...>; - return __launch_impl( + return ::cuda::__launch_impl( cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), __combined, ::cuda::__get_cufunction_of(__launcher), From f21a15834f065299bff007ad5a076ed3fdb12e3b Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 19 Dec 2025 02:14:52 +0100 Subject: [PATCH 21/56] Rework hierarchy levels (#6957) * Rework hierarchy levels * add missing launches to native cluster level queries * remove dependency on runtime storage --------- Co-authored-by: pciolkosz --- cudax/examples/simple_p2p.cu | 2 +- .../cuda/experimental/__execution/queries.cuh | 1 + .../__execution/stream/adaptor.cuh | 5 +- .../__execution/stream/scheduler.cuh | 4 +- .../cuda/experimental/__launch/launch.cuh | 9 +- cudax/test/common/host_device.cuh | 4 +- cudax/test/launch/launch_smoke.cu | 6 +- libcudacxx/include/cuda/__fwd/hierarchy.h | 69 ++++ .../include/cuda/__hierarchy/block_level.h | 141 ++++++++ .../include/cuda/__hierarchy/cluster_level.h | 74 ++++ .../include/cuda/__hierarchy/dimensions.h | 21 +- .../cuda/__hierarchy/get_launch_dimensions.h | 90 +++++ .../include/cuda/__hierarchy/grid_level.h | 44 +++ 
.../cuda/__hierarchy/hierarchy_dimensions.h | 107 ++---- .../cuda/__hierarchy/hierarchy_level_base.h | 339 ++++++++++++++++++ .../cuda/__hierarchy/hierarchy_levels.h | 99 +---- .../cuda/__hierarchy/hierarchy_query_result.h | 149 ++++++++ .../cuda/__hierarchy/level_dimensions.h | 16 +- .../__hierarchy/native_hierarchy_level_base.h | 203 +++++++++++ .../include/cuda/__hierarchy/thread_level.h | 96 +++++ libcudacxx/include/cuda/__hierarchy/traits.h | 119 ++++++ .../include/cuda/__hierarchy/warp_level.h | 67 ++++ libcudacxx/include/cuda/__launch/launch.h | 13 +- libcudacxx/include/cuda/hierarchy | 16 +- .../cuda/ccclrt/common/host_device.cuh | 4 +- .../hierarchy/hierarchy_custom_types.cu | 8 +- .../cuda/ccclrt/hierarchy/hierarchy_smoke.cu | 240 ++++++------- .../cuda/ccclrt/launch/configuration.cu | 10 +- .../cuda/ccclrt/launch/launch_smoke.cu | 6 +- .../cuda/containers/buffer/transform.cu | 5 +- .../block_level/hierarchy_queries.pass.cpp | 240 +++++++++++++ ...ierarchy_query_signatures.compile.pass.cpp | 128 +++++++ .../native_hierarchy_queries.pass.cpp | 125 +++++++ ...ierarchy_query_signatures.compile.pass.cpp | 93 +++++ .../cluster_level/hierarchy_queries.pass.cpp | 185 ++++++++++ ...ierarchy_query_signatures.compile.pass.cpp | 119 ++++++ .../native_hierarchy_queries.pass.cpp | 97 +++++ ...ierarchy_query_signatures.compile.pass.cpp | 94 +++++ .../hierarchy_objects.compile.pass.cpp | 24 ++ .../hierarchy/hierarchy_query_result.pass.cpp | 113 ++++++ .../thread_level/hierarchy_queries.pass.cpp | 285 +++++++++++++++ ...ierarchy_query_signatures.compile.pass.cpp | 129 +++++++ .../native_hierarchy_queries.pass.cpp | 169 +++++++++ ...ierarchy_query_signatures.compile.pass.cpp | 102 ++++++ ...achable_hierarchy_level_v.compile.pass.cpp | 59 +++ .../native_hierarchy_queries.pass.cpp | 164 +++++++++ ...ierarchy_query_signatures.compile.pass.cpp | 95 +++++ libcudacxx/test/support/hierarchy_queries.h | 115 ++++++ 48 files changed, 3964 insertions(+), 339 deletions(-) create mode 100644 libcudacxx/include/cuda/__fwd/hierarchy.h create mode 100644 libcudacxx/include/cuda/__hierarchy/block_level.h create mode 100644 libcudacxx/include/cuda/__hierarchy/cluster_level.h create mode 100644 libcudacxx/include/cuda/__hierarchy/get_launch_dimensions.h create mode 100644 libcudacxx/include/cuda/__hierarchy/grid_level.h create mode 100644 libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h create mode 100644 libcudacxx/include/cuda/__hierarchy/hierarchy_query_result.h create mode 100644 libcudacxx/include/cuda/__hierarchy/native_hierarchy_level_base.h create mode 100644 libcudacxx/include/cuda/__hierarchy/thread_level.h create mode 100644 libcudacxx/include/cuda/__hierarchy/traits.h create mode 100644 libcudacxx/include/cuda/__hierarchy/warp_level.h create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/hierarchy_queries.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/hierarchy_query_signatures.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/native_hierarchy_queries.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/native_hierarchy_query_signatures.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/hierarchy_queries.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/hierarchy_query_signatures.compile.pass.cpp create mode 100644 
libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/native_hierarchy_queries.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/native_hierarchy_query_signatures.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/hierarchy_objects.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/hierarchy_query_result.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/hierarchy_queries.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/hierarchy_query_signatures.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/native_hierarchy_queries.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/native_hierarchy_query_signatures.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/traits/is_natively_reachable_hierarchy_level_v.compile.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/warp_level/native_hierarchy_queries.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/hierarchy/warp_level/native_hierarchy_query_signatures.compile.pass.cpp create mode 100644 libcudacxx/test/support/hierarchy_queries.h diff --git a/cudax/examples/simple_p2p.cu b/cudax/examples/simple_p2p.cu index 07a9b174f3f..7b3b42b8a4b 100644 --- a/cudax/examples/simple_p2p.cu +++ b/cudax/examples/simple_p2p.cu @@ -52,7 +52,7 @@ struct simple_kernel __device__ void operator()(Configuration config, ::cuda::std::span src, ::cuda::std::span dst) { // Just a dummy kernel, doing enough for us to verify that everything worked - const auto idx = config.dims.rank(cuda::thread); + const auto idx = config.dims.rank(cuda::gpu_thread); dst[idx] = src[idx] * 2.0f; } }; diff --git a/cudax/include/cuda/experimental/__execution/queries.cuh b/cudax/include/cuda/experimental/__execution/queries.cuh index 9a36b021764..f89881c7d19 100644 --- a/cudax/include/cuda/experimental/__execution/queries.cuh +++ b/cudax/include/cuda/experimental/__execution/queries.cuh @@ -26,6 +26,7 @@ _CCCL_SUPPRESS_DEPRECATED_PUSH _CCCL_SUPPRESS_DEPRECATED_POP #include +#include #include #include #include diff --git a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh index 5fca08ae21c..66f8dcc402e 100644 --- a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -269,7 +270,7 @@ private: // the receiver tell us how to launch the kernel. auto const __launch_config = get_launch_config(execution::get_env(__state.__state_.__rcvr_)); using __launch_dims_t = decltype(__launch_config.dims); - constexpr int __block_threads = __launch_dims_t::static_count(thread, block); + constexpr int __block_threads = __launch_dims_t::static_count(gpu_thread, block); // Start the child operation state. This will launch kernels for all the predecessors // of this operation. 
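For readers tracking the thread -> gpu_thread rename applied throughout this patch, a small sketch of the
hierarchy queries it affects (editor's illustration; it assumes the cuda::make_hierarchy, grid_dims and
block_dims helpers documented further down in this patch, and the public <cuda/hierarchy> header):

#include <cuda/hierarchy>

void hierarchy_query_sketch()
{
  auto dims = cuda::make_hierarchy(cuda::grid_dims(4), cuda::block_dims<256>());
  // Threads per block are known statically, so they can be checked at compile time.
  static_assert(decltype(dims)::static_count(cuda::gpu_thread, cuda::block) == 256);
  // Total threads in the grid are a runtime quantity here: 4 blocks * 256 threads.
  auto thread_count = dims.count(cuda::gpu_thread, cuda::grid);
  (void) thread_count;
}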
@@ -295,7 +296,7 @@ private: _CCCL_DEVICE_API void __device_start() noexcept { using __launch_dims_t = __dims_of_t<__rcvr_config_t>; - constexpr int __block_threads = __launch_dims_t::static_count(thread, block); + constexpr int __block_threads = __launch_dims_t::static_count(gpu_thread, block); auto& __state = __get_state(); // without the following, the kernel in __host_start will fail to launch with diff --git a/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh b/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh index 09f866b2f4a..7c83a6a2583 100644 --- a/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh @@ -134,7 +134,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler // the completion kernel, we will be completing the parent's receiver, so we must let // the receiver tell us how to launch the kernel. auto const __launch_dims = get_launch_config(execution::get_env(__rcvr_)).dims; - constexpr int __block_threads = decltype(__launch_dims)::static_count(cuda::thread, cuda::block); + constexpr int __block_threads = decltype(__launch_dims)::static_count(cuda::gpu_thread, cuda::block); int const __grid_blocks = __launch_dims.count(cuda::block, cuda::grid); static_assert(__block_threads != ::cuda::std::dynamic_extent); @@ -153,7 +153,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler _CCCL_DEVICE_API void __device_start() noexcept { using __launch_dims_t = decltype(get_launch_config(execution::get_env(__rcvr_)).dims); - constexpr int __block_threads = __launch_dims_t::static_count(cuda::thread, cuda::block); + constexpr int __block_threads = __launch_dims_t::static_count(cuda::gpu_thread, cuda::block); // without the following, the kernel in __host_start will fail to launch with // cudaErrorInvalidDeviceFunction. diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh index c3d042f49eb..09519362f7b 100644 --- a/cudax/include/cuda/experimental/__launch/launch.cuh +++ b/cudax/include/cuda/experimental/__launch/launch.cuh @@ -22,6 +22,7 @@ #endif // no system header #include +#include #include #include #include @@ -108,7 +109,7 @@ _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __k static_assert(!::cuda::std::is_same_v, "Can't launch a configuration without hierarchy dimensions"); ::CUlaunchConfig __config{}; - constexpr bool __has_cluster_level = has_level; + constexpr bool __has_cluster_level = has_level_v; constexpr unsigned int __num_attrs_needed = ::cuda::__detail::kernel_config_count_attr_space(__conf) + __has_cluster_level; ::CUlaunchAttribute __attrs[__num_attrs_needed == 0 ? 
1 : __num_attrs_needed]; @@ -124,9 +125,9 @@ _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __k __config.gridDimX = static_cast(__conf.dims.extents(block, grid).x); __config.gridDimY = static_cast(__conf.dims.extents(block, grid).y); __config.gridDimZ = static_cast(__conf.dims.extents(block, grid).z); - __config.blockDimX = static_cast(__conf.dims.extents(thread, block).x); - __config.blockDimY = static_cast(__conf.dims.extents(thread, block).y); - __config.blockDimZ = static_cast(__conf.dims.extents(thread, block).z); + __config.blockDimX = static_cast(__conf.dims.extents(gpu_thread, block).x); + __config.blockDimY = static_cast(__conf.dims.extents(gpu_thread, block).y); + __config.blockDimZ = static_cast(__conf.dims.extents(gpu_thread, block).z); if constexpr (__has_cluster_level) { diff --git a/cudax/test/common/host_device.cuh b/cudax/test/common/host_device.cuh index ff4744d0ccf..e74bc42ff1a 100644 --- a/cudax/test/common/host_device.cuh +++ b/cudax/test/common/host_device.cuh @@ -67,10 +67,10 @@ void test_host_dev(const Dims& dims, const Lambda& lambda, const Filters&... fil cudaLaunchAttribute attrs[1]; config.attrs = &attrs[0]; - config.blockDim = dims.extents(cuda::thread, cuda::block); + config.blockDim = dims.extents(cuda::gpu_thread, cuda::block); config.gridDim = dims.extents(cuda::block, cuda::grid); - if constexpr (cuda::has_level) + if constexpr (cuda::has_level_v) { dim3 cluster_dims = dims.extents(cuda::block, cuda::cluster); config.attrs[config.numAttrs].id = cudaLaunchAttributeClusterDimension; diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu index b6baab937bc..5ade1c0cf03 100644 --- a/cudax/test/launch/launch_smoke.cu +++ b/cudax/test/launch/launch_smoke.cu @@ -55,7 +55,7 @@ struct functor_taking_config template __device__ void operator()(Config config, int grid_size) { - static_assert(config.dims.static_count(cuda::thread, cuda::block) == BlockSize); + static_assert(config.dims.static_count(cuda::gpu_thread, cuda::block) == BlockSize); CUDAX_REQUIRE(config.dims.count(cuda::block, cuda::grid) == grid_size); kernel_run_proof = true; } @@ -248,7 +248,7 @@ void launch_smoke_test(StreamOrPathBuilder& dst) // Lambda { cudax::launch(dst, cuda::block_dims<256>() & cuda::grid_dims(1), [] __device__(auto config) { - if (config.dims.rank(cuda::thread, cuda::block) == 0) + if (config.dims.rank(cuda::gpu_thread, cuda::block) == 0) { printf("Hello from the GPU\n"); kernel_run_proof = true; @@ -354,7 +354,7 @@ void test_default_config() auto block = cuda::block_dims<256>; auto verify_lambda = [] __device__(auto config) { - static_assert(config.dims.count(cuda::thread, cuda::block) == 256); + static_assert(config.dims.count(cuda::gpu_thread, cuda::block) == 256); CUDAX_REQUIRE(config.dims.count(cuda::block) == 4); cooperative_groups::this_grid().sync(); }; diff --git a/libcudacxx/include/cuda/__fwd/hierarchy.h b/libcudacxx/include/cuda/__fwd/hierarchy.h new file mode 100644 index 00000000000..54cbeadeca1 --- /dev/null +++ b/libcudacxx/include/cuda/__fwd/hierarchy.h @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___FWD_HIERARCHY_H +#define _CUDA___FWD_HIERARCHY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +// hierarchy level + +template +struct hierarchy_level_base; + +template +struct __native_hierarchy_level_base; + +struct grid_level; +struct cluster_level; +struct block_level; +struct warp_level; +struct thread_level; + +template +inline constexpr bool __is_hierarchy_level_v = ::cuda::std::is_base_of_v, _Tp>; + +template +inline constexpr bool __is_native_hierarchy_level_v = + ::cuda::std::is_base_of_v<__native_hierarchy_level_base<_Tp>, _Tp>; + +// hierarchy + +template +struct hierarchy_dimensions; + +template +inline constexpr bool __is_hierarchy_v = false; +template +inline constexpr bool __is_hierarchy_v> = true; + +template +struct allowed_levels; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___FWD_HIERARCHY_H diff --git a/libcudacxx/include/cuda/__hierarchy/block_level.h b/libcudacxx/include/cuda/__hierarchy/block_level.h new file mode 100644 index 00000000000..cfa000df0d2 --- /dev/null +++ b/libcudacxx/include/cuda/__hierarchy/block_level.h @@ -0,0 +1,141 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___HIERARCHY_BLOCK_LEVEL_H +#define _CUDA___HIERARCHY_BLOCK_LEVEL_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +struct block_level : __native_hierarchy_level_base +{ + using product_type = unsigned; + using allowed_above = allowed_levels; + using allowed_below = allowed_levels; + + using __next_native_level = cluster_level; + + using __base_type = __native_hierarchy_level_base; + using __base_type::count_as; + using __base_type::extents_as; + using __base_type::index_as; + using __base_type::rank_as; + + // interactions with cluster level + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static ::cuda::std::dims<3, _Tp> extents_as(const cluster_level&) noexcept + { + ::dim3 __dims{1u, 1u, 1u}; + NV_IF_TARGET(NV_PROVIDES_SM_90, (__dims = ::__clusterDim();)) + return ::cuda::std::dims<3, _Tp>{static_cast<_Tp>(__dims.x), static_cast<_Tp>(__dims.y), static_cast<_Tp>(__dims.z)}; + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static _Tp count_as(const cluster_level&) noexcept + { + unsigned __count = 1; + NV_IF_TARGET(NV_PROVIDES_SM_90, (__count = ::__clusterSizeInBlocks();)) + return static_cast<_Tp>(__count); + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static hierarchy_query_result<_Tp> index_as(const cluster_level&) noexcept + { + ::dim3 __idx{0u, 0u, 0u}; + NV_IF_TARGET(NV_PROVIDES_SM_90, (__idx = ::__clusterRelativeBlockIdx();)) + return {static_cast<_Tp>(__idx.x), static_cast<_Tp>(__idx.y), static_cast<_Tp>(__idx.z)}; + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static _Tp rank_as(const cluster_level&) noexcept + { + unsigned __rank = 0; + NV_IF_TARGET(NV_PROVIDES_SM_90, (__rank = ::__clusterRelativeBlockRank();)) + return static_cast<_Tp>(__rank); + } + + // interactions with grid level + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static ::cuda::std::dims<3, _Tp> extents_as(const grid_level&) noexcept + { + return ::cuda::std::dims<3, _Tp>{ + static_cast<_Tp>(gridDim.x), static_cast<_Tp>(gridDim.y), static_cast<_Tp>(gridDim.z)}; + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static _Tp count_as(const grid_level&) noexcept + { + return static_cast<_Tp>(gridDim.x) * static_cast<_Tp>(gridDim.y) * static_cast<_Tp>(gridDim.z); + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static hierarchy_query_result<_Tp> index_as(const grid_level&) noexcept + { + return {static_cast<_Tp>(blockIdx.x), static_cast<_Tp>(blockIdx.y), static_cast<_Tp>(blockIdx.z)}; + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static _Tp rank_as(const grid_level& __level) noexcept + { + const auto __dims = 
dims_as<_Tp>(__level); + const auto __idx = index_as<_Tp>(__level); + return static_cast<_Tp>((__idx.z * __dims.y + __idx.y) * __dims.x + __idx.x); + } + + // interactions with grid level in hierarchy + + _CCCL_TEMPLATE(class _Tp, class _Hierarchy) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_DEVICE_API static _Tp rank_as(const grid_level& __level, const _Hierarchy& __hier) noexcept + { + static_assert(has_unit_or_level_v, "_Hierarchy doesn't contain block level"); + static_assert(has_level_v, "_Hierarchy doesn't contain grid level"); + + const auto __dims = dims_as<_Tp>(__level, __hier); + const auto __idx = index_as<_Tp>(__level, __hier); + return static_cast<_Tp>((__idx.z * __dims.y + __idx.y) * __dims.x + __idx.x); + } +}; + +_CCCL_GLOBAL_CONSTANT block_level block; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___HIERARCHY_BLOCK_LEVEL_H diff --git a/libcudacxx/include/cuda/__hierarchy/cluster_level.h b/libcudacxx/include/cuda/__hierarchy/cluster_level.h new file mode 100644 index 00000000000..70ef8fdce89 --- /dev/null +++ b/libcudacxx/include/cuda/__hierarchy/cluster_level.h @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___HIERARCHY_CLUSTER_LEVEL_H +#define _CUDA___HIERARCHY_CLUSTER_LEVEL_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +struct cluster_level : __native_hierarchy_level_base +{ + using product_type = unsigned; + using allowed_above = allowed_levels; + using allowed_below = allowed_levels; + + using __next_native_level = grid_level; + + using __base_type = __native_hierarchy_level_base; + using __base_type::extents_as; + using __base_type::index_as; + + // interactions with grid level + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static ::cuda::std::dims<3, _Tp> extents_as(const grid_level&) noexcept + { + ::dim3 __dims{gridDim}; + NV_IF_TARGET(NV_PROVIDES_SM_90, (__dims = ::__clusterGridDimInClusters();)) + return ::cuda::std::dims<3, _Tp>{static_cast<_Tp>(__dims.x), static_cast<_Tp>(__dims.y), static_cast<_Tp>(__dims.z)}; + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static hierarchy_query_result<_Tp> index_as(const grid_level&) noexcept + { + ::dim3 __idx{blockIdx}; + NV_IF_TARGET(NV_PROVIDES_SM_90, (__idx = ::__clusterIdx();)) + return {static_cast<_Tp>(__idx.x), static_cast<_Tp>(__idx.y), static_cast<_Tp>(__idx.z)}; + } +}; + +_CCCL_GLOBAL_CONSTANT cluster_level cluster; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___HIERARCHY_CLUSTER_LEVEL_H diff --git a/libcudacxx/include/cuda/__hierarchy/dimensions.h 
b/libcudacxx/include/cuda/__hierarchy/dimensions.h index 2f52128b4fc..395c4c6e996 100644 --- a/libcudacxx/include/cuda/__hierarchy/dimensions.h +++ b/libcudacxx/include/cuda/__hierarchy/dimensions.h @@ -11,6 +11,16 @@ #ifndef _CUDA___HIERARCHY_DIMENSIONS_H #define _CUDA___HIERARCHY_DIMENSIONS_H +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + #include #include @@ -21,8 +31,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA template using dimensions = ::cuda::std::extents<_Tp, _Extents...>; -// not unsigned because of a bug in ::cuda::std::extents -using dimensions_index_type = int; +using dimensions_index_type = unsigned; /** * @brief Type representing a result of a multi-dimensional hierarchy query. @@ -55,19 +64,19 @@ using dimensions_index_type = int; * Extents of the result */ template -struct hierarchy_query_result : public dimensions<_Tp, _Extents...> +struct hierarchy_query_result_org : public dimensions<_Tp, _Extents...> { using _Dims = dimensions<_Tp, _Extents...>; using _Dims::_Dims; - _CCCL_API constexpr hierarchy_query_result() + _CCCL_API constexpr hierarchy_query_result_org() : _Dims() , x(_Dims::extent(0)) , y(_Dims::rank() > 1 ? _Dims::extent(1) : 1) , z(_Dims::rank() > 2 ? _Dims::extent(2) : 1) {} - _CCCL_API explicit constexpr hierarchy_query_result(const _Dims& dims) + _CCCL_API explicit constexpr hierarchy_query_result_org(const _Dims& dims) : _Dims(dims) , x(_Dims::extent(0)) , y(_Dims::rank() > 1 ? _Dims::extent(1) : 1) @@ -135,7 +144,7 @@ __dims_sum(const dimensions<_T1, _E1...>& __h1, const dimensions<_T2, _E2...>& _ template [[nodiscard]] _CCCL_API constexpr auto __convert_to_query_result(const dimensions<_Tp, _Extents...>& __result) { - return hierarchy_query_result<_Tp, _Extents...>(__result); + return hierarchy_query_result_org<_Tp, _Extents...>(__result); } [[nodiscard]] _CCCL_API constexpr auto __dim3_to_dims(const ::dim3& dims) diff --git a/libcudacxx/include/cuda/__hierarchy/get_launch_dimensions.h b/libcudacxx/include/cuda/__hierarchy/get_launch_dimensions.h new file mode 100644 index 00000000000..ee3d07ee8ae --- /dev/null +++ b/libcudacxx/include/cuda/__hierarchy/get_launch_dimensions.h @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___HIERARCHY_GET_LAUNCH_DIMENSIONS_H +#define _CUDA___HIERARCHY_GET_LAUNCH_DIMENSIONS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +/** + * @brief Returns a tuple of dim3 compatible objects that can be used to launch + * a kernel + * + * This function returns a tuple of hierarchy_query_result objects that contain + * dimensions from the supplied hierarchy, that can be used to launch that + * hierarchy. It is meant to allow for easy usage of hierarchy dimensions with + * the <<<>>> launch syntax or cudaLaunchKernelEx in case of a cluster launch. + * Contained hierarchy_query_result objects are results of extents() member + * function on the hierarchy passed in. The returned tuple has three elements if + * cluster_level is present in the hierarchy (extents(block, grid), + * extents(cluster, block), extents(thread, block)). Otherwise it contains only + * two elements, without the middle one related to the cluster. + * + * @par Snippet + * @code + * #include + * + * using namespace cuda; + * + * auto hierarchy = make_hierarchy(grid_dims(256), cluster_dims<4>(), + * block_dims<8, 8, 8>()); auto [grid_dimensions, cluster_dimensions, + * block_dimensions] = get_launch_dimensions(hierarchy); + * assert(grid_dimensions.x == 256); + * assert(cluster_dimensions.x == 4); + * assert(block_dimensions.x == 8); + * assert(block_dimensions.y == 8); + * assert(block_dimensions.z == 8); + * @endcode + * @par + * + * @param hierarchy + * Hierarchy that the launch dimensions are requested for + */ +template +constexpr auto _CCCL_HOST get_launch_dimensions(const hierarchy_dimensions<_Levels...>& __hierarchy) +{ + if constexpr (has_level_v>) + { + return ::cuda::std::make_tuple( + __hierarchy.extents(block_level{}, grid_level{}), + __hierarchy.extents(block_level{}, cluster_level{}), + __hierarchy.extents(thread_level{}, block_level{})); + } + else + { + return ::cuda::std::make_tuple( + __hierarchy.extents(block_level{}, grid_level{}), __hierarchy.extents(gpu_thread, block_level{})); + } +} + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___HIERARCHY_GET_LAUNCH_DIMENSIONS_H diff --git a/libcudacxx/include/cuda/__hierarchy/grid_level.h b/libcudacxx/include/cuda/__hierarchy/grid_level.h new file mode 100644 index 00000000000..9cd6a111709 --- /dev/null +++ b/libcudacxx/include/cuda/__hierarchy/grid_level.h @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___HIERARCHY_GRID_LEVEL_H +#define _CUDA___HIERARCHY_GRID_LEVEL_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +struct grid_level : __native_hierarchy_level_base +{ + using product_type = unsigned long long; + using allowed_above = allowed_levels<>; + using allowed_below = allowed_levels; +}; + +_CCCL_GLOBAL_CONSTANT grid_level grid; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___HIERARCHY_GRID_LEVEL_H diff --git a/libcudacxx/include/cuda/__hierarchy/hierarchy_dimensions.h b/libcudacxx/include/cuda/__hierarchy/hierarchy_dimensions.h index 0525c567174..1d375843d83 100644 --- a/libcudacxx/include/cuda/__hierarchy/hierarchy_dimensions.h +++ b/libcudacxx/include/cuda/__hierarchy/hierarchy_dimensions.h @@ -11,7 +11,19 @@ #ifndef _CUDA___HIERARCHY_HIERARCHY_DIMENSIONS_H #define _CUDA___HIERARCHY_HIERARCHY_DIMENSIONS_H +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include #include +#include #include #include #include @@ -43,12 +55,6 @@ template } } // namespace __detail -template -using __level_type_of = typename _Level::level_type; - -template -struct hierarchy_dimensions; - namespace __detail { // Function to sometimes convince the compiler something is a constexpr and not @@ -67,22 +73,6 @@ template } } -template -struct __has_level_helper; - -template -struct __has_level_helper<_QueryLevel, hierarchy_dimensions<_Unit, _Levels...>> - : public ::cuda::std::__fold_or<::cuda::std::is_same_v<_QueryLevel, __level_type_of<_Levels>>...> -{}; - -template -struct __has_unit -{}; - -template -struct __has_unit<_QueryLevel, hierarchy_dimensions<_Unit, _Levels...>> : ::cuda::std::is_same<_QueryLevel, _Unit> -{}; - template struct __get_level_helper { @@ -101,15 +91,6 @@ struct __get_level_helper }; } // namespace __detail -template -inline constexpr bool has_level = - __detail::__has_level_helper<_QueryLevel, ::cuda::std::remove_cvref_t<_Hierarchy>>::value; - -template -inline constexpr bool has_level_or_unit = - __detail::__has_level_helper<_QueryLevel, ::cuda::std::remove_cvref_t<_Hierarchy>>::value - || __detail::__has_unit<_QueryLevel, ::cuda::std::remove_cvref_t<_Hierarchy>>::value; - namespace __detail { template @@ -377,7 +358,7 @@ struct __empty_hierarchy template struct hierarchy_dimensions { - static_assert(::cuda::std::is_base_of_v || ::cuda::std::is_same_v<_BottomUnit, void>); + static_assert(__is_hierarchy_level_v<_BottomUnit> || ::cuda::std::is_same_v<_BottomUnit, void>); ::cuda::std::tuple<_Levels...> levels; _CCCL_API constexpr hierarchy_dimensions(const _Levels&... 
__ls) noexcept @@ -419,8 +400,8 @@ struct hierarchy_dimensions [[nodiscard]] _CCCL_API static constexpr auto levels_range_static(const ::cuda::std::tuple<_Levels...>& __levels) noexcept { - static_assert(has_level<_Level, hierarchy_dimensions<_BottomUnit, _Levels...>>); - static_assert(has_level_or_unit<_Unit, hierarchy_dimensions<_BottomUnit, _Levels...>>); + static_assert(has_level_v<_Level, hierarchy_dimensions<_BottomUnit, _Levels...>>); + static_assert(has_unit_or_level_v<_Unit, hierarchy_dimensions<_BottomUnit, _Levels...>>); static_assert(__detail::__legal_unit_for_level<_Unit, _Level>); auto __fn = __detail::__get_levels_range<_Level, _Unit, _Levels...>; return ::cuda::std::apply(__fn, __levels); @@ -780,7 +761,7 @@ struct hierarchy_dimensions template _CCCL_API constexpr auto level(const _Level&) const noexcept { - static_assert(has_level<_Level, hierarchy_dimensions<_BottomUnit, _Levels...>>); + static_assert(has_level_v<_Level, hierarchy_dimensions<_BottomUnit, _Levels...>>); return ::cuda::std::apply(__detail::__get_level_helper<_Level>{}, levels); } @@ -811,8 +792,8 @@ struct hierarchy_dimensions // block) return ::cuda::std::apply(fragment_helper<_OtherUnit>(), ::cuda::std::tuple_cat(levels, __other.levels)); } - else if constexpr (has_level<__this_bottom_level, hierarchy_dimensions<_OtherUnit, _OtherLevels...>> - && (!has_level<__this_top_level, hierarchy_dimensions<_OtherUnit, _OtherLevels...>> + else if constexpr (has_level_v<__this_bottom_level, hierarchy_dimensions<_OtherUnit, _OtherLevels...>> + && (!has_level_v<__this_top_level, hierarchy_dimensions<_OtherUnit, _OtherLevels...>> || ::cuda::std::is_same_v<__this_top_level, __other_top_level>) ) { // Overlap with this on the top, e.g. this is (grid, cluster), other is @@ -837,8 +818,8 @@ struct hierarchy_dimensions { // Overlap with this on the bottom, e.g. this is (cluster, block), other // is (grid, cluster), can fully overlap - static_assert(has_level<__other_bottom_level, hierarchy_dimensions<_BottomUnit, _Levels...>> - && (!has_level<__this_bottom_level, hierarchy_dimensions<_OtherUnit, _OtherLevels...>> + static_assert(has_level_v<__other_bottom_level, hierarchy_dimensions<_BottomUnit, _Levels...>> + && (!has_level_v<__this_bottom_level, hierarchy_dimensions<_OtherUnit, _OtherLevels...>> || ::cuda::std::is_same_v<__this_bottom_level, __other_bottom_level>), "Can't combine the hierarchies"); @@ -856,54 +837,6 @@ struct hierarchy_dimensions #endif // _CCCL_DOXYGEN_INVOKED }; -/** - * @brief Returns a tuple of dim3 compatible objects that can be used to launch - * a kernel - * - * This function returns a tuple of hierarchy_query_result objects that contain - * dimensions from the supplied hierarchy, that can be used to launch that - * hierarchy. It is meant to allow for easy usage of hierarchy dimensions with - * the <<<>>> launch syntax or cudaLaunchKernelEx in case of a cluster launch. - * Contained hierarchy_query_result objects are results of extents() member - * function on the hierarchy passed in. The returned tuple has three elements if - * cluster_level is present in the hierarchy (extents(block, grid), - * extents(cluster, block), extents(thread, block)). Otherwise it contains only - * two elements, without the middle one related to the cluster. 
- * - * @par Snippet - * @code - * #include - * - * using namespace cuda; - * - * auto hierarchy = make_hierarchy(grid_dims(256), cluster_dims<4>(), - * block_dims<8, 8, 8>()); auto [grid_dimensions, cluster_dimensions, - * block_dimensions] = get_launch_dimensions(hierarchy); - * assert(grid_dimensions.x == 256); - * assert(cluster_dimensions.x == 4); - * assert(block_dimensions.x == 8); - * assert(block_dimensions.y == 8); - * assert(block_dimensions.z == 8); - * @endcode - * @par - * - * @param hierarchy - * Hierarchy that the launch dimensions are requested for - */ -template -constexpr auto _CCCL_HOST get_launch_dimensions(const hierarchy_dimensions<_Levels...>& __hierarchy) -{ - if constexpr (has_level>) - { - return ::cuda::std::make_tuple( - __hierarchy.extents(block, grid), __hierarchy.extents(block, cluster), __hierarchy.extents(thread, block)); - } - else - { - return ::cuda::std::make_tuple(__hierarchy.extents(block, grid), __hierarchy.extents(thread, block)); - } -} - // TODO consider having LUnit optional argument for template argument deduction /** * @brief Creates a hierarchy from passed in levels. diff --git a/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h b/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h new file mode 100644 index 00000000000..ccf8e319fb7 --- /dev/null +++ b/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h @@ -0,0 +1,339 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___HIERARCHY_HIERARCHY_LEVEL_BASE_H +#define _CUDA___HIERARCHY_HIERARCHY_LEVEL_BASE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL ::cuda::std::size_t +__hierarchy_static_extents_mul_helper(::cuda::std::size_t __lhs, ::cuda::std::size_t __rhs) noexcept +{ + if (__lhs == ::cuda::std::dynamic_extent || __rhs == ::cuda::std::dynamic_extent) + { + return ::cuda::std::dynamic_extent; + } + else + { + return __lhs * __rhs; + } +} + +template +[[nodiscard]] _CCCL_API constexpr auto __hierarchy_static_extents_mul(::cuda::std::index_sequence<_Is...>) noexcept +{ + return ::cuda::std::extents< + _ResultIndex, + ::cuda::__hierarchy_static_extents_mul_helper((_Is < _LhsExts::rank()) ? _LhsExts::static_extent(_Is) : 1, + (_Is < _RhsExts::rank()) ? _RhsExts::static_extent(_Is) : 1)...>{}; +} + +//! @brief Multiplies 2 extents in column major order together, returning a new extents type. If the ranks don't match, +//! the extent with lower rank is padded with 1s on the right to match the rank of the other. +//! +//! @param __lhs The left hand side extents to multiply. +//! @param __rhs The right hand side extents to multiply. +//! +//! @return The result of multiplying the extents together. 
+template +[[nodiscard]] _CCCL_API constexpr auto +__hierarchy_extents_mul(const ::cuda::std::extents<_Index, _LhsExts...>& __lhs, + const ::cuda::std::extents<_Index, _RhsExts...>& __rhs) noexcept +{ + using _Lhs = ::cuda::std::extents<_Index, _LhsExts...>; + using _Rhs = ::cuda::std::extents<_Index, _RhsExts...>; + + constexpr auto __rank = ::cuda::std::max(_Lhs::rank(), _Rhs::rank()); + using _Ret = + decltype(::cuda::__hierarchy_static_extents_mul<_Index, _Lhs, _Rhs>(::cuda::std::make_index_sequence<__rank>{})); + + ::cuda::std::array<_Index, __rank> __ret{}; + for (::cuda::std::size_t __i = 0; __i < __rank; ++__i) + { + if (_Ret::static_extent(__i) == ::cuda::std::dynamic_extent) + { + __ret[__i] = static_cast<_Index>((__i < _Lhs::rank()) ? __lhs.extent(__i) : 1) + * static_cast<_Index>((__i < _Rhs::rank()) ? __rhs.extent(__i) : 1); + } + else + { + __ret[__i] = _Ret::static_extent(__i); + } + } + return _Ret{__ret}; +} + +template +[[nodiscard]] _CCCL_API constexpr ::cuda::std::extents<_Index, _StaticExts...> +__hierarchy_extents_cast(::cuda::std::extents<_OrgIndex, _StaticExts...> __org_exts) noexcept +{ + using _OrgExts = ::cuda::std::extents<_OrgIndex, _StaticExts...>; + ::cuda::std::array<_Index, _OrgExts::rank()> __ret{}; + for (::cuda::std::size_t __i = 0; __i < _OrgExts::rank(); ++__i) + { + if (_OrgExts::static_extent(__i) == ::cuda::std::dynamic_extent) + { + __ret[__i] = static_cast<_Index>(__org_exts.extent(__i)); + } + else + { + __ret[__i] = _OrgExts::static_extent(__i); + } + } + return ::cuda::std::extents<_Index, _StaticExts...>{__ret}; +} + +template +struct hierarchy_level_base +{ + using level_type = _Level; + + template + using __default_md_query_type = unsigned; + template + using __default_1d_query_type = ::cuda::std::size_t; + + _CCCL_TEMPLATE(class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(__is_hierarchy_level_v<_InLevel> _CCCL_AND __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_API static constexpr auto dims(const _InLevel& __level, const _Hierarchy& __hier) noexcept + { + return _Level::template dims_as<__default_md_query_type<_InLevel>>(__level, __hier); + } + + _CCCL_TEMPLATE(class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(__is_hierarchy_level_v<_InLevel> _CCCL_AND __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_API static constexpr auto static_dims(const _InLevel& __level, const _Hierarchy& __hier) noexcept + { + return __static_dims_impl(__level, __hier); + } + + _CCCL_TEMPLATE(class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(__is_hierarchy_level_v<_InLevel> _CCCL_AND __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_API static constexpr auto extents(const _InLevel& __level, const _Hierarchy& __hier) noexcept + { + return _Level::template extents_as<__default_md_query_type<_InLevel>>(__level, __hier); + } + + _CCCL_TEMPLATE(class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(__is_hierarchy_level_v<_InLevel> _CCCL_AND __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_API static constexpr ::cuda::std::size_t + count(const _InLevel& __level, const _Hierarchy& __hier) noexcept + { + return _Level::template count_as<__default_1d_query_type<_InLevel>>(__level, __hier); + } + +#if _CCCL_CUDA_COMPILATION() + _CCCL_TEMPLATE(class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(__is_hierarchy_level_v<_InLevel> _CCCL_AND __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_DEVICE_API static constexpr auto index(const _InLevel& __level, const _Hierarchy& __hier) noexcept + { + return _Level::template 
index_as<__default_md_query_type<_InLevel>>(__level, __hier); + } + + _CCCL_TEMPLATE(class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(__is_hierarchy_level_v<_InLevel> _CCCL_AND __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t + rank(const _InLevel& __level, const _Hierarchy& __hier) noexcept + { + return _Level::template rank_as<__default_1d_query_type<_InLevel>>(__level, __hier); + } +#endif // _CCCL_CUDA_COMPILATION() + + _CCCL_TEMPLATE(class _Tp, class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_hierarchy_level_v<_InLevel> _CCCL_AND + __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_API static constexpr auto dims_as(const _InLevel& __level, const _Hierarchy& __hier) noexcept + { + return __dims_as_impl<_Tp>(__level, __hier); + } + + _CCCL_TEMPLATE(class _Tp, class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_hierarchy_level_v<_InLevel> _CCCL_AND + __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_API static constexpr auto extents_as(const _InLevel& __in_level, const _Hierarchy& __hier) noexcept + { + static_assert(has_unit_or_level_v<_Level, _Hierarchy>, "_Hierarchy doesn't contain _Level"); + static_assert(has_level_v<_InLevel, _Hierarchy>, "_Hierarchy doesn't contain _InLevel"); + + using _NextLevel = __next_hierarchy_level_t<_Level, _Hierarchy>; + using _CurrExts = decltype(::cuda::__hierarchy_extents_cast<_Tp>(__hier.level(_NextLevel{}).dims)); + + // Remove dependency on runtime storage. This makes the queries work for hierarchy levels with all static extents + // in constant evaluated context. + _CurrExts __curr_exts{}; + if constexpr (_CurrExts::rank_dynamic() > 0) + { + __curr_exts = ::cuda::__hierarchy_extents_cast<_Tp>(__hier.level(_NextLevel{}).dims); + } + + if constexpr (!::cuda::std::is_same_v<_NextLevel, _InLevel>) + { + const auto __next_exts = _NextLevel::template extents_as<_Tp>(__in_level, __hier); + return ::cuda::__hierarchy_extents_mul(__curr_exts, __next_exts); + } + else + { + return __curr_exts; + } + } + + _CCCL_TEMPLATE(class _Tp, class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_hierarchy_level_v<_InLevel> _CCCL_AND + __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_API static constexpr auto count_as(const _InLevel& __level, const _Hierarchy& __hier) noexcept + { + return __count_as_impl<_Tp>(__level, __hier); + } + +#if _CCCL_CUDA_COMPILATION() + _CCCL_TEMPLATE(class _Tp, class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_hierarchy_level_v<_InLevel> _CCCL_AND + __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_DEVICE_API static constexpr auto + index_as(const _InLevel& __level, const _Hierarchy& __hier) noexcept + { + static_assert(has_unit_or_level_v<_Level, _Hierarchy>, "_Hierarchy doesn't contain _Level"); + static_assert(has_level_v<_InLevel, _Hierarchy>, "_Hierarchy doesn't contain _InLevel"); + + using _NextLevel = __next_hierarchy_level_t<_Level, _Hierarchy>; + if constexpr (::cuda::std::is_same_v<_InLevel, _NextLevel>) + { + using _CurrExts = decltype(_Level::template extents_as<_Tp>(_NextLevel{}, __hier)); + auto __curr_idx = _Level::template index_as<_Tp>(_NextLevel{}); + for (::cuda::std::size_t __i = _CurrExts::rank(); __i < 3; ++__i) + { + __curr_idx[__i] = 0; + } + return __curr_idx; + } + else + { + const auto __curr_exts = _Level::template extents_as<_Tp>(_NextLevel{}, 
__hier); + const auto __next_idx = _NextLevel::template index_as<_Tp>(__level, __hier); + const auto __curr_idx = _Level::template index_as<_Tp>(_NextLevel{}, __hier); + + hierarchy_query_result<_Tp> __ret{}; + for (::cuda::std::size_t __i = 0; __i < 3; ++__i) + { + __ret[__i] = __curr_idx[__i] + ((__i < __curr_exts.rank()) ? __curr_exts.extent(__i) : 1) * __next_idx[__i]; + } + return __ret; + } + } + + _CCCL_TEMPLATE(class _Tp, class _InLevel, class _Hierarchy) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_hierarchy_level_v<_InLevel> _CCCL_AND + __is_hierarchy_v<_Hierarchy>) + [[nodiscard]] _CCCL_DEVICE_API static constexpr auto + rank_as(const _InLevel& __level, const _Hierarchy& __hier) noexcept + { + static_assert(has_unit_or_level_v<_Level, _Hierarchy>, "_Hierarchy doesn't contain _Level"); + static_assert(has_level_v<_InLevel, _Hierarchy>, "_Hierarchy doesn't contain _InLevel"); + + using _NextLevel = __next_hierarchy_level_t<_Level, _Hierarchy>; + + const auto __curr_exts = _Level::template extents_as<_Tp>(_NextLevel{}, __hier); + const auto __curr_idx = _Level::template index_as<_Tp>(_NextLevel{}, __hier); + + _Tp __ret = 0; + if constexpr (!::cuda::std::is_same_v<_InLevel, _NextLevel>) + { + __ret = _NextLevel::template rank_as<_Tp>(__level, __hier) * _Level::template count_as<_Tp>(_NextLevel{}, __hier); + } + + for (::cuda::std::size_t __i = __curr_exts.rank(); __i > 0; --__i) + { + _Tp __inc = __curr_idx[__i - 1]; + for (::cuda::std::size_t __j = __i - 1; __j > 0; --__j) + { + __inc *= __curr_exts.extent(__j - 1); + } + __ret += __inc; + } + return __ret; + } +#endif // _CCCL_CUDA_COMPILATION() + +private: + template + friend struct __native_hierarchy_level_base; + + _CCCL_EXEC_CHECK_DISABLE + template + [[nodiscard]] _CCCL_API static constexpr auto __dims_as_impl(const _Args&... __args) noexcept + { + auto __exts = _Level::template extents_as<_Tp>(__args...); + using _Exts = decltype(__exts); + + hierarchy_query_result<_Tp> __ret{1, 1, 1}; + for (::cuda::std::size_t __i = 0; __i < _Exts::rank(); ++__i) + { + __ret[__i] = __exts.extent(__i); + } + return __ret; + } + + template + [[nodiscard]] _CCCL_API static constexpr auto __static_dims_impl(const _Args&... __args) noexcept + { + using _Exts = decltype(_Level::extents(__args...)); + + hierarchy_query_result<::cuda::std::size_t> __ret{1, 1, 1}; + for (::cuda::std::size_t __i = 0; __i < _Exts::rank(); ++__i) + { + __ret[__i] = _Exts::static_extent(__i); + } + return __ret; + } + + _CCCL_EXEC_CHECK_DISABLE + template + [[nodiscard]] _CCCL_API static constexpr _Tp __count_as_impl(const _Args&... 
__args) noexcept + { + const auto __exts = _Level::template extents_as<_Tp>(__args...); + + _Tp __ret = 1; + for (::cuda::std::size_t __i = 0; __i < __exts.rank(); ++__i) + { + __ret *= __exts.extent(__i); + } + return __ret; + } +}; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___HIERARCHY_HIERARCHY_LEVEL_BASE_H diff --git a/libcudacxx/include/cuda/__hierarchy/hierarchy_levels.h b/libcudacxx/include/cuda/__hierarchy/hierarchy_levels.h index 6019b4a1ba5..a629cf938a9 100644 --- a/libcudacxx/include/cuda/__hierarchy/hierarchy_levels.h +++ b/libcudacxx/include/cuda/__hierarchy/hierarchy_levels.h @@ -11,6 +11,17 @@ #ifndef _CUDA___HIERARCHY_HIERARCHY_LEVELS_H #define _CUDA___HIERARCHY_HIERARCHY_LEVELS_H +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include #include #include @@ -98,94 +109,6 @@ template inline constexpr bool __legal_unit_for_level<_Unit, void> = false; } // namespace __detail -// Base type for all hierarchy levels -struct hierarchy_level -{}; - -struct grid_level; -struct cluster_level; -struct block_level; -struct thread_level; - -/* - Types to represent CUDA threads hierarchy levels - All metadata about the hierarchy level goes here including certain forward - progress information or what adjacent levels are valid in the hierarchy for - validation. -*/ - -/** - * @brief Type representing the grid level in CUDA thread hierarchy - * - * This type can be used in hierarchy queries to refer to the - * grid level or to get that level from the hierarchy. - * There is a constexpr variable of this type available for convenience - * named grid. - */ -struct grid_level - : public hierarchy_level - , public __detail::__dimensions_query -{ - using product_type = unsigned long long; - using allowed_above = allowed_levels<>; - using allowed_below = allowed_levels; -}; -_CCCL_GLOBAL_CONSTANT grid_level grid; - -/** - * @brief Type representing the cluster level in CUDA thread hierarchy - * - * This type can be used in hierarchy queries to refer to the - * cluster level or to get that level from the hierarchy. - * There is a constexpr variable of this type available for convenience - * named cluster. - */ -struct cluster_level - : public hierarchy_level - , public __detail::__dimensions_query -{ - using product_type = unsigned int; - using allowed_above = allowed_levels; - using allowed_below = allowed_levels; -}; -_CCCL_GLOBAL_CONSTANT cluster_level cluster; - -/** - * @brief Type representing the block level in CUDA thread hierarchy - * - * This type can be used in hierarchy queries to refer to the - * block level or to get that level from the hierarchy. - * There is a constexpr variable of this type available for convenience - * named block. - */ -struct block_level - : public hierarchy_level - , public __detail::__dimensions_query -{ - using product_type = unsigned int; - using allowed_above = allowed_levels; - using allowed_below = allowed_levels; -}; -_CCCL_GLOBAL_CONSTANT block_level block; - -/** - * @brief Type representing the thread level in CUDA thread hierarchy - * - * This type can be used in hierarchy queries to specify threads as a - * unit of the query. - * There is a constexpr variable of this type available for convenience - * named thread. 
- */ -struct thread_level - : public hierarchy_level - , public __detail::__dimensions_query -{ - using product_type = unsigned int; - using allowed_above = allowed_levels; - using allowed_below = allowed_levels<>; -}; -_CCCL_GLOBAL_CONSTANT thread_level thread; - template constexpr bool is_core_cuda_hierarchy_level = ::cuda::std::is_same_v<_Level, grid_level> || ::cuda::std::is_same_v<_Level, cluster_level> diff --git a/libcudacxx/include/cuda/__hierarchy/hierarchy_query_result.h b/libcudacxx/include/cuda/__hierarchy/hierarchy_query_result.h new file mode 100644 index 00000000000..0192198ba03 --- /dev/null +++ b/libcudacxx/include/cuda/__hierarchy/hierarchy_query_result.h @@ -0,0 +1,149 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___HIERARCHY_HIERARCHY_QUERY_RESULT_H +#define _CUDA___HIERARCHY_HIERARCHY_QUERY_RESULT_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +template +struct hierarchy_query_result +{ + using value_type = _Tp; + + _Tp x; + _Tp y; + _Tp z; + + [[nodiscard]] _CCCL_API constexpr _Tp& operator[](::cuda::std::size_t __i) noexcept + { + if (__i == 0) + { + return x; + } + else if (__i == 1) + { + return y; + } + else + { + return z; + } + } + [[nodiscard]] _CCCL_API constexpr const _Tp& operator[](::cuda::std::size_t __i) const noexcept + { + if (__i == 0) + { + return x; + } + else if (__i == 1) + { + return y; + } + else + { + return z; + } + } + + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Tp2, signed char>) + _CCCL_API constexpr operator char3() const noexcept + { + return {static_cast(x), static_cast(y), static_cast(z)}; + } + + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Tp2, short>) + _CCCL_API constexpr operator short3() const noexcept + { + return {static_cast(x), static_cast(y), static_cast(z)}; + } + + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Tp2, int>) + _CCCL_API constexpr operator int3() const noexcept + { + return {static_cast(x), static_cast(y), static_cast(z)}; + } + + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Tp2, long>) + _CCCL_API constexpr operator long3() const noexcept + { + return {static_cast(x), static_cast(y), static_cast(z)}; + } + + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Tp2, long long>) + _CCCL_API constexpr operator longlong3() const noexcept + { + return {static_cast(x), static_cast(y), static_cast(z)}; + } + + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Tp2, unsigned char>) + _CCCL_API constexpr operator uchar3() const noexcept + { + return {static_cast(x), static_cast(y), static_cast(z)}; + } + + _CCCL_TEMPLATE(class _Tp2 = _Tp) + 
_CCCL_REQUIRES(::cuda::std::is_same_v<_Tp2, unsigned short>) + _CCCL_API constexpr operator ushort3() const noexcept + { + return {static_cast(x), static_cast(y), static_cast(z)}; + } + + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Tp2, unsigned>) + _CCCL_API constexpr operator uint3() const noexcept + { + return {static_cast(x), static_cast(y), static_cast(z)}; + } + + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Tp2, unsigned long>) + _CCCL_API constexpr operator ulong3() const noexcept + { + return {static_cast(x), static_cast(y), static_cast(z)}; + } + + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(::cuda::std::is_same_v<_Tp2, unsigned long long>) + _CCCL_API constexpr operator ulonglong3() const noexcept + { + return {static_cast(x), static_cast(y), static_cast(z)}; + } +}; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___HIERARCHY_HIERARCHY_QUERY_RESULT_H diff --git a/libcudacxx/include/cuda/__hierarchy/level_dimensions.h b/libcudacxx/include/cuda/__hierarchy/level_dimensions.h index 8dea5ca85d4..ec4fcd38d9f 100644 --- a/libcudacxx/include/cuda/__hierarchy/level_dimensions.h +++ b/libcudacxx/include/cuda/__hierarchy/level_dimensions.h @@ -11,6 +11,20 @@ #ifndef _CUDA___HIERARCHY_LEVEL_DIMENSIONS_H #define _CUDA___HIERARCHY_LEVEL_DIMENSIONS_H +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include #include #include #include @@ -111,7 +125,7 @@ struct __dimensions_handler<::cuda::std::integral_constant<_Dims, _Val>> template struct level_dimensions { - static_assert(::cuda::std::is_base_of_v); + static_assert(__is_hierarchy_level_v<_Level>); using level_type = _Level; // Needs alignas to work around an issue with tuple diff --git a/libcudacxx/include/cuda/__hierarchy/native_hierarchy_level_base.h b/libcudacxx/include/cuda/__hierarchy/native_hierarchy_level_base.h new file mode 100644 index 00000000000..2d97796a7a4 --- /dev/null +++ b/libcudacxx/include/cuda/__hierarchy/native_hierarchy_level_base.h @@ -0,0 +1,203 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___HIERARCHY_NATIVE_HIERARCHY_LEVEL_BASE_H +#define _CUDA___HIERARCHY_NATIVE_HIERARCHY_LEVEL_BASE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +// cudafe++ makes the queries (that are device only) return void when compiling for host, which causes host compilers +// to warn about applying [[nodiscard]] to a function that returns void. 
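+// For example, a device-only query declared as
+//   [[nodiscard]] _CCCL_DEVICE_API static auto rank(const _InLevel& __level) noexcept;
+// is seen by the host pass as returning void, and GCC/Clang/NVHPC would then warn that
+// [[nodiscard]] is ignored on a void-returning function.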
+_CCCL_DIAG_PUSH +#if _CCCL_CUDA_COMPILER(NVCC) +_CCCL_DIAG_SUPPRESS_GCC("-Wattributes") +_CCCL_DIAG_SUPPRESS_CLANG("-Wignored-attributes") +_CCCL_DIAG_SUPPRESS_NVHPC(nodiscard_doesnt_apply) +#endif // _CCCL_CUDA_COMPILER(NVCC) + +template +struct __native_hierarchy_level_base : hierarchy_level_base<_Level> +{ + using __base_type = hierarchy_level_base<_Level>; + using __base_type::count; + using __base_type::count_as; + using __base_type::dims; + using __base_type::dims_as; + using __base_type::extents; + using __base_type::extents_as; + using __base_type::index; + using __base_type::index_as; + using __base_type::rank; + using __base_type::rank_as; + using __base_type::static_dims; + + template + using __default_md_query_type = unsigned; + template + using __default_1d_query_type = ::cuda::std::size_t; + +#if _CCCL_CUDA_COMPILATION() + + _CCCL_TEMPLATE(class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static auto dims(const _InLevel& __level) noexcept + { + return _Level::template dims_as<__default_md_query_type<_InLevel>>(__level); + } + + _CCCL_TEMPLATE(class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static constexpr auto static_dims(const _InLevel& __level) noexcept + { + return __base_type::__static_dims_impl(__level); + } + + _CCCL_TEMPLATE(class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static auto extents(const _InLevel& __level) noexcept + { + return _Level::template extents_as<__default_md_query_type<_InLevel>>(__level); + } + + _CCCL_TEMPLATE(class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static auto count(const _InLevel& __level) noexcept + { + return _Level::template count_as<__default_1d_query_type<_InLevel>>(__level); + } + + _CCCL_TEMPLATE(class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static auto index(const _InLevel& __level) noexcept + { + return _Level::template index_as<__default_md_query_type<_InLevel>>(__level); + } + + _CCCL_TEMPLATE(class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static auto rank(const _InLevel& __level) noexcept + { + return _Level::template rank_as<__default_1d_query_type<_InLevel>>(__level); + } + + _CCCL_TEMPLATE(class _Tp, class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static auto dims_as(const _InLevel& __level) noexcept + { + return __base_type::template __dims_as_impl<_Tp>(__level); + } + + _CCCL_TEMPLATE(class _Tp, class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static auto extents_as(const _InLevel& __level) noexcept + { + static_assert(__is_natively_reachable_hierarchy_level_v<_Level, _InLevel>, + "_InLevel must be reachable from _Level"); + + using _NextLevel = typename _Level::__next_native_level; + auto __next_exts = _NextLevel::template extents_as<_Tp>(__level); + auto __curr_exts = _Level::template extents_as<_Tp>(_NextLevel{}); + return ::cuda::__hierarchy_extents_mul(__curr_exts, __next_exts); + } + + _CCCL_TEMPLATE(class _Tp, class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static auto count_as(const _InLevel& __level) noexcept + { + return __base_type::template __count_as_impl<_Tp>(__level); + } + + 
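+  // For example, in the default CUDA hierarchy a query composes one level at a time:
+  //   auto __tid = ::cuda::gpu_thread.index(::cuda::block); // {threadIdx.x, threadIdx.y, threadIdx.z}
+  //   auto __gid = ::cuda::gpu_thread.index(::cuda::grid);  // per axis: threadIdx + blockDim * blockIdx
+  // which is the recurrence implemented by index_as below:
+  //   __ret[__i] = __curr_idx[__i] + __curr_exts.extent(__i) * __next_idx[__i];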
_CCCL_TEMPLATE(class _Tp, class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static auto index_as(const _InLevel& __level) noexcept + { + static_assert(__is_natively_reachable_hierarchy_level_v<_Level, _InLevel>, + "_InLevel must be reachable from _Level"); + + using _NextLevel = typename _Level::__next_native_level; + const auto __curr_exts = _Level::template extents_as<_Tp>(_NextLevel{}); + const auto __next_idx = _NextLevel::template index_as<_Tp>(__level); + const auto __curr_idx = _Level::template index_as<_Tp>(_NextLevel{}); + + hierarchy_query_result<_Tp> __ret{}; + for (::cuda::std::size_t __i = 0; __i < 3; ++__i) + { + __ret[__i] = __curr_idx[__i] + ((__i < __curr_exts.rank()) ? __curr_exts.extent(__i) : 1) * __next_idx[__i]; + } + return __ret; + } + + _CCCL_TEMPLATE(class _Tp, class _InLevel) + _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) + [[nodiscard]] _CCCL_DEVICE_API static _Tp rank_as(const _InLevel& __level) noexcept + { + static_assert(__is_natively_reachable_hierarchy_level_v<_Level, _InLevel>, + "_InLevel must be reachable from _Level"); + + using _NextLevel = typename _Level::__next_native_level; + + const auto __curr_exts = _Level::template extents_as<_Tp>(_NextLevel{}); + const auto __curr_idx = _Level::template index_as<_Tp>(_NextLevel{}); + + _Tp __ret = 0; + if constexpr (!::cuda::std::is_same_v<_InLevel, _NextLevel>) + { + __ret = _NextLevel::template rank_as<_Tp>(__level) * _Level::template count_as<_Tp>(_NextLevel{}); + } + + for (::cuda::std::size_t __i = __curr_exts.rank(); __i > 0; --__i) + { + _Tp __inc = __curr_idx[__i - 1]; + for (::cuda::std::size_t __j = __i - 1; __j > 0; --__j) + { + __inc *= __curr_exts.extent(__j - 1); + } + __ret += __inc; + } + return __ret; + } + +#endif // _CCCL_CUDA_COMPILATION() +}; + +_CCCL_DIAG_POP + +template <> +struct __native_hierarchy_level_base : hierarchy_level_base +{}; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___HIERARCHY_NATIVE_HIERARCHY_LEVEL_BASE_H diff --git a/libcudacxx/include/cuda/__hierarchy/thread_level.h b/libcudacxx/include/cuda/__hierarchy/thread_level.h new file mode 100644 index 00000000000..f8beda576d1 --- /dev/null +++ b/libcudacxx/include/cuda/__hierarchy/thread_level.h @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___HIERARCHY_THREAD_LEVEL_H +#define _CUDA___HIERARCHY_THREAD_LEVEL_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +struct thread_level : __native_hierarchy_level_base +{ + using product_type = unsigned; + using allowed_above = allowed_levels; + using allowed_below = allowed_levels<>; + + using __next_native_level = block_level; + + using __base_type = __native_hierarchy_level_base; + using __base_type::extents_as; + using __base_type::index_as; + using __base_type::rank_as; + + // interactions with block level + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static ::cuda::std::dims<3, _Tp> extents_as(const block_level&) noexcept + { + return ::cuda::std::dims<3, _Tp>{ + static_cast<_Tp>(blockDim.x), static_cast<_Tp>(blockDim.y), static_cast<_Tp>(blockDim.z)}; + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static hierarchy_query_result<_Tp> index_as(const block_level&) noexcept + { + return {static_cast<_Tp>(threadIdx.x), static_cast<_Tp>(threadIdx.y), static_cast<_Tp>(threadIdx.z)}; + } + + // interactions with warp level + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] + _CCCL_DEVICE_API static constexpr ::cuda::std::extents<_Tp, 32> extents_as(const warp_level&) noexcept + { + return {}; + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static hierarchy_query_result<_Tp> index_as(const warp_level&) noexcept + { + return {static_cast<_Tp>(::cuda::ptx::get_sreg_laneid()), 0, 0}; + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static _Tp rank_as(const warp_level&) noexcept + { + return static_cast<_Tp>(::cuda::ptx::get_sreg_laneid()); + } +}; + +_CCCL_GLOBAL_CONSTANT thread_level gpu_thread; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___HIERARCHY_THREAD_LEVEL_H diff --git a/libcudacxx/include/cuda/__hierarchy/traits.h b/libcudacxx/include/cuda/__hierarchy/traits.h new file mode 100644 index 00000000000..1e64e0aaa77 --- /dev/null +++ b/libcudacxx/include/cuda/__hierarchy/traits.h @@ -0,0 +1,119 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___HIERARCHY_TRAITS_H +#define _CUDA___HIERARCHY_TRAITS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +// __is_natively_reachable_hierarchy_level_v + +template +inline constexpr bool __is_natively_reachable_hierarchy_level_helper_v = false; +template +inline constexpr bool __is_natively_reachable_hierarchy_level_helper_v< + _FromLevel, + _CurrLevel, + _ToLevel, + ::cuda::std::void_t> = + __is_natively_reachable_hierarchy_level_helper_v<_FromLevel, typename _CurrLevel::__next_native_level, _ToLevel>; +template +inline constexpr bool __is_natively_reachable_hierarchy_level_helper_v<_Level, _Level, _ToLevel> = false; +template +inline constexpr bool __is_natively_reachable_hierarchy_level_helper_v<_FromLevel, _Level, _Level> = true; + +template +inline constexpr bool __is_natively_reachable_hierarchy_level_v = false; +template +inline constexpr bool __is_natively_reachable_hierarchy_level_v< + _FromLevel, + _ToLevel, + ::cuda::std::void_t> = + __is_native_hierarchy_level_v<_ToLevel> + && __is_natively_reachable_hierarchy_level_helper_v<_FromLevel, typename _FromLevel::__next_native_level, _ToLevel>; + +// __level_type_of + +template +using __level_type_of = typename _Level::level_type; + +// has_unit_v + +template +inline constexpr bool __has_unit_helper_v = false; +template +inline constexpr bool __has_unit_helper_v<_QueryLevel, hierarchy_dimensions<_QueryLevel, _Levels...>> = true; + +// has_level_v + +template +inline constexpr bool __has_level_helper_v = false; +template +inline constexpr bool __has_level_helper_v<_QueryLevel, hierarchy_dimensions<_Unit, _Levels...>> = + (::cuda::std::is_same_v<_QueryLevel, typename _Levels::level_type> || ...); + +template +inline constexpr bool has_level_v = __has_level_helper_v<_QueryLevel, ::cuda::std::remove_cvref_t<_Hierarchy>>; + +template +inline constexpr bool has_unit_v = __has_unit_helper_v<_QueryLevel, ::cuda::std::remove_cvref_t<_Hierarchy>>; + +// has_unit_or_level_v + +template +inline constexpr bool has_unit_or_level_v = has_unit_v<_QueryLevel, _Hierarchy> || has_level_v<_QueryLevel, _Hierarchy>; + +// __next_hierarchy_level + +template +struct __next_hierarchy_level; + +template +struct __next_hierarchy_level<_Level, hierarchy_dimensions<_BottomUnit, _Levels...>> +{ + static constexpr ::cuda::std::size_t __level_idx = + ::cuda::std::__find_exactly_one_t<_Level, typename _Levels::level_type...>::value; + using __type = ::cuda::std::__type_index_c<__level_idx - 1, typename _Levels::level_type...>; +}; + +template +struct __next_hierarchy_level<_Level, hierarchy_dimensions<_Level, _Levels...>> +{ + using __type = ::cuda::std::__type_index_c<(sizeof...(_Levels) - 1), typename _Levels::level_type...>; +}; + +template +using __next_hierarchy_level_t = typename __next_hierarchy_level<_Level, _Hierarchy>::__type; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___HIERARCHY_TRAITS_H diff --git a/libcudacxx/include/cuda/__hierarchy/warp_level.h b/libcudacxx/include/cuda/__hierarchy/warp_level.h new file mode 100644 index 00000000000..6e1d9170391 --- /dev/null +++ 
b/libcudacxx/include/cuda/__hierarchy/warp_level.h @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___HIERARCHY_WARP_LEVEL_H +#define _CUDA___HIERARCHY_WARP_LEVEL_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +struct warp_level : __native_hierarchy_level_base +{ + using __next_native_level = block_level; + + using __base_type = __native_hierarchy_level_base; + using __base_type::extents_as; + using __base_type::index_as; + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static ::cuda::std::dims<1, _Tp> extents_as(const block_level&) noexcept + { + return ::cuda::std::dims<1, _Tp>{static_cast<_Tp>((gpu_thread.count(block) + 31) / 32)}; + } + + _CCCL_TEMPLATE(class _Tp) + _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) + [[nodiscard]] _CCCL_DEVICE_API static hierarchy_query_result<_Tp> index_as(const block_level&) noexcept + { + return {static_cast<_Tp>(gpu_thread.rank(block) / 32), 0, 0}; + } +}; + +_CCCL_GLOBAL_CONSTANT warp_level warp; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___HIERARCHY_WARP_LEVEL_H diff --git a/libcudacxx/include/cuda/__launch/launch.h b/libcudacxx/include/cuda/__launch/launch.h index e2df77229b6..ed30b9163a5 100644 --- a/libcudacxx/include/cuda/__launch/launch.h +++ b/libcudacxx/include/cuda/__launch/launch.h @@ -24,6 +24,11 @@ #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC) # include +# include +# include +# include +# include +# include # include # include # include @@ -82,7 +87,7 @@ _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __k static_assert(!::cuda::std::is_same_v, "Can't launch a configuration without hierarchy dimensions"); ::CUlaunchConfig __config{}; - constexpr bool __has_cluster_level = has_level; + constexpr bool __has_cluster_level = has_level_v; constexpr unsigned int __num_attrs_needed = __detail::kernel_config_count_attr_space(__conf) + __has_cluster_level; ::CUlaunchAttribute __attrs[__num_attrs_needed == 0 ? 
1 : __num_attrs_needed]; __config.attrs = &__attrs[0]; @@ -97,9 +102,9 @@ _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __k __config.gridDimX = static_cast(__conf.dims.extents(block, grid).x); __config.gridDimY = static_cast(__conf.dims.extents(block, grid).y); __config.gridDimZ = static_cast(__conf.dims.extents(block, grid).z); - __config.blockDimX = static_cast(__conf.dims.extents(thread, block).x); - __config.blockDimY = static_cast(__conf.dims.extents(thread, block).y); - __config.blockDimZ = static_cast(__conf.dims.extents(thread, block).z); + __config.blockDimX = static_cast(__conf.dims.extents(gpu_thread, block).x); + __config.blockDimY = static_cast(__conf.dims.extents(gpu_thread, block).y); + __config.blockDimZ = static_cast(__conf.dims.extents(gpu_thread, block).z); if constexpr (__has_cluster_level) { diff --git a/libcudacxx/include/cuda/hierarchy b/libcudacxx/include/cuda/hierarchy index acabf9928d4..dcb3ff104da 100644 --- a/libcudacxx/include/cuda/hierarchy +++ b/libcudacxx/include/cuda/hierarchy @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDA_HIERARCHY_ -#define _CUDA_HIERARCHY_ +#ifndef _CUDA_HIERARCHY +#define _CUDA_HIERARCHY #include @@ -21,8 +21,18 @@ # pragma system_header #endif // no system header +#include +#include +#include +#include #include +#include #include +#include #include +#include +#include +#include +#include -#endif // _CUDA_HIERARCHY_ +#endif // _CUDA_HIERARCHY diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/common/host_device.cuh b/libcudacxx/test/libcudacxx/cuda/ccclrt/common/host_device.cuh index 07202970ab8..424cd7fe024 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/common/host_device.cuh +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/common/host_device.cuh @@ -69,10 +69,10 @@ void test_host_dev(const Dims& dims, const Lambda& lambda, const Filters&... 
fil cudaLaunchAttribute attrs[1]; config.attrs = &attrs[0]; - config.blockDim = dims.extents(cuda::thread, cuda::block); + config.blockDim = dims.extents(cuda::gpu_thread, cuda::block); config.gridDim = dims.extents(cuda::block, cuda::grid); - if constexpr (cuda::has_level) + if constexpr (cuda::has_level_v) { dim3 cluster_dims = dims.extents(cuda::block, cuda::cluster); config.attrs[config.numAttrs].id = cudaLaunchAttributeClusterDimension; diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/hierarchy/hierarchy_custom_types.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/hierarchy/hierarchy_custom_types.cu index 3235ac21028..d592700eefb 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/hierarchy/hierarchy_custom_types.cu +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/hierarchy/hierarchy_custom_types.cu @@ -13,7 +13,7 @@ #include #include -struct custom_level : public cuda::hierarchy_level +struct custom_level : public cuda::hierarchy_level_base { using product_type = unsigned int; using allowed_above = cuda::allowed_levels; @@ -51,7 +51,7 @@ struct custom_level_test auto custom_block_back = custom_dims.level(cuda::block); CCCLRT_REQUIRE(custom_block_back.dummy == 2); - auto custom_dims_fragment = custom_dims.fragment(cuda::thread, cuda::block); + auto custom_dims_fragment = custom_dims.fragment(cuda::gpu_thread, cuda::block); auto custom_block_back2 = custom_dims_fragment.level(cuda::block); CCCLRT_REQUIRE(custom_block_back2.dummy == 2); @@ -62,8 +62,8 @@ struct custom_level_test cuda::level_dimensions(custom_level_dims), cuda::block_dims<256>()); - static_assert(custom_hierarchy.extents(cuda::thread, custom_level()) == dim3(512, 2, 2)); - static_assert(custom_hierarchy.count(cuda::thread, custom_level()) == 2048); + static_assert(custom_hierarchy.extents(cuda::gpu_thread, custom_level()) == dim3(512, 2, 2)); + static_assert(custom_hierarchy.count(cuda::gpu_thread, custom_level()) == 2048); test_host_dev(custom_hierarchy, *this); } diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/hierarchy/hierarchy_smoke.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/hierarchy/hierarchy_smoke.cu index e63e80a93fc..2aa81ce5732 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/hierarchy/hierarchy_smoke.cu +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/hierarchy/hierarchy_smoke.cu @@ -30,15 +30,15 @@ struct basic_test_single_dim // device-side require doesn't work with clang-cuda for now #if !_CCCL_CUDA_COMPILER(CLANG) CCCLRT_REQUIRE(dims.extents().x == grid_size * block_size); - CCCLRT_REQUIRE(dims.extents(cuda::thread).x == grid_size * block_size); - CCCLRT_REQUIRE(dims.extents(cuda::thread, cuda::grid).x == grid_size * block_size); + CCCLRT_REQUIRE(dims.extents(cuda::gpu_thread).x == grid_size * block_size); + CCCLRT_REQUIRE(dims.extents(cuda::gpu_thread, cuda::grid).x == grid_size * block_size); CCCLRT_REQUIRE(dims.count() == grid_size * block_size); - CCCLRT_REQUIRE(dims.count(cuda::thread) == grid_size * block_size); - CCCLRT_REQUIRE(dims.count(cuda::thread, cuda::grid) == grid_size * block_size); + CCCLRT_REQUIRE(dims.count(cuda::gpu_thread) == grid_size * block_size); + CCCLRT_REQUIRE(dims.count(cuda::gpu_thread, cuda::grid) == grid_size * block_size); - CCCLRT_REQUIRE(dims.extents(cuda::thread, cuda::block).x == block_size); + CCCLRT_REQUIRE(dims.extents(cuda::gpu_thread, cuda::block).x == block_size); CCCLRT_REQUIRE(dims.extents(cuda::block, cuda::grid).x == grid_size); - CCCLRT_REQUIRE(dims.count(cuda::thread, cuda::block) == block_size); + CCCLRT_REQUIRE(dims.count(cuda::gpu_thread, cuda::block) == 
block_size); CCCLRT_REQUIRE(dims.count(cuda::block, cuda::grid) == grid_size); #endif } @@ -47,34 +47,34 @@ struct basic_test_single_dim { auto dims = cuda::make_hierarchy(cuda::block_dims(), cuda::grid_dims()); static_assert(dims.extents().x == grid_size * block_size); - static_assert(dims.extents(cuda::thread).x == grid_size * block_size); - static_assert(dims.extents(cuda::thread, cuda::grid).x == grid_size * block_size); + static_assert(dims.extents(cuda::gpu_thread).x == grid_size * block_size); + static_assert(dims.extents(cuda::gpu_thread, cuda::grid).x == grid_size * block_size); static_assert(dims.count() == grid_size * block_size); - static_assert(dims.count(cuda::thread) == grid_size * block_size); - static_assert(dims.count(cuda::thread, cuda::grid) == grid_size * block_size); + static_assert(dims.count(cuda::gpu_thread) == grid_size * block_size); + static_assert(dims.count(cuda::gpu_thread, cuda::grid) == grid_size * block_size); static_assert(dims.static_count() == grid_size * block_size); - static_assert(dims.static_count(cuda::thread) == grid_size * block_size); - static_assert(dims.static_count(cuda::thread, cuda::grid) == grid_size * block_size); + static_assert(dims.static_count(cuda::gpu_thread) == grid_size * block_size); + static_assert(dims.static_count(cuda::gpu_thread, cuda::grid) == grid_size * block_size); static_assert(dims.static_extents()[0] == grid_size * block_size); - static_assert(dims.static_extents(cuda::thread)[0] == grid_size * block_size); - static_assert(dims.static_extents(cuda::thread, cuda::grid)[0] == grid_size * block_size); + static_assert(dims.static_extents(cuda::gpu_thread)[0] == grid_size * block_size); + static_assert(dims.static_extents(cuda::gpu_thread, cuda::grid)[0] == grid_size * block_size); - static_assert(dims.extents(cuda::thread, cuda::block).x == block_size); + static_assert(dims.extents(cuda::gpu_thread, cuda::block).x == block_size); static_assert(dims.extents(cuda::block, cuda::grid).x == grid_size); - static_assert(dims.count(cuda::thread, cuda::block) == block_size); + static_assert(dims.count(cuda::gpu_thread, cuda::block) == block_size); static_assert(dims.count(cuda::block, cuda::grid) == grid_size); - static_assert(dims.static_count(cuda::thread, cuda::block) == block_size); + static_assert(dims.static_count(cuda::gpu_thread, cuda::block) == block_size); static_assert(dims.static_count(cuda::block, cuda::grid) == grid_size); - static_assert(dims.static_extents(cuda::thread, cuda::block)[0] == block_size); + static_assert(dims.static_extents(cuda::gpu_thread, cuda::block)[0] == block_size); auto dims_dyn = cuda::make_hierarchy(cuda::block_dims(block_size), cuda::grid_dims(grid_size)); test_host_dev(dims_dyn, *this); - static_assert(dims_dyn.static_count(cuda::thread, cuda::block) == cuda::std::dynamic_extent); - static_assert(dims_dyn.static_count(cuda::thread, cuda::grid) == cuda::std::dynamic_extent); - static_assert(dims_dyn.static_extents(cuda::thread, cuda::block)[0] == cuda::std::dynamic_extent); - static_assert(dims_dyn.static_extents(cuda::thread, cuda::grid)[0] == cuda::std::dynamic_extent); + static_assert(dims_dyn.static_count(cuda::gpu_thread, cuda::block) == cuda::std::dynamic_extent); + static_assert(dims_dyn.static_count(cuda::gpu_thread, cuda::grid) == cuda::std::dynamic_extent); + static_assert(dims_dyn.static_extents(cuda::gpu_thread, cuda::block)[0] == cuda::std::dynamic_extent); + static_assert(dims_dyn.static_extents(cuda::gpu_thread, cuda::grid)[0] == cuda::std::dynamic_extent); // Test that we can 
also drop the empty parens in the level constructors: auto config = cuda::make_hierarchy(cuda::block_dims, cuda::grid_dims); @@ -92,18 +92,18 @@ struct basic_test_multi_dim // device-side require doesn't work with clang-cuda for now #if !_CCCL_CUDA_COMPILER(CLANG) CCCLRT_REQUIRE(dims.extents() == dim3(32, 12, 4)); - CCCLRT_REQUIRE(dims.extents(cuda::thread) == dim3(32, 12, 4)); - CCCLRT_REQUIRE(dims.extents(cuda::thread, cuda::grid) == dim3(32, 12, 4)); + CCCLRT_REQUIRE(dims.extents(cuda::gpu_thread) == dim3(32, 12, 4)); + CCCLRT_REQUIRE(dims.extents(cuda::gpu_thread, cuda::grid) == dim3(32, 12, 4)); CCCLRT_REQUIRE(dims.extents().extent(0) == 32); CCCLRT_REQUIRE(dims.extents().extent(1) == 12); CCCLRT_REQUIRE(dims.extents().extent(2) == 4); CCCLRT_REQUIRE(dims.count() == 512 * 3); - CCCLRT_REQUIRE(dims.count(cuda::thread) == 512 * 3); - CCCLRT_REQUIRE(dims.count(cuda::thread, cuda::grid) == 512 * 3); + CCCLRT_REQUIRE(dims.count(cuda::gpu_thread) == 512 * 3); + CCCLRT_REQUIRE(dims.count(cuda::gpu_thread, cuda::grid) == 512 * 3); - CCCLRT_REQUIRE(dims.extents(cuda::thread, cuda::block) == dim3(2, 3, 4)); + CCCLRT_REQUIRE(dims.extents(cuda::gpu_thread, cuda::block) == dim3(2, 3, 4)); CCCLRT_REQUIRE(dims.extents(cuda::block, cuda::grid) == dim3(16, 4, 1)); - CCCLRT_REQUIRE(dims.count(cuda::thread, cuda::block) == 24); + CCCLRT_REQUIRE(dims.count(cuda::gpu_thread, cuda::block) == 24); CCCLRT_REQUIRE(dims.count(cuda::block, cuda::grid) == 64); #endif } @@ -113,30 +113,30 @@ struct basic_test_multi_dim auto dims_multidim = cuda::make_hierarchy(cuda::block_dims<2, 3, 4>(), cuda::grid_dims<16, 4, 1>()); static_assert(dims_multidim.extents() == dim3(32, 12, 4)); - static_assert(dims_multidim.extents(cuda::thread) == dim3(32, 12, 4)); - static_assert(dims_multidim.extents(cuda::thread, cuda::grid) == dim3(32, 12, 4)); + static_assert(dims_multidim.extents(cuda::gpu_thread) == dim3(32, 12, 4)); + static_assert(dims_multidim.extents(cuda::gpu_thread, cuda::grid) == dim3(32, 12, 4)); static_assert(dims_multidim.extents().extent(0) == 32); static_assert(dims_multidim.extents().extent(1) == 12); static_assert(dims_multidim.extents().extent(2) == 4); static_assert(dims_multidim.count() == 512 * 3); - static_assert(dims_multidim.count(cuda::thread) == 512 * 3); - static_assert(dims_multidim.count(cuda::thread, cuda::grid) == 512 * 3); + static_assert(dims_multidim.count(cuda::gpu_thread) == 512 * 3); + static_assert(dims_multidim.count(cuda::gpu_thread, cuda::grid) == 512 * 3); static_assert(dims_multidim.static_count() == 512 * 3); - static_assert(dims_multidim.static_count(cuda::thread) == 512 * 3); - static_assert(dims_multidim.static_count(cuda::thread, cuda::grid) == 512 * 3); + static_assert(dims_multidim.static_count(cuda::gpu_thread) == 512 * 3); + static_assert(dims_multidim.static_count(cuda::gpu_thread, cuda::grid) == 512 * 3); static_assert(dims_multidim.static_extents() == cuda::std::array{32, 12, 4}); - static_assert(dims_multidim.static_extents(cuda::thread) == cuda::std::array{32, 12, 4}); + static_assert(dims_multidim.static_extents(cuda::gpu_thread) == cuda::std::array{32, 12, 4}); static_assert( - dims_multidim.static_extents(cuda::thread, cuda::grid) == cuda::std::array{32, 12, 4}); + dims_multidim.static_extents(cuda::gpu_thread, cuda::grid) == cuda::std::array{32, 12, 4}); - static_assert(dims_multidim.extents(cuda::thread, cuda::block) == dim3(2, 3, 4)); + static_assert(dims_multidim.extents(cuda::gpu_thread, cuda::block) == dim3(2, 3, 4)); 
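+  // The (cuda::gpu_thread, cuda::block) queries above report the block-local shape from
+  // block_dims<2, 3, 4>(), while the (cuda::gpu_thread, cuda::grid) queries multiply through
+  // every level: dim3(2 * 16, 3 * 4, 4 * 1) == dim3(32, 12, 4).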
static_assert(dims_multidim.extents(cuda::block, cuda::grid) == dim3(16, 4, 1)); - static_assert(dims_multidim.count(cuda::thread, cuda::block) == 24); + static_assert(dims_multidim.count(cuda::gpu_thread, cuda::block) == 24); static_assert(dims_multidim.count(cuda::block, cuda::grid) == 64); - static_assert(dims_multidim.static_count(cuda::thread, cuda::block) == 24); + static_assert(dims_multidim.static_count(cuda::gpu_thread, cuda::block) == 24); static_assert(dims_multidim.static_count(cuda::block, cuda::grid) == 64); static_assert( - dims_multidim.static_extents(cuda::thread, cuda::block) == cuda::std::array{2, 3, 4}); + dims_multidim.static_extents(cuda::gpu_thread, cuda::block) == cuda::std::array{2, 3, 4}); static_assert( dims_multidim.static_extents(cuda::block, cuda::grid) == cuda::std::array{16, 4, 1}); @@ -144,8 +144,8 @@ struct basic_test_multi_dim test_host_dev(dims_multidim_dyn, *this); - static_assert(dims_multidim_dyn.static_count(cuda::thread, cuda::block) == cuda::std::dynamic_extent); - static_assert(dims_multidim_dyn.static_count(cuda::thread, cuda::grid) == cuda::std::dynamic_extent); + static_assert(dims_multidim_dyn.static_count(cuda::gpu_thread, cuda::block) == cuda::std::dynamic_extent); + static_assert(dims_multidim_dyn.static_count(cuda::gpu_thread, cuda::grid) == cuda::std::dynamic_extent); } }; @@ -159,14 +159,14 @@ struct basic_test_mixed // device-side require doesn't work with clang-cuda for now #if !_CCCL_CUDA_COMPILER(CLANG) CCCLRT_REQUIRE(dims.extents() == dim3(2048, 4, 2)); - CCCLRT_REQUIRE(dims.extents(cuda::thread) == dim3(2048, 4, 2)); - CCCLRT_REQUIRE(dims.extents(cuda::thread, cuda::grid) == dim3(2048, 4, 2)); + CCCLRT_REQUIRE(dims.extents(cuda::gpu_thread) == dim3(2048, 4, 2)); + CCCLRT_REQUIRE(dims.extents(cuda::gpu_thread, cuda::grid) == dim3(2048, 4, 2)); CCCLRT_REQUIRE(dims.extents().extent(0) == 2048); CCCLRT_REQUIRE(dims.extents().extent(1) == 4); CCCLRT_REQUIRE(dims.extents().extent(2) == 2); CCCLRT_REQUIRE(dims.count() == 16 * 1024); - CCCLRT_REQUIRE(dims.count(cuda::thread) == 16 * 1024); - CCCLRT_REQUIRE(dims.count(cuda::thread, cuda::grid) == 16 * 1024); + CCCLRT_REQUIRE(dims.count(cuda::gpu_thread) == 16 * 1024); + CCCLRT_REQUIRE(dims.count(cuda::gpu_thread, cuda::grid) == 16 * 1024); CCCLRT_REQUIRE(dims.extents(cuda::block, cuda::grid) == dim3(8, 4, 2)); CCCLRT_REQUIRE(dims.count(cuda::block, cuda::grid) == 64); @@ -178,11 +178,11 @@ struct basic_test_mixed auto dims_mixed = cuda::make_hierarchy(cuda::block_dims(), cuda::grid_dims(dim3(8, 4, 2))); test_host_dev(dims_mixed, *this); - static_assert(dims_mixed.extents(cuda::thread, cuda::block) == block_size); - static_assert(dims_mixed.count(cuda::thread, cuda::block) == block_size); - static_assert(dims_mixed.static_count(cuda::thread, cuda::block) == block_size); + static_assert(dims_mixed.extents(cuda::gpu_thread, cuda::block) == block_size); + static_assert(dims_mixed.count(cuda::gpu_thread, cuda::block) == block_size); + static_assert(dims_mixed.static_count(cuda::gpu_thread, cuda::block) == block_size); static_assert(dims_mixed.static_count(cuda::block, cuda::grid) == cuda::std::dynamic_extent); - static_assert(dims_mixed.static_extents(cuda::thread, cuda::block)[0] == block_size); + static_assert(dims_mixed.static_extents(cuda::gpu_thread, cuda::block)[0] == block_size); // TODO include mixed static and dynamic info on a single level // Currently bugged in std::extents @@ -209,7 +209,7 @@ struct basic_test_cluster CCCLRT_REQUIRE(dims.extents(cuda::block, cuda::grid) == dim3(2, 6, 
9)); CCCLRT_REQUIRE(dims.count(cuda::block, cuda::grid) == 108); CCCLRT_REQUIRE(dims.extents(cuda::cluster, cuda::grid) == dim3(1, 3, 9)); - CCCLRT_REQUIRE(dims.extents(cuda::thread, cuda::cluster) == dim3(512, 2, 1)); + CCCLRT_REQUIRE(dims.extents(cuda::gpu_thread, cuda::cluster) == dim3(512, 2, 1)); #endif } @@ -224,13 +224,13 @@ struct basic_test_cluster static_assert(dims.static_count() == 1024 * 1024); static_assert(dims.static_extents()[0] == 1024 * 1024); - static_assert(dims.extents(cuda::thread, cuda::block).x == 256); + static_assert(dims.extents(cuda::gpu_thread, cuda::block).x == 256); static_assert(dims.extents(cuda::block, cuda::grid).x == 4 * 1024); - static_assert(dims.count(cuda::thread, cuda::cluster) == 2 * 1024); + static_assert(dims.count(cuda::gpu_thread, cuda::cluster) == 2 * 1024); static_assert(dims.count(cuda::cluster) == 512); static_assert(dims.static_count(cuda::cluster) == 512); static_assert(dims.static_count(cuda::block, cuda::cluster) == 8); - static_assert(dims.static_extents(cuda::thread, cuda::block)[0] == 256); + static_assert(dims.static_extents(cuda::gpu_thread, cuda::block)[0] == 256); static_assert(dims.static_extents(cuda::block, cuda::grid)[0] == 4 * 1024); } SECTION("Mixed cluster dims") @@ -238,12 +238,12 @@ struct basic_test_cluster auto dims_mixed = cuda::make_hierarchy( cuda::block_dims<256>(), cuda::cluster_dims(dim3(2, 2, 1)), cuda::grid_dims(dim3(1, 3, 9))); test_host_dev(dims_mixed, *this, arch_filter, 90>); - static_assert(dims_mixed.extents(cuda::thread, cuda::block) == 256); - static_assert(dims_mixed.count(cuda::thread, cuda::block) == 256); - static_assert(dims_mixed.static_count(cuda::thread, cuda::block) == 256); + static_assert(dims_mixed.extents(cuda::gpu_thread, cuda::block) == 256); + static_assert(dims_mixed.count(cuda::gpu_thread, cuda::block) == 256); + static_assert(dims_mixed.static_count(cuda::gpu_thread, cuda::block) == 256); static_assert(dims_mixed.static_count(cuda::block, cuda::cluster) == cuda::std::dynamic_extent); static_assert(dims_mixed.static_count(cuda::block) == cuda::std::dynamic_extent); - static_assert(dims_mixed.static_extents(cuda::thread, cuda::block)[0] == 256); + static_assert(dims_mixed.static_extents(cuda::gpu_thread, cuda::block)[0] == 256); static_assert(dims_mixed.static_extents(cuda::block, cuda::cluster)[0] == cuda::std::dynamic_extent); static_assert(dims_mixed.static_extents(cuda::block)[0] == cuda::std::dynamic_extent); } @@ -294,16 +294,16 @@ C2H_TEST("Different constructions", "[hierarchy]") cuda::block_dims()); static_assert(std::is_same_v); - static_assert(config.dims.count(cuda::thread, cuda::block) == block_size); - static_assert(config.dims.count(cuda::thread, cuda::cluster) == cluster_cnt * + static_assert(config.dims.count(cuda::gpu_thread, cuda::block) == block_size); + static_assert(config.dims.count(cuda::gpu_thread, cuda::cluster) == cluster_cnt * block_size); static_assert(config.dims.count(cuda::block, cuda::cluster) == cluster_cnt); CCCLRT_REQUIRE(config.dims.count() == grid_size * cluster_cnt * block_size); - static_assert(cuda::has_level); - static_assert(cuda::has_level); - static_assert(cuda::has_level); - static_assert(!cuda::has_level); + static_assert(cuda::has_level_v); + static_assert(cuda::has_level_v); + static_assert(cuda::has_level_v); + static_assert(!cuda::has_level_v); */ } @@ -311,16 +311,16 @@ C2H_TEST("Replace level", "[hierarchy]") { const auto dimensions = cuda::make_hierarchy(cuda::block_dims<512>(), cuda::cluster_dims<8>(), cuda::grid_dims(256)); const 
auto fragment = dimensions.fragment(cuda::block, cuda::grid); - static_assert(!cuda::has_level); - static_assert(!cuda::has_level_or_unit); - static_assert(cuda::has_level); - static_assert(cuda::has_level); - static_assert(cuda::has_level_or_unit); + static_assert(!cuda::has_level_v); + static_assert(!cuda::has_unit_or_level_v); + static_assert(cuda::has_level_v); + static_assert(cuda::has_level_v); + static_assert(cuda::has_unit_or_level_v); const auto replaced = cuda::hierarchy_add_level(fragment, cuda::block_dims(256)); - static_assert(cuda::has_level); - static_assert(cuda::has_level_or_unit); - CCCLRT_REQUIRE(replaced.count(cuda::thread, cuda::block) == 256); + static_assert(cuda::has_level_v); + static_assert(cuda::has_unit_or_level_v); + CCCLRT_REQUIRE(replaced.count(cuda::gpu_thread, cuda::block) == 256); } template @@ -329,34 +329,34 @@ __global__ void kernel(Hierarchy hierarchy) auto grid = cg::this_grid(); auto block = cg::this_thread_block(); - CCCLRT_REQUIRE_DEVICE(grid.thread_rank() == (cuda::hierarchy::rank(cuda::thread, cuda::grid))); + CCCLRT_REQUIRE_DEVICE(grid.thread_rank() == (cuda::hierarchy::rank(cuda::gpu_thread, cuda::grid))); CCCLRT_REQUIRE_DEVICE(grid.block_rank() == (cuda::hierarchy::rank(cuda::block, cuda::grid))); - CCCLRT_REQUIRE_DEVICE(grid.thread_rank() == cuda::grid.rank(cuda::thread)); - CCCLRT_REQUIRE_DEVICE(grid.block_rank() == cuda::grid.rank(cuda::block)); + CCCLRT_REQUIRE_DEVICE(grid.thread_rank() == cuda::gpu_thread.rank(cuda::grid)); + CCCLRT_REQUIRE_DEVICE(grid.block_rank() == cuda::block.rank(cuda::grid)); CCCLRT_REQUIRE_DEVICE(grid.block_index() == (cuda::hierarchy::index(cuda::block, cuda::grid))); - CCCLRT_REQUIRE_DEVICE(grid.block_index() == cuda::grid.index(cuda::block)); + CCCLRT_REQUIRE_DEVICE(grid.block_index() == cuda::block.index(cuda::grid)); - CCCLRT_REQUIRE_DEVICE(grid.num_threads() == (cuda::hierarchy::count(cuda::thread, cuda::grid))); + CCCLRT_REQUIRE_DEVICE(grid.num_threads() == (cuda::hierarchy::count(cuda::gpu_thread, cuda::grid))); CCCLRT_REQUIRE_DEVICE(grid.num_blocks() == (cuda::hierarchy::count(cuda::block, cuda::grid))); - CCCLRT_REQUIRE_DEVICE(grid.num_threads() == (cuda::grid.count(cuda::thread))); - CCCLRT_REQUIRE_DEVICE(grid.num_blocks() == cuda::grid.count(cuda::block)); + CCCLRT_REQUIRE_DEVICE(grid.num_threads() == (cuda::gpu_thread.count(cuda::grid))); + CCCLRT_REQUIRE_DEVICE(grid.num_blocks() == cuda::block.count(cuda::grid)); CCCLRT_REQUIRE_DEVICE(grid.dim_blocks() == (cuda::hierarchy::extents())); - CCCLRT_REQUIRE_DEVICE(grid.dim_blocks() == cuda::grid.extents(cuda::block)); + CCCLRT_REQUIRE_DEVICE(grid.dim_blocks() == dim3{cuda::block.dims(cuda::grid)}); CCCLRT_REQUIRE_DEVICE(block.thread_rank() == (cuda::hierarchy::rank())); CCCLRT_REQUIRE_DEVICE(block.thread_index() == (cuda::hierarchy::index())); CCCLRT_REQUIRE_DEVICE(block.num_threads() == (cuda::hierarchy::count())); CCCLRT_REQUIRE_DEVICE(block.dim_threads() == (cuda::hierarchy::extents())); - CCCLRT_REQUIRE_DEVICE(block.thread_rank() == cuda::block.rank(cuda::thread)); - CCCLRT_REQUIRE_DEVICE(block.thread_index() == cuda::block.index(cuda::thread)); - CCCLRT_REQUIRE_DEVICE(block.num_threads() == cuda::block.count(cuda::thread)); - CCCLRT_REQUIRE_DEVICE(block.dim_threads() == cuda::block.extents(cuda::thread)); + CCCLRT_REQUIRE_DEVICE(block.thread_rank() == cuda::gpu_thread.rank(cuda::block)); + CCCLRT_REQUIRE_DEVICE(block.thread_index() == cuda::gpu_thread.index(cuda::block)); + CCCLRT_REQUIRE_DEVICE(block.num_threads() == 
cuda::gpu_thread.count(cuda::block)); + CCCLRT_REQUIRE_DEVICE(block.dim_threads() == dim3{cuda::gpu_thread.dims(cuda::block)}); - auto block_index = hierarchy.index(cuda::thread, cuda::block); + auto block_index = hierarchy.index(cuda::gpu_thread, cuda::block); CCCLRT_REQUIRE_DEVICE(block_index == block.thread_index()); auto grid_index = hierarchy.index(); CCCLRT_REQUIRE_DEVICE( @@ -370,7 +370,7 @@ __global__ void kernel(Hierarchy hierarchy) == static_cast(grid.block_index().z) * block.dim_threads().z + block.thread_index().z); CCCLRT_REQUIRE_DEVICE(hierarchy.rank(cuda::block) == grid.block_rank()); - CCCLRT_REQUIRE_DEVICE(hierarchy.rank(cuda::thread, cuda::block) == block.thread_rank()); + CCCLRT_REQUIRE_DEVICE(hierarchy.rank(cuda::gpu_thread, cuda::block) == block.thread_rank()); CCCLRT_REQUIRE_DEVICE(hierarchy.rank() == grid.thread_rank()); } @@ -400,14 +400,14 @@ C2H_TEST("Dims queries indexing and ambient hierarchy", "[hierarchy]") template __global__ void rank_kernel_optimized(Hierarchy hierarchy, unsigned int* out) { - auto thread_id = hierarchy.rank(cuda::thread, cuda::block); + auto thread_id = hierarchy.rank(cuda::gpu_thread, cuda::block); out[thread_id] = thread_id; } template __global__ void rank_kernel(Hierarchy hierarchy, unsigned int* out) { - auto thread_id = cuda::hierarchy::rank(cuda::thread, cuda::block); + auto thread_id = cuda::hierarchy::rank(cuda::gpu_thread, cuda::block); out[thread_id] = thread_id; } @@ -438,36 +438,36 @@ template __global__ void examples_kernel(Hierarchy hierarchy) { { - auto thread_index_in_block = hierarchy.index(cuda::thread, cuda::block); + auto thread_index_in_block = hierarchy.index(cuda::gpu_thread, cuda::block); CCCLRT_REQUIRE_DEVICE(thread_index_in_block == threadIdx); auto block_index_in_grid = hierarchy.index(cuda::block); CCCLRT_REQUIRE_DEVICE(block_index_in_grid == blockIdx); } { - int thread_rank_in_block = hierarchy.rank(cuda::thread, cuda::block); + int thread_rank_in_block = hierarchy.rank(cuda::gpu_thread, cuda::block); int block_rank_in_grid = hierarchy.rank(cuda::block); } { // Can be called with the instances of level types - int num_threads_in_block = cuda::hierarchy::count(cuda::thread, cuda::block); - int num_blocks_in_grid = cuda::grid.count(cuda::block); + int num_threads_in_block = cuda::hierarchy::count(cuda::gpu_thread, cuda::block); + int num_blocks_in_grid = cuda::block.count(cuda::grid); // Or using the level types as template arguments int num_threads_in_grid = cuda::hierarchy::count(); } { // Can be called with the instances of level types - int thread_rank_in_block = cuda::hierarchy::rank(cuda::thread, cuda::block); - int block_rank_in_grid = cuda::grid.rank(cuda::block); + int thread_rank_in_block = cuda::hierarchy::rank(cuda::gpu_thread, cuda::block); + int block_rank_in_grid = cuda::block.rank(cuda::grid); // Or using the level types as template arguments int thread_rank_in_grid = cuda::hierarchy::rank(); } { // Can be called with the instances of level types - auto block_dims = cuda::hierarchy::extents(cuda::thread, cuda::block); + auto block_dims = cuda::hierarchy::extents(cuda::gpu_thread, cuda::block); CCCLRT_REQUIRE_DEVICE(block_dims == blockDim); - auto grid_dims = cuda::grid.extents(cuda::block); + dim3 grid_dims{cuda::block.dims(cuda::grid)}; CCCLRT_REQUIRE_DEVICE(grid_dims == gridDim); // Or using the level types as template arguments @@ -475,10 +475,10 @@ __global__ void examples_kernel(Hierarchy hierarchy) } { // Can be called with the instances of level types - auto thread_index_in_block = 
cuda::hierarchy::index(cuda::thread, cuda::block); - CCCLRT_REQUIRE_DEVICE(thread_index_in_block == threadIdx); - auto block_index_in_grid = cuda::grid.index(cuda::block); - CCCLRT_REQUIRE_DEVICE(block_index_in_grid == blockIdx); + auto thread_index_in_block = cuda::gpu_thread.index(cuda::block); + CCCLRT_REQUIRE_DEVICE(static_cast(thread_index_in_block) == threadIdx); + auto block_index_in_grid = cuda::block.index(cuda::grid); + CCCLRT_REQUIRE_DEVICE(static_cast(block_index_in_grid) == blockIdx); // Or using the level types as template arguments auto thread_index_in_grid = cuda::hierarchy::index(); @@ -494,25 +494,25 @@ C2H_TEST("Examples", "[hierarchy]") auto hierarchy = cuda::make_hierarchy(cuda::grid_dims(256), cuda::cluster_dims<4>(), cuda::block_dims<8, 8, 8>()); auto fragment = hierarchy.fragment(cuda::block, cuda::grid); auto new_hierarchy = cuda::hierarchy_add_level(fragment, cuda::block_dims<128>()); - static_assert(new_hierarchy.count(cuda::thread, cuda::block) == 128); + static_assert(new_hierarchy.count(cuda::gpu_thread, cuda::block) == 128); } { auto hierarchy = cuda::make_hierarchy(cuda::grid_dims(256), cuda::cluster_dims<4>(), cuda::block_dims<8, 8, 8>()); - static_assert(hierarchy.count(cuda::thread, cuda::cluster) == 4 * 8 * 8 * 8); + static_assert(hierarchy.count(cuda::gpu_thread, cuda::cluster) == 4 * 8 * 8 * 8); CCCLRT_REQUIRE(hierarchy.count() == 256 * 4 * 8 * 8 * 8); CCCLRT_REQUIRE(hierarchy.count(cuda::cluster) == 256); } { [[maybe_unused]] auto hierarchy = cuda::make_hierarchy(cuda::grid_dims(256), cuda::cluster_dims<4>(), cuda::block_dims<8, 8, 8>()); - static_assert(hierarchy.static_count(cuda::thread, cuda::cluster) == 4 * 8 * 8 * 8); + static_assert(hierarchy.static_count(cuda::gpu_thread, cuda::cluster) == 4 * 8 * 8 * 8); CCCLRT_REQUIRE(hierarchy.static_count() == cuda::std::dynamic_extent); } { auto hierarchy = cuda::make_hierarchy(cuda::grid_dims(256), cuda::cluster_dims<4>(), cuda::block_dims<8, 8, 8>()); - static_assert(hierarchy.extents(cuda::thread, cuda::cluster).extent(0) == 4 * 8); - static_assert(hierarchy.extents(cuda::thread, cuda::cluster).extent(1) == 8); - static_assert(hierarchy.extents(cuda::thread, cuda::cluster).extent(2) == 8); + static_assert(hierarchy.extents(cuda::gpu_thread, cuda::cluster).extent(0) == 4 * 8); + static_assert(hierarchy.extents(cuda::gpu_thread, cuda::cluster).extent(1) == 8); + static_assert(hierarchy.extents(cuda::gpu_thread, cuda::cluster).extent(2) == 8); CCCLRT_REQUIRE(hierarchy.extents().extent(0) == 256 * 4 * 8); CCCLRT_REQUIRE(hierarchy.extents(cuda::cluster).extent(0) == 256); } @@ -568,11 +568,11 @@ C2H_TEST("Trivially constructable", "[hierarchy]") C2H_TEST("cuda::distribute", "[hierarchy]") { - int numElements = 50000; + unsigned numElements = 50000; constexpr int threadsPerBlock = 256; auto config = cuda::distribute(numElements); - CCCLRT_REQUIRE(config.dims.count(cuda::thread, cuda::block) == 256); + CCCLRT_REQUIRE(config.dims.count(cuda::gpu_thread, cuda::block) == 256); CCCLRT_REQUIRE(config.dims.count(cuda::block, cuda::grid) == (numElements + threadsPerBlock - 1) / threadsPerBlock); } @@ -583,45 +583,45 @@ C2H_TEST("hierarchy merge", "[hierarchy]") auto h1 = cuda::make_hierarchy(cuda::grid_dims<2>()); auto h2 = cuda::make_hierarchy(cuda::block_dims<3>()); auto combined = h1.combine(h2); - static_assert(combined.count(cuda::thread) == 6); - static_assert(combined.count(cuda::thread, cuda::block) == 3); + static_assert(combined.count(cuda::gpu_thread) == 6); + 
static_assert(combined.count(cuda::gpu_thread, cuda::block) == 3); static_assert(combined.count(cuda::block) == 2); auto combined_the_other_way = h2.combine(h1); static_assert(cuda::std::is_same_v); - static_assert(combined_the_other_way.count(cuda::thread) == 6); + static_assert(combined_the_other_way.count(cuda::gpu_thread) == 6); auto dynamic_values = cuda::make_hierarchy(cuda::cluster_dims(4), cuda::block_dims(5)); auto combined_dynamic = dynamic_values.combine(h1); - CCCLRT_REQUIRE(combined_dynamic.count(cuda::thread) == 40); + CCCLRT_REQUIRE(combined_dynamic.count(cuda::gpu_thread) == 40); } SECTION("Overlapping") { auto h1 = cuda::make_hierarchy(cuda::grid_dims<2>(), cuda::cluster_dims<3>()); auto h2 = cuda::make_hierarchy(cuda::block_dims<4>(), cuda::cluster_dims<5>()); auto combined = h1.combine(h2); - static_assert(combined.count(cuda::thread) == 24); - static_assert(combined.count(cuda::thread, cuda::block) == 4); + static_assert(combined.count(cuda::gpu_thread) == 24); + static_assert(combined.count(cuda::gpu_thread, cuda::block) == 4); static_assert(combined.count(cuda::block) == 6); auto combined_the_other_way = h2.combine(h1); static_assert(!cuda::std::is_same_v); - static_assert(combined_the_other_way.count(cuda::thread) == 40); - static_assert(combined_the_other_way.count(cuda::thread, cuda::block) == 4); + static_assert(combined_the_other_way.count(cuda::gpu_thread) == 40); + static_assert(combined_the_other_way.count(cuda::gpu_thread, cuda::block) == 4); static_assert(combined_the_other_way.count(cuda::block) == 10); auto ultimate_combination = combined.combine(combined_the_other_way); static_assert(cuda::std::is_same_v); - static_assert(ultimate_combination.count(cuda::thread) == 24); + static_assert(ultimate_combination.count(cuda::gpu_thread) == 24); auto block_level_replacement = cuda::make_hierarchy(cuda::block_dims<6>()); auto with_block_replaced = block_level_replacement.combine(combined); - static_assert(with_block_replaced.count(cuda::thread) == 36); - static_assert(with_block_replaced.count(cuda::thread, cuda::block) == 6); + static_assert(with_block_replaced.count(cuda::gpu_thread) == 36); + static_assert(with_block_replaced.count(cuda::gpu_thread, cuda::block) == 6); auto grid_cluster_level_replacement = cuda::make_hierarchy(cuda::grid_dims<7>(), cuda::cluster_dims<8>()); auto with_grid_cluster_replaced = grid_cluster_level_replacement.combine(combined); - static_assert(with_grid_cluster_replaced.count(cuda::thread) == 7 * 8 * 4); + static_assert(with_grid_cluster_replaced.count(cuda::gpu_thread) == 7 * 8 * 4); static_assert(with_grid_cluster_replaced.count(cuda::block, cuda::cluster) == 8); static_assert(with_grid_cluster_replaced.count(cuda::cluster) == 7); } diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/configuration.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/configuration.cu index cf7e15183cd..2508df8b7c2 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/configuration.cu +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/configuration.cu @@ -204,10 +204,10 @@ C2H_TEST("Hierarchy construction in config", "[launch]") static_assert(config.dims.count(cuda::block) == 2); auto config_larger = cuda::make_config(cuda::grid_dims<2>(), cuda::block_dims(256), cuda::cooperative_launch()); - CCCLRT_REQUIRE(config_larger.dims.count(cuda::thread) == 512); + CCCLRT_REQUIRE(config_larger.dims.count(cuda::gpu_thread) == 512); auto config_no_options = cuda::make_config(cuda::grid_dims(2), cuda::block_dims<128>()); - 
CCCLRT_REQUIRE(config_no_options.dims.count(cuda::thread) == 256); + CCCLRT_REQUIRE(config_no_options.dims.count(cuda::gpu_thread) == 256); [[maybe_unused]] auto config_no_dims = cuda::make_config(cuda::cooperative_launch()); static_assert(cuda::std::is_same_v); @@ -231,18 +231,18 @@ C2H_TEST("Configuration combine", "[launch]") static_assert(cuda::std::is_same_v); static_assert(cuda::std::is_same_v); static_assert(cuda::std::is_same_v); - CCCLRT_REQUIRE(combined.dims.count(cuda::thread) == 512); + CCCLRT_REQUIRE(combined.dims.count(cuda::gpu_thread) == 512); } SECTION("Combine with overlap") { auto config_part1 = make_config(grid, cluster, cuda::launch_priority(2)); auto config_part2 = make_config(cuda::cluster_dims<256>(), block, cuda::launch_priority(42)); auto combined = config_part1.combine(config_part2); - CCCLRT_REQUIRE(combined.dims.count(cuda::thread) == 2048); + CCCLRT_REQUIRE(combined.dims.count(cuda::gpu_thread) == 2048); CCCLRT_REQUIRE(cuda::std::get<0>(combined.options).priority == 2); auto replaced_one_option = cuda::make_config(cuda::launch_priority(3)).combine(combined); - CCCLRT_REQUIRE(replaced_one_option.dims.count(cuda::thread) == 2048); + CCCLRT_REQUIRE(replaced_one_option.dims.count(cuda::gpu_thread) == 2048); CCCLRT_REQUIRE(cuda::std::get<0>(replaced_one_option.options).priority == 3); [[maybe_unused]] auto combined_with_extra_option = combined.combine(cuda::make_config(cuda::cooperative_launch())); diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu index 584aa4c2d1e..232df2f0a96 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu @@ -49,7 +49,7 @@ struct functor_taking_config template __device__ void operator()(Config config, int grid_size) { - static_assert(config.dims.static_count(cuda::thread, cuda::block) == BlockSize); + static_assert(config.dims.static_count(cuda::gpu_thread, cuda::block) == BlockSize); CCCLRT_REQUIRE_DEVICE(config.dims.count(cuda::block, cuda::grid) == grid_size); kernel_run_proof = true; } @@ -220,7 +220,7 @@ void launch_smoke_test(cudaStream_t dst) { cuda::launch(dst, cuda::block_dims<256>() & cuda::grid_dims(1), [] __device__(auto config) { - if (config.dims.rank(cuda::thread, cuda::block) == 0) { + if (config.dims.rank(cuda::gpu_thread, cuda::block) == 0) { printf("Hello from the GPU\n"); kernel_run_proof = true; } @@ -307,7 +307,7 @@ void test_default_config() { auto block = cuda::block_dims<256>; auto verify_lambda = [] __device__(auto config) { - static_assert(config.dims.count(cuda::thread, cuda::block) == 256); + static_assert(config.dims.count(cuda::gpu_thread, cuda::block) == 256); CCCLRT_REQUIRE(config.dims.count(cuda::block) == 4); cooperative_groups::this_grid().sync(); }; diff --git a/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu b/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu index 6fcdf4ea4d0..d00c5f49c31 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu +++ b/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -68,8 +69,8 @@ struct add_kernel template __device__ void operator()(cuda::std::span a, cuda::std::span b) { - for (int i = cuda::hierarchy::rank(cuda::thread, cuda::grid); i < a.size(); - i += cuda::hierarchy::count(cuda::thread, cuda::grid)) + for (int i = cuda::hierarchy::rank(cuda::gpu_thread, 
cuda::grid); i < a.size(); + i += cuda::hierarchy::count(cuda::gpu_thread, cuda::grid)) { a[i] += b[i]; } diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/hierarchy_queries.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/hierarchy_queries.pass.cpp new file mode 100644 index 00000000000..b0712e18cd8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/hierarchy_queries.pass.cpp @@ -0,0 +1,240 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include "hierarchy_queries.h" + +#include +#include +#include +#include +#include + +template +__device__ void test_block( + const Hierarchy& hier, const GridExts& grid_exts, const ClusterExts& cluster_exts, const BlockExts& block_exts) +{ + // 1. Test cuda::block.dims(x, hier) + if constexpr (cuda::has_level_v) + { + uint3 exp{1, 1, 1}; + NV_IF_TARGET(NV_PROVIDES_SM_90, (exp = __clusterDim();)) + test_dims(exp, cuda::block, cuda::cluster, hier); + } + test_dims(gridDim, cuda::block, cuda::grid, hier); + + // 2. Test cuda::block.static_dims(x, hier) + if constexpr (cuda::has_level_v) + { + const ulonglong3 exp{ + ClusterExts::static_extent(0), + ClusterExts::static_extent(1), + ClusterExts::static_extent(2), + }; + test_static_dims(exp, cuda::block, cuda::cluster, hier); + } + { + const ulonglong3 exp{ + mul_static_extents(GridExts::static_extent(0), ClusterExts::static_extent(0)), + mul_static_extents(GridExts::static_extent(1), ClusterExts::static_extent(1)), + mul_static_extents(GridExts::static_extent(2), ClusterExts::static_extent(2)), + }; + test_static_dims(exp, cuda::block, cuda::grid, hier); + } + + // 3. Test cuda::block.extents(x) + if constexpr (cuda::has_level_v) + { + uint3 dims{1, 1, 1}; + NV_IF_TARGET(NV_PROVIDES_SM_90, (dims = __clusterDim();)) + const cuda::std:: + extents + exp{dims.x, dims.y, dims.z}; + + test_extents(exp, cuda::block, cuda::cluster, hier); + } + { + const cuda::std::extents + exp{gridDim.x, gridDim.y, gridDim.z}; + test_extents(exp, cuda::block, cuda::grid, hier); + } + + // 4. Test cuda::block.count(x, hier) + if constexpr (cuda::has_level_v) + { + cuda::std::size_t exp = 1; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp *= __clusterDim().x; + exp *= __clusterDim().y; + exp *= __clusterDim().z; + })) + test_count(exp, cuda::block, cuda::cluster, hier); + } + test_count(cuda::std::size_t{gridDim.z} * gridDim.y * gridDim.x, cuda::block, cuda::grid, hier); + + // 5. test cuda::block.index(x, hier) + if constexpr (cuda::has_level_v) + { + uint3 exp{0, 0, 0}; + NV_IF_TARGET(NV_PROVIDES_SM_90, (exp = __clusterRelativeBlockIdx();)) + test_index(exp, cuda::block, cuda::cluster, hier); + } + test_index(blockIdx, cuda::block, cuda::grid, hier); + + // 6. 
Test cuda::block.rank(x, hier) + if constexpr (cuda::has_level_v) + { + cuda::std::size_t exp = 0; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp = ((__clusterRelativeBlockIdx().z * __clusterDim().y) + __clusterRelativeBlockIdx().y) + * __clusterDim().x + + __clusterRelativeBlockIdx().x; + })) + test_rank(exp, cuda::block, cuda::cluster, hier); + } + { + const cuda::std::size_t exp = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x; + test_rank(exp, cuda::block, cuda::grid, hier); + } +} + +__device__ void test_device() +{ + // todo: make hierarchy constructible on device + // test_thread(cuda::make_hierarchy(cuda::grid_dims(gridDim), cuda::block_dims(blockDim))); +} + +#if !_CCCL_COMPILER(NVRTC) +template +__global__ void test_kernel(Hierarchy hier, GridExts grid_exts, BlockExts block_exts) +{ + test_block(hier, grid_exts, cuda::std::extents{}, block_exts); +} + +template +__global__ void test_kernel(Hierarchy hier, GridExts grid_exts, ClusterExts cluster_exts, BlockExts block_exts) +{ + test_block(hier, grid_exts, cluster_exts, block_exts); +} + +template +void test_launch(GridExts grid_exts, BlockExts block_exts) +{ + const dim3 grid_dims{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + const dim3 block_dims{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + const cuda::std::dims<3, unsigned> grid_exts_dyn{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + const cuda::std::dims<3, unsigned> block_exts_dyn{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + // 1. Launch hierarchy with all static extents. + test_kernel<<>>( + cuda::make_hierarchy( + cuda::grid_dims(), + cuda::block_dims()), + grid_exts, + block_exts); + + // 2. Launch hierarchy with static grid extents and dynamic block extents. + test_kernel<<>>( + cuda::make_hierarchy( + cuda::grid_dims(), + cuda::block_dims(block_dims)), + grid_exts, + block_exts_dyn); + + // 3. Launch hierarchy with dynamic grid extents and static block extents. + test_kernel<<>>( + cuda::make_hierarchy( + cuda::grid_dims(grid_dims), + cuda::block_dims()), + grid_exts_dyn, + block_exts); + + // 4. Launch hierarchy with dynamic grid extents and dynamic block extents. 
+ test_kernel<<>>( + cuda::make_hierarchy(cuda::grid_dims(grid_dims), cuda::block_dims(block_dims)), grid_exts_dyn, block_exts_dyn); +} + +template +void test_launch(GridExts grid_exts, ClusterExts cluster_exts, BlockExts block_exts) +{ + const dim3 grid_dims{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + const dim3 cluster_dims{cluster_exts.extent(0), cluster_exts.extent(1), cluster_exts.extent(2)}; + const dim3 block_dims{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + cuda::std::dims<3, unsigned> grid_exts_dyn{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + cuda::std::dims<3, unsigned> cluster_exts_dyn{cluster_exts.extent(0), cluster_exts.extent(1), cluster_exts.extent(2)}; + cuda::std::dims<3, unsigned> block_exts_dyn{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + cudaLaunchAttribute attribute[1]{}; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = cluster_dims.x; + attribute[0].val.clusterDim.y = cluster_dims.y; + attribute[0].val.clusterDim.z = cluster_dims.z; + + cudaLaunchConfig_t config{}; + config.gridDim = dim3{grid_dims.x * cluster_dims.x, grid_dims.y * cluster_dims.y, grid_dims.z * cluster_dims.z}; + config.blockDim = block_dims; + config.attrs = attribute; + config.numAttrs = 1; + + // 1. Launch hierarchy with all static extents. + { + auto hier = cuda::make_hierarchy( + cuda::grid_dims(), + cuda::cluster_dims(), + cuda::block_dims()); + auto kernel = test_kernel; + assert(cudaLaunchKernelEx(&config, kernel, hier, grid_exts, cluster_exts, block_exts) == cudaSuccess); + } + + // 2. Launch hierarchy with all dynamic extents. + { + auto hier = + cuda::make_hierarchy(cuda::grid_dims(grid_dims), cuda::cluster_dims(cluster_dims), cuda::block_dims(block_dims)); + auto kernel = + test_kernel; + assert(cudaLaunchKernelEx(&config, kernel, hier, grid_exts_dyn, cluster_exts_dyn, block_exts_dyn) == cudaSuccess); + } +} + +void test() +{ + int cc_major{}; + assert(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess); + + // thread block clusters require compute capability at least 9.0 + const bool enable_clusters = cc_major >= 9; + + test_launch(cuda::std::extents{}, cuda::std::extents{}); + test_launch(cuda::std::extents{}, cuda::std::extents{}); + test_launch(cuda::std::extents{}, cuda::std::extents{}); + test_launch(cuda::std::extents{}, cuda::std::extents{}); + + if (enable_clusters) + { + test_launch(cuda::std::extents{}, + cuda::std::extents{}, + cuda::std::extents{}); + } + + assert(cudaDeviceSynchronize() == cudaSuccess); +} +#endif // !_CCCL_COMPILER(NVRTC) + +int main(int, char**) +{ + NV_IF_ELSE_TARGET(NV_IS_HOST, (test();), (test_device();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/hierarchy_query_signatures.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/hierarchy_query_signatures.compile.pass.cpp new file mode 100644 index 00000000000..55f3c8b803e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/hierarchy_query_signatures.compile.pass.cpp @@ -0,0 +1,128 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include + +template +__device__ void test_query_signatures(const Level& level, const Hierarchy& hier) +{ + // 1. Test cuda::block_level::dims(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::block_level::dims(level, hier))>); + static_assert(noexcept(cuda::block_level::dims(level, hier))); + + // 2. Test cuda::block_level::static_dims(x, hier) signature. + static_assert(cuda::std::is_same_v, + decltype(cuda::block_level::static_dims(level, hier))>); + static_assert(noexcept(cuda::block_level::static_dims(level, hier))); + + // 3. Test cuda::block_level::extents(x, hier) signature. + using ExtentsResult = decltype(cuda::block_level::extents(level, hier)); + static_assert(cuda::std::__is_cuda_std_extents_v); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::block_level::extents(level, hier))); + + // 4. Test cuda::block_level::count(x, hier) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::block_level::count(level, hier))); + + // 5. Test cuda::block_level::index(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::block_level::index(level, hier))>); + static_assert(noexcept(cuda::block_level::index(level, hier))); + + // 6. Test cuda::block_level::rank(x, hier) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::block_level::rank(level, hier))); +} + +template +__device__ void test_query_as_signatures(const Level& level, const Hierarchy& hier) +{ + // 1. Test cuda::block_level::dims_as(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::block_level::dims_as(level, hier))>); + static_assert(noexcept(cuda::block_level::dims_as(level, hier))); + + // 2. Test cuda::block_level::extents_as(x, hier) signature. + using ExtentsResult = decltype(cuda::block_level::extents_as(level, hier)); + static_assert(cuda::std::__is_cuda_std_extents_v); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::block_level::extents_as(level, hier))); + + // 3. Test cuda::block_level::count_as(x, hier) signature. + static_assert(cuda::std::is_same_v(level, hier))>); + static_assert(noexcept(cuda::block_level::count_as(level, hier))); + + // 4. Test cuda::block_level::index_as(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::block_level::index_as(level, hier))>); + static_assert(noexcept(cuda::block_level::index_as(level, hier))); + + // 5. Test cuda::block_level::rank_as(x, hier) signature. + static_assert(cuda::std::is_same_v(level, hier))>); + static_assert(noexcept(cuda::block_level::rank_as(level, hier))); +} + +template +__device__ void test(const InLevel& in_level, const Hierarchy& hier) +{ + test_query_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); +} + +template +__device__ void test(const Hierarchy& hier) +{ + if constexpr (cuda::has_level_v) + { + test(cuda::cluster, hier); + } + test(cuda::grid, hier); +} + +template +__global__ void test_kernel(Hierarchy hier) +{ + test(hier); +} + +#define TEST_KERNEL_INSTANTIATE(...) 
\ + template __global__ void test_kernel( \ + decltype(cuda::make_hierarchy(__VA_ARGS__))) + +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::block_dims(dim3{})); + +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims<1>(), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims<1>(), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims(dim3{}), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims(dim3{}), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims<1>(), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims<1>(), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims(dim3{}), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims(dim3{}), cuda::block_dims(dim3{})); + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/native_hierarchy_queries.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/native_hierarchy_queries.pass.cpp new file mode 100644 index 00000000000..270af7d4aba --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/native_hierarchy_queries.pass.cpp @@ -0,0 +1,125 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include +#include + +#include "hierarchy_queries.h" + +__device__ void test_block() +{ + constexpr cuda::std::size_t dext = cuda::std::dynamic_extent; + + // 1. Test cuda::block.dims(x) + { + uint3 exp{1, 1, 1}; + NV_IF_TARGET(NV_PROVIDES_SM_90, (exp = __clusterDim();)) + test_dims(exp, cuda::block, cuda::cluster); + } + test_dims(gridDim, cuda::block, cuda::grid); + + // 2. Test cuda::block.static_dims(x) + test_static_dims(ulonglong3{dext, dext, dext}, cuda::block, cuda::cluster); + test_static_dims(ulonglong3{dext, dext, dext}, cuda::block, cuda::grid); + + // 3. Test cuda::block.extents(x) + { + uint3 exp{1, 1, 1}; + NV_IF_TARGET(NV_PROVIDES_SM_90, (exp = __clusterDim();)) + test_extents(cuda::std::dims<3, unsigned>{exp.x, exp.y, exp.z}, cuda::block, cuda::cluster); + } + test_extents(cuda::std::dims<3, unsigned>{gridDim.x, gridDim.y, gridDim.z}, cuda::block, cuda::grid); + + // 4. Test cuda::block.count(x) + { + cuda::std::size_t exp = 1; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp *= __clusterDim().x; + exp *= __clusterDim().y; + exp *= __clusterDim().z; + })) + test_count(exp, cuda::block, cuda::cluster); + } + test_count(cuda::std::size_t{gridDim.z} * gridDim.y * gridDim.x, cuda::block, cuda::grid); + + // 5. 
test cuda::block.index(x) + { + uint3 exp{0, 0, 0}; + NV_IF_TARGET(NV_PROVIDES_SM_90, (exp = __clusterRelativeBlockIdx();)) + test_index(exp, cuda::block, cuda::cluster); + } + test_index(blockIdx, cuda::block, cuda::grid); + + // 6. Test cuda::block.rank(x) + { + cuda::std::size_t exp = 0; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp = ((__clusterRelativeBlockIdx().z * __clusterDim().y) + __clusterRelativeBlockIdx().y) + * __clusterDim().x + + __clusterRelativeBlockIdx().x; + })) + test_rank(exp, cuda::block, cuda::cluster); + } + { + const cuda::std::size_t exp = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x; + test_rank(exp, cuda::block, cuda::grid); + } +} + +#if !_CCCL_COMPILER(NVRTC) +__global__ void test_kernel() +{ + test_block(); +} + +void test() +{ + int cc_major{}; + assert(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess); + + // thread block clusters require compute capability at least 9.0 + const bool enable_clusters = cc_major >= 9; + + test_kernel<<<1, 128>>>(); + test_kernel<<<128, 1>>>(); + test_kernel<<>>(); + test_kernel<<>>(); + if (enable_clusters) + { + cudaLaunchAttribute attribute[1]{}; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = 4; + attribute[0].val.clusterDim.y = 2; + attribute[0].val.clusterDim.z = 1; + + cudaLaunchConfig_t config{}; + config.gridDim = {12, 10, 3}; + config.blockDim = {2, 8, 4}; + config.attrs = attribute; + config.numAttrs = 1; + + void* pargs[1]{}; + assert(cudaLaunchKernelExC(&config, (const void*) test_kernel, pargs) == cudaSuccess); + } + + assert(cudaDeviceSynchronize() == cudaSuccess); +} +#endif // !_CCCL_COMPILER(NVRTC) + +int main(int, char**) +{ + NV_IF_ELSE_TARGET(NV_IS_HOST, (test();), (test_block();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/native_hierarchy_query_signatures.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/native_hierarchy_query_signatures.compile.pass.cpp new file mode 100644 index 00000000000..afd683f50d2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/block_level/native_hierarchy_query_signatures.compile.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include + +template +__device__ void test_query_signatures(const Level& level) +{ + // 1. Test cuda::block_level::dims(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::block_level::dims(level))>); + static_assert(noexcept(cuda::block_level::dims(level))); + + // 2. Test cuda::block_level::static_dims(x) signature. + static_assert(cuda::std::is_same_v, + decltype(cuda::block_level::static_dims(level))>); + static_assert(noexcept(cuda::block_level::static_dims(level))); + + // 3. Test cuda::block_level::extents(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::block_level::extents(level))>); + static_assert(noexcept(cuda::block_level::extents(level))); + + // 4. Test cuda::block_level::count(x) signature. 
+ static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::block_level::count(level))); + + // 5. Test cuda::block_level::index(x) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::block_level::index(level))>); + static_assert(noexcept(cuda::block_level::index(level))); + + // 6. Test cuda::block_level::rank(x) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::block_level::rank(level))); +} + +template +__device__ void test_query_as_signatures(const Level& level) +{ + // 1. Test cuda::block_level::dims(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::block_level::dims_as(level))>); + static_assert(noexcept(cuda::block_level::dims_as(level))); + + // 2. Test cuda::block_level::extents(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::block_level::extents_as(level))>); + static_assert(noexcept(cuda::block_level::extents_as(level))); + + // 3. Test cuda::block_level::count(x) signature. + static_assert(cuda::std::is_same_v(level))>); + static_assert(noexcept(cuda::block_level::count_as(level))); + + // 4. Test cuda::block_level::index(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::block_level::index_as(level))>); + static_assert(noexcept(cuda::block_level::index_as(level))); + + // 5. Test cuda::block_level::rank(x) signature. + static_assert(cuda::std::is_same_v(level))>); + static_assert(noexcept(cuda::block_level::rank_as(level))); +} + +template +__device__ void test(const InLevel& in_level) +{ + test_query_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); +} + +__device__ void test() +{ + test(cuda::cluster); + test(cuda::grid); +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/hierarchy_queries.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/hierarchy_queries.pass.cpp new file mode 100644 index 00000000000..ddb0d369589 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/hierarchy_queries.pass.cpp @@ -0,0 +1,185 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include "hierarchy_queries.h" + +#include +#include +#include +#include +#include + +template +__device__ void test_cluster(const Hierarchy& hier, const GridExts& grid_exts, const ClusterExts&, const BlockExts&) +{ + uint3 dims = gridDim; + NV_IF_TARGET(NV_PROVIDES_SM_90, (dims = __clusterGridDimInClusters();)) + + uint3 index = blockIdx; + NV_IF_TARGET(NV_PROVIDES_SM_90, (index = __clusterIdx();)) + + // 1. Test cuda::cluster.dims(x, hier) + test_dims(dims, cuda::cluster, cuda::grid, hier); + + // 2. Test cuda::cluster.static_dims(x, hier) + { + const ulonglong3 exp{ + GridExts::static_extent(0), + GridExts::static_extent(1), + GridExts::static_extent(2), + }; + test_static_dims(exp, cuda::cluster, cuda::grid, hier); + } + + // 3. 
Test cuda::cluster.extents(x) + { + const cuda::std::extents + exp{dims.x, dims.y, dims.z}; + test_extents(exp, cuda::cluster, cuda::grid, hier); + } + + // 4. Test cuda::cluster.count(x, hier) + test_count(cuda::std::size_t{dims.z} * dims.y * dims.x, cuda::cluster, cuda::grid, hier); + + // 5. test cuda::cluster.index(x, hier) + test_index(index, cuda::cluster, cuda::grid, hier); + + // 6. Test cuda::cluster.rank(x, hier) + { + const cuda::std::size_t exp = (index.z * dims.y + index.y) * dims.x + index.x; + test_rank(exp, cuda::cluster, cuda::grid, hier); + } +} + +__device__ void test_device() +{ + // todo: make hierarchy constructible on device + // test_thread(cuda::make_hierarchy(cuda::grid_dims(gridDim), cuda::cluster_dims(clusterDim))); +} + +#if !_CCCL_COMPILER(NVRTC) +template +__global__ void test_kernel(Hierarchy hier, GridExts grid_exts, BlockExts block_exts) +{ + test_cluster(hier, grid_exts, cuda::std::extents{}, block_exts); +} + +template +__global__ void test_kernel(Hierarchy hier, GridExts grid_exts, ClusterExts cluster_exts, BlockExts block_exts) +{ + test_cluster(hier, grid_exts, cluster_exts, block_exts); +} +template +void test_launch(GridExts grid_exts, BlockExts block_exts) +{ + const dim3 grid_dims{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + const dim3 block_dims{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + const cuda::std::dims<3, unsigned> grid_exts_dyn{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + const cuda::std::dims<3, unsigned> block_exts_dyn{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + // 1. Launch hierarchy with all static extents. + test_kernel<<>>( + cuda::make_hierarchy( + cuda::grid_dims(), + cuda::block_dims()), + grid_exts, + block_exts); + + // 2. Launch hierarchy with static grid extents and dynamic block extents. + test_kernel<<>>( + cuda::make_hierarchy( + cuda::grid_dims(), + cuda::block_dims(block_dims)), + grid_exts, + block_exts_dyn); + + // 3. Launch hierarchy with dynamic grid extents and static block extents. + test_kernel<<>>( + cuda::make_hierarchy( + cuda::grid_dims(grid_dims), + cuda::block_dims()), + grid_exts_dyn, + block_exts); + + // 4. Launch hierarchy with dynamic grid extents and dynamic block extents. 
+ test_kernel<<>>( + cuda::make_hierarchy(cuda::grid_dims(grid_dims), cuda::block_dims(block_dims)), grid_exts_dyn, block_exts_dyn); +} + +template +void test_launch(GridExts grid_exts, ClusterExts cluster_exts, BlockExts block_exts) +{ + const dim3 grid_dims{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + const dim3 cluster_dims{cluster_exts.extent(0), cluster_exts.extent(1), cluster_exts.extent(2)}; + const dim3 block_dims{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + cuda::std::dims<3, unsigned> grid_exts_dyn{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + cuda::std::dims<3, unsigned> cluster_exts_dyn{cluster_exts.extent(0), cluster_exts.extent(1), cluster_exts.extent(2)}; + cuda::std::dims<3, unsigned> block_exts_dyn{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + cudaLaunchAttribute attribute[1]{}; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = cluster_dims.x; + attribute[0].val.clusterDim.y = cluster_dims.y; + attribute[0].val.clusterDim.z = cluster_dims.z; + + cudaLaunchConfig_t config{}; + config.gridDim = dim3{grid_dims.x * cluster_dims.x, grid_dims.y * cluster_dims.y, grid_dims.z * cluster_dims.z}; + config.blockDim = block_dims; + config.attrs = attribute; + config.numAttrs = 1; + + // 1. Launch hierarchy with all static extents. + { + auto hier = cuda::make_hierarchy( + cuda::grid_dims(), + cuda::cluster_dims(), + cuda::block_dims()); + auto kernel = test_kernel; + assert(cudaLaunchKernelEx(&config, kernel, hier, grid_exts, cluster_exts, block_exts) == cudaSuccess); + } + + // 2. Launch hierarchy with all dynamic extents. + { + auto hier = + cuda::make_hierarchy(cuda::grid_dims(grid_dims), cuda::cluster_dims(cluster_dims), cuda::block_dims(block_dims)); + auto kernel = + test_kernel; + assert(cudaLaunchKernelEx(&config, kernel, hier, grid_exts_dyn, cluster_exts_dyn, block_exts_dyn) == cudaSuccess); + } +} + +void test() +{ + int cc_major{}; + assert(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess); + + // thread block clusters require compute capability at least 9.0 + const bool enable_clusters = cc_major >= 9; + + if (enable_clusters) + { + test_launch(cuda::std::extents{}, + cuda::std::extents{}, + cuda::std::extents{}); + } + + assert(cudaDeviceSynchronize() == cudaSuccess); +} +#endif // !_CCCL_COMPILER(NVRTC) + +int main(int, char**) +{ + NV_IF_ELSE_TARGET(NV_IS_HOST, (test();), (test_device();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/hierarchy_query_signatures.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/hierarchy_query_signatures.compile.pass.cpp new file mode 100644 index 00000000000..4559783539f --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/hierarchy_query_signatures.compile.pass.cpp @@ -0,0 +1,119 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include + +template +__device__ void test_query_signatures(const Level& level, const Hierarchy& hier) +{ + // 1. Test cuda::cluster_level::dims(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::cluster_level::dims(level, hier))>); + static_assert(noexcept(cuda::cluster_level::dims(level, hier))); + + // 2. Test cuda::cluster_level::static_dims(x, hier) signature. + static_assert(cuda::std::is_same_v, + decltype(cuda::cluster_level::static_dims(level, hier))>); + static_assert(noexcept(cuda::cluster_level::static_dims(level, hier))); + + // 3. Test cuda::cluster_level::extents(x, hier) signature. + using ExtentsResult = decltype(cuda::cluster_level::extents(level, hier)); + static_assert(cuda::std::__is_cuda_std_extents_v); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::cluster_level::extents(level, hier))); + + // 4. Test cuda::cluster_level::count(x, hier) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::cluster_level::count(level, hier))); + + // 5. Test cuda::cluster_level::index(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::cluster_level::index(level, hier))>); + static_assert(noexcept(cuda::cluster_level::index(level, hier))); + + // 6. Test cuda::cluster_level::rank(x, hier) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::cluster_level::rank(level, hier))); +} + +template +__device__ void test_query_as_signatures(const Level& level, const Hierarchy& hier) +{ + // 1. Test cuda::cluster_level::dims_as(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::cluster_level::dims_as(level, hier))>); + static_assert(noexcept(cuda::cluster_level::dims_as(level, hier))); + + // 2. Test cuda::cluster_level::extents_as(x, hier) signature. + using ExtentsResult = decltype(cuda::cluster_level::extents_as(level, hier)); + static_assert(cuda::std::__is_cuda_std_extents_v); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::cluster_level::extents_as(level, hier))); + + // 3. Test cuda::cluster_level::count_as(x, hier) signature. + static_assert(cuda::std::is_same_v(level, hier))>); + static_assert(noexcept(cuda::cluster_level::count_as(level, hier))); + + // 4. Test cuda::cluster_level::index_as(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::cluster_level::index_as(level, hier))>); + static_assert(noexcept(cuda::cluster_level::index_as(level, hier))); + + // 5. Test cuda::cluster_level::rank_as(x, hier) signature. + static_assert(cuda::std::is_same_v(level, hier))>); + static_assert(noexcept(cuda::cluster_level::rank_as(level, hier))); +} + +template +__device__ void test(const InLevel& in_level, const Hierarchy& hier) +{ + test_query_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); +} + +template +__device__ void test(const Hierarchy& hier) +{ + test(cuda::grid, hier); +} + +template +__global__ void test_kernel(Hierarchy hier) +{ + test(hier); +} + +#define TEST_KERNEL_INSTANTIATE(...) 
\ + template __global__ void test_kernel( \ + decltype(cuda::make_hierarchy(__VA_ARGS__))) + +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims<1>(), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims<1>(), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims(dim3{}), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims(dim3{}), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims<1>(), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims<1>(), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims(dim3{}), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims(dim3{}), cuda::block_dims(dim3{})); + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/native_hierarchy_queries.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/native_hierarchy_queries.pass.cpp new file mode 100644 index 00000000000..4e58b131fa2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/native_hierarchy_queries.pass.cpp @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include +#include + +#include "hierarchy_queries.h" + +__device__ void test_cluster() +{ + constexpr cuda::std::size_t dext = cuda::std::dynamic_extent; + + uint3 dims = gridDim; + NV_IF_TARGET(NV_PROVIDES_SM_90, (dims = __clusterGridDimInClusters();)) + + uint3 index = blockIdx; + NV_IF_TARGET(NV_PROVIDES_SM_90, (index = __clusterIdx();)) + + // 1. Test cuda::cluster.dims(x) + test_dims(dims, cuda::cluster, cuda::grid); + + // 2. Test cuda::cluster.static_dims(x) + test_static_dims(ulonglong3{dext, dext, dext}, cuda::cluster, cuda::grid); + + // 3. Test cuda::cluster.extents(x) + test_extents(cuda::std::dims<3, unsigned>{dims.x, dims.y, dims.z}, cuda::cluster, cuda::grid); + + // 4. Test cuda::cluster.count(x) + test_count(cuda::std::size_t{dims.z} * dims.y * dims.x, cuda::cluster, cuda::grid); + + // 5. test cuda::cluster.index(x) + test_index(index, cuda::cluster, cuda::grid); + + // 6. 
Test cuda::cluster.rank(x) + { + const cuda::std::size_t exp = (index.z * dims.y + index.y) * dims.x + index.x; + test_rank(exp, cuda::cluster, cuda::grid); + } +} + +#if !_CCCL_COMPILER(NVRTC) +__global__ void test_kernel() +{ + test_cluster(); +} + +void test() +{ + int cc_major{}; + assert(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess); + + // thread block clusters require compute capability at least 9.0 + const bool enable_clusters = cc_major >= 9; + + test_kernel<<<1, 128>>>(); + test_kernel<<<128, 1>>>(); + test_kernel<<>>(); + test_kernel<<>>(); + if (enable_clusters) + { + cudaLaunchAttribute attribute[1]{}; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = 4; + attribute[0].val.clusterDim.y = 2; + attribute[0].val.clusterDim.z = 1; + + cudaLaunchConfig_t config{}; + config.gridDim = {12, 10, 3}; + config.blockDim = {2, 8, 4}; + config.attrs = attribute; + config.numAttrs = 1; + + void* pargs[1]{}; + assert(cudaLaunchKernelExC(&config, (const void*) test_kernel, pargs) == cudaSuccess); + } + + assert(cudaDeviceSynchronize() == cudaSuccess); +} +#endif // !_CCCL_COMPILER(NVRTC) + +int main(int, char**) +{ + NV_IF_ELSE_TARGET(NV_IS_HOST, (test();), (test_cluster();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/native_hierarchy_query_signatures.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/native_hierarchy_query_signatures.compile.pass.cpp new file mode 100644 index 00000000000..64c0905c8d8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/cluster_level/native_hierarchy_query_signatures.compile.pass.cpp @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include + +template +__device__ void test_query_signatures(const Level& level) +{ + // 1. Test cuda::cluster_level::dims(x) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::cluster_level::dims(level))>); + static_assert(noexcept(cuda::cluster_level::dims(level))); + + // 2. Test cuda::cluster_level::static_dims(x) signature. + static_assert(cuda::std::is_same_v, + decltype(cuda::cluster_level::static_dims(level))>); + static_assert(noexcept(cuda::cluster_level::static_dims(level))); + + // 3. Test cuda::cluster_level::extents(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::cluster_level::extents(level))>); + static_assert(noexcept(cuda::cluster_level::extents(level))); + + // 4. Test cuda::cluster_level::count(x) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::cluster_level::count(level))); + + // 5. Test cuda::cluster_level::index(x) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::cluster_level::index(level))>); + static_assert(noexcept(cuda::cluster_level::index(level))); + + // 6. Test cuda::cluster_level::rank(x) signature. 
+ static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::cluster_level::rank(level))); +} + +template +__device__ void test_query_as_signatures(const Level& level) +{ + // 1. Test cuda::cluster_level::dims(x) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::cluster_level::dims_as(level))>); + static_assert(noexcept(cuda::cluster_level::dims_as(level))); + + // 2. Test cuda::cluster_level::extents(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::cluster_level::extents_as(level))>); + static_assert(noexcept(cuda::cluster_level::extents_as(level))); + + // 3. Test cuda::cluster_level::count(x) signature. + static_assert(cuda::std::is_same_v(level))>); + static_assert(noexcept(cuda::cluster_level::count_as(level))); + + // 4. Test cuda::cluster_level::index(x) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::cluster_level::index_as(level))>); + static_assert(noexcept(cuda::cluster_level::index_as(level))); + + // 5. Test cuda::cluster_level::rank(x) signature. + static_assert(cuda::std::is_same_v(level))>); + static_assert(noexcept(cuda::cluster_level::rank_as(level))); +} + +template +__device__ void test(const InLevel& in_level) +{ + test_query_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); +} + +__device__ void test() +{ + test(cuda::grid); +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/hierarchy_objects.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/hierarchy_objects.compile.pass.cpp new file mode 100644 index 00000000000..c3d7fc4f917 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/hierarchy_objects.compile.pass.cpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include + +static_assert(cuda::std::is_same_v>); +static_assert(cuda::std::is_same_v>); +static_assert(cuda::std::is_same_v>); +static_assert(cuda::std::is_same_v>); + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/hierarchy_query_result.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/hierarchy_query_result.pass.cpp new file mode 100644 index 00000000000..e9cc316462e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/hierarchy_query_result.pass.cpp @@ -0,0 +1,113 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include + +template +__host__ __device__ constexpr void test() +{ + using HQR = cuda::hierarchy_query_result; + using Vec = cuda::__vector_type_t; + constexpr auto has_vec = !cuda::std::is_same_v; + + // 1. Test value_type + static_assert(cuda::std::is_same_v); + + // 2. Test constructors + static_assert(cuda::std::is_trivially_default_constructible_v); + static_assert(cuda::std::is_trivially_copyable_v); + + // 3. Test public members + { + HQR v{T{0}, T{1}, T{2}}; + assert(v.x == static_cast(0)); + assert(v.y == static_cast(1)); + assert(v.z == static_cast(2)); + } + + // 4. Test operator[] const + static_assert(cuda::std::is_same_v()[cuda::std::size_t{}])>); + static_assert(noexcept(cuda::std::declval()[cuda::std::size_t{}])); + { + const HQR v{T{0}, T{1}, T{2}}; + for (cuda::std::size_t i = 0; i < 3; ++i) + { + assert(v[i] == static_cast(i)); + } + } + + // 5. Test operator[] + static_assert(cuda::std::is_same_v()[cuda::std::size_t{}])>); + static_assert(noexcept(cuda::std::declval()[cuda::std::size_t{}])); + { + HQR v{T{0}, T{1}, T{2}}; + for (cuda::std::size_t i = 0; i < 3; ++i) + { + assert(v[i] == static_cast(i)); + } + } + + // 6. Test operator vector-type + static_assert(!has_vec || cuda::std::is_nothrow_convertible_v); + if constexpr (has_vec) + { + const HQR v{T{0}, T{1}, T{2}}; + Vec vec = v; + assert(vec.x == v.x); + assert(vec.y == v.y); + assert(vec.z == v.z); + } + + // 7. Test dim3 can be constructed from the query result + static_assert(!cuda::std::is_same_v || cuda::std::is_constructible_v); + if constexpr (cuda::std::is_same_v) + { + const HQR v{T{0}, T{1}, T{2}}; + dim3 vec{v}; + assert(vec.x == v.x); + assert(vec.y == v.y); + assert(vec.z == v.z); + } +} + +__host__ __device__ constexpr bool test() +{ + test(); + test(); + test(); + test(); + test(); +#if _CCCL_HAS_INT128() + test<__int128_t>(); +#endif // _CCCL_HAS_INT128(); + + test(); + test(); + test(); + test(); + test(); +#if _CCCL_HAS_INT128() + test<__uint128_t>(); +#endif // _CCCL_HAS_INT128(); + + return true; +} + +int main(int, char**) +{ + test(); + static_assert(test()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/hierarchy_queries.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/hierarchy_queries.pass.cpp new file mode 100644 index 00000000000..220d7107274 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/hierarchy_queries.pass.cpp @@ -0,0 +1,285 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include "hierarchy_queries.h" + +#include +#include +#include +#include +#include + +template +__device__ void test_thread( + const Hierarchy& hier, const GridExts& grid_exts, const ClusterExts& cluster_exts, const BlockExts& block_exts) +{ + // 1. 
Test cuda::gpu_thread.dims(x, hier) + test_dims(blockDim, cuda::gpu_thread, cuda::block, hier); + if constexpr (cuda::has_level_v) + { + uint3 exp = blockDim; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x *= __clusterDim().x; + exp.y *= __clusterDim().y; + exp.z *= __clusterDim().z; + })) + test_dims(exp, cuda::gpu_thread, cuda::cluster, hier); + } + { + const uint3 exp{blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z}; + test_dims(exp, cuda::gpu_thread, cuda::grid, hier); + } + + // 2. Test cuda::gpu_thread.static_dims(x, hier) + test_static_dims(ulonglong3{BlockExts::static_extent(0), BlockExts::static_extent(1), BlockExts::static_extent(2)}, + cuda::gpu_thread, + cuda::block, + hier); + if constexpr (cuda::has_level_v) + { + const ulonglong3 exp{ + mul_static_extents(ClusterExts::static_extent(0), BlockExts::static_extent(0)), + mul_static_extents(ClusterExts::static_extent(1), BlockExts::static_extent(1)), + mul_static_extents(ClusterExts::static_extent(2), BlockExts::static_extent(2)), + }; + test_static_dims(exp, cuda::gpu_thread, cuda::cluster, hier); + } + { + const ulonglong3 exp{ + mul_static_extents(GridExts::static_extent(0), ClusterExts::static_extent(0), BlockExts::static_extent(0)), + mul_static_extents(GridExts::static_extent(1), ClusterExts::static_extent(1), BlockExts::static_extent(1)), + mul_static_extents(GridExts::static_extent(2), ClusterExts::static_extent(2), BlockExts::static_extent(2)), + }; + test_static_dims(exp, cuda::gpu_thread, cuda::grid, hier); + } + + // 3. Test cuda::gpu_thread.extents(x) + test_extents(block_exts, cuda::gpu_thread, cuda::block, hier); + if constexpr (cuda::has_level_v) + { + uint3 dims = blockDim; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + dims.x *= __clusterDim().x; + dims.y *= __clusterDim().y; + dims.z *= __clusterDim().z; + })) + + const cuda::std::extents + exp{dims.x, dims.y, dims.z}; + + test_extents(exp, cuda::gpu_thread, cuda::cluster, hier); + } + { + const cuda::std::extents< + unsigned, + mul_static_extents(GridExts::static_extent(0), ClusterExts::static_extent(0), BlockExts::static_extent(0)), + mul_static_extents(GridExts::static_extent(1), ClusterExts::static_extent(1), BlockExts::static_extent(1)), + mul_static_extents(GridExts::static_extent(2), ClusterExts::static_extent(2), BlockExts::static_extent(2))> + exp{blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z}; + test_extents(exp, cuda::gpu_thread, cuda::grid, hier); + } + + // 4. Test cuda::gpu_thread.count(x, hier) + test_count(cuda::std::size_t{blockDim.z} * blockDim.y * blockDim.x, cuda::gpu_thread, cuda::block, hier); + if constexpr (cuda::has_level_v) + { + uint3 exp = blockDim; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x *= __clusterDim().x; + exp.y *= __clusterDim().y; + exp.z *= __clusterDim().z; + })) + test_count(cuda::std::size_t{exp.z} * exp.y * exp.x, cuda::gpu_thread, cuda::cluster, hier); + } + { + const uint3 exp{blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z}; + test_count(cuda::std::size_t{exp.z} * exp.y * exp.x, cuda::gpu_thread, cuda::grid, hier); + } + + // 5. 
test cuda::gpu_thread.index(x, hier) + test_index(threadIdx, cuda::gpu_thread, cuda::block, hier); + if constexpr (cuda::has_level_v) + { + uint3 exp = threadIdx; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x += blockDim.x * __clusterRelativeBlockIdx().x; + exp.y += blockDim.y * __clusterRelativeBlockIdx().y; + exp.z += blockDim.z * __clusterRelativeBlockIdx().z; + })) + test_index(exp, cuda::gpu_thread, cuda::cluster, hier); + } + { + const uint3 exp{ + threadIdx.x + blockDim.x * blockIdx.x, + threadIdx.y + blockDim.y * blockIdx.y, + threadIdx.z + blockDim.z * blockIdx.z, + }; + test_index(exp, cuda::gpu_thread, cuda::grid, hier); + } + + // 6. Test cuda::gpu_thread.rank(x, hier) + test_rank((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x, cuda::gpu_thread, cuda::block, hier); + if constexpr (cuda::has_level_v) + { + cuda::std::size_t exp = 0; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + ({ + exp = (((__clusterRelativeBlockIdx().z * __clusterDim().y * __clusterDim().x) + + __clusterRelativeBlockIdx().y * __clusterDim().x) + + __clusterRelativeBlockIdx().x) + * (blockDim.x * blockDim.y * blockDim.z) + + ((threadIdx.z * blockDim.y * blockDim.x) + threadIdx.y * blockDim.x) + threadIdx.x; + }), + ({ exp = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x; })) + test_rank(exp, cuda::gpu_thread, cuda::cluster, hier); + } + { + const cuda::std::size_t exp = + (blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x) + * (blockDim.x * blockDim.y * blockDim.z) + + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x; + test_rank(exp, cuda::gpu_thread, cuda::grid, hier); + } +} + +__device__ void test_device() +{ + // todo: make hierarchy constructible on device + // test_thread(cuda::make_hierarchy(cuda::grid_dims(gridDim), cuda::block_dims(blockDim))); +} + +#if !_CCCL_COMPILER(NVRTC) +template +__global__ void test_kernel(Hierarchy hier, GridExts grid_exts, BlockExts block_exts) +{ + test_thread(hier, grid_exts, cuda::std::extents{}, block_exts); +} + +template +__global__ void test_kernel(Hierarchy hier, GridExts grid_exts, ClusterExts cluster_exts, BlockExts block_exts) +{ + test_thread(hier, grid_exts, cluster_exts, block_exts); +} + +template +void test_launch(GridExts grid_exts, BlockExts block_exts) +{ + const dim3 grid_dims{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + const dim3 block_dims{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + const cuda::std::dims<3, unsigned> grid_exts_dyn{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + const cuda::std::dims<3, unsigned> block_exts_dyn{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + // 1. Launch hierarchy with all static extents. + test_kernel<<>>( + cuda::make_hierarchy( + cuda::grid_dims(), + cuda::block_dims()), + grid_exts, + block_exts); + + // 2. Launch hierarchy with static grid extents and dynamic block extents. + test_kernel<<>>( + cuda::make_hierarchy( + cuda::grid_dims(), + cuda::block_dims(block_dims)), + grid_exts, + block_exts_dyn); + + // 3. Launch hierarchy with dynamic grid extents and static block extents. + test_kernel<<>>( + cuda::make_hierarchy( + cuda::grid_dims(grid_dims), + cuda::block_dims()), + grid_exts_dyn, + block_exts); + + // 4. Launch hierarchy with dynamic grid extents and dynamic block extents. 
+ test_kernel<<>>( + cuda::make_hierarchy(cuda::grid_dims(grid_dims), cuda::block_dims(block_dims)), grid_exts_dyn, block_exts_dyn); +} + +template +void test_launch(GridExts grid_exts, ClusterExts cluster_exts, BlockExts block_exts) +{ + const dim3 grid_dims{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + const dim3 cluster_dims{cluster_exts.extent(0), cluster_exts.extent(1), cluster_exts.extent(2)}; + const dim3 block_dims{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + cuda::std::dims<3, unsigned> grid_exts_dyn{grid_exts.extent(0), grid_exts.extent(1), grid_exts.extent(2)}; + cuda::std::dims<3, unsigned> cluster_exts_dyn{cluster_exts.extent(0), cluster_exts.extent(1), cluster_exts.extent(2)}; + cuda::std::dims<3, unsigned> block_exts_dyn{block_exts.extent(0), block_exts.extent(1), block_exts.extent(2)}; + + cudaLaunchAttribute attribute[1]{}; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = cluster_dims.x; + attribute[0].val.clusterDim.y = cluster_dims.y; + attribute[0].val.clusterDim.z = cluster_dims.z; + + cudaLaunchConfig_t config{}; + config.gridDim = dim3{grid_dims.x * cluster_dims.x, grid_dims.y * cluster_dims.y, grid_dims.z * cluster_dims.z}; + config.blockDim = block_dims; + config.attrs = attribute; + config.numAttrs = 1; + + // 1. Launch hierarchy with all static extents. + { + auto hier = cuda::make_hierarchy( + cuda::grid_dims(), + cuda::cluster_dims(), + cuda::block_dims()); + auto kernel = test_kernel; + assert(cudaLaunchKernelEx(&config, kernel, hier, grid_exts, cluster_exts, block_exts) == cudaSuccess); + } + + // 2. Launch hierarchy with all dynamic extents. + { + auto hier = + cuda::make_hierarchy(cuda::grid_dims(grid_dims), cuda::cluster_dims(cluster_dims), cuda::block_dims(block_dims)); + auto kernel = + test_kernel; + assert(cudaLaunchKernelEx(&config, kernel, hier, grid_exts_dyn, cluster_exts_dyn, block_exts_dyn) == cudaSuccess); + } +} + +void test() +{ + int cc_major{}; + assert(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess); + + // thread block clusters require compute capability at least 9.0 + const bool enable_clusters = cc_major >= 9; + + test_launch(cuda::std::extents{}, cuda::std::extents{}); + test_launch(cuda::std::extents{}, cuda::std::extents{}); + test_launch(cuda::std::extents{}, cuda::std::extents{}); + test_launch(cuda::std::extents{}, cuda::std::extents{}); + + if (enable_clusters) + { + test_launch(cuda::std::extents{}, + cuda::std::extents{}, + cuda::std::extents{}); + } + + assert(cudaDeviceSynchronize() == cudaSuccess); +} +#endif // !_CCCL_COMPILER(NVRTC) + +int main(int, char**) +{ + NV_IF_ELSE_TARGET(NV_IS_HOST, (test();), (test_device();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/hierarchy_query_signatures.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/hierarchy_query_signatures.compile.pass.cpp new file mode 100644 index 00000000000..8c693158df3 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/hierarchy_query_signatures.compile.pass.cpp @@ -0,0 +1,129 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include + +template +__device__ void test_query_signatures(const Level& level, const Hierarchy& hier) +{ + // 1. Test cuda::thread_level::dims(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::thread_level::dims(level, hier))>); + static_assert(noexcept(cuda::thread_level::dims(level, hier))); + + // 2. Test cuda::thread_level::static_dims(x, hier) signature. + static_assert(cuda::std::is_same_v, + decltype(cuda::thread_level::static_dims(level, hier))>); + static_assert(noexcept(cuda::thread_level::static_dims(level, hier))); + + // 3. Test cuda::thread_level::extents(x, hier) signature. + using ExtentsResult = decltype(cuda::thread_level::extents(level, hier)); + static_assert(cuda::std::__is_cuda_std_extents_v); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::thread_level::extents(level, hier))); + + // 4. Test cuda::thread_level::count(x, hier) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::thread_level::count(level, hier))); + + // 5. Test cuda::thread_level::index(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::thread_level::index(level, hier))>); + static_assert(noexcept(cuda::thread_level::index(level, hier))); + + // 6. Test cuda::thread_level::rank(x, hier) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::thread_level::rank(level, hier))); +} + +template +__device__ void test_query_as_signatures(const Level& level, const Hierarchy& hier) +{ + // 1. Test cuda::thread_level::dims_as(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::thread_level::dims_as(level, hier))>); + static_assert(noexcept(cuda::thread_level::dims_as(level, hier))); + + // 2. Test cuda::thread_level::extents_as(x, hier) signature. + using ExtentsResult = decltype(cuda::thread_level::extents_as(level, hier)); + static_assert(cuda::std::__is_cuda_std_extents_v); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::thread_level::extents_as(level, hier))); + + // 3. Test cuda::thread_level::count_as(x, hier) signature. + static_assert(cuda::std::is_same_v(level, hier))>); + static_assert(noexcept(cuda::thread_level::count_as(level, hier))); + + // 4. Test cuda::thread_level::index_as(x, hier) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::thread_level::index_as(level, hier))>); + static_assert(noexcept(cuda::thread_level::index_as(level, hier))); + + // 5. Test cuda::thread_level::rank_as(x, hier) signature. + static_assert(cuda::std::is_same_v(level, hier))>); + static_assert(noexcept(cuda::thread_level::rank_as(level, hier))); +} + +template +__device__ void test(const InLevel& in_level, const Hierarchy& hier) +{ + test_query_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); + test_query_as_signatures(in_level, hier); +} + +template +__device__ void test(const Hierarchy& hier) +{ + test(cuda::block, hier); + if constexpr (cuda::has_level_v) + { + test(cuda::cluster, hier); + } + test(cuda::grid, hier); +} + +template +__global__ void test_kernel(Hierarchy hier) +{ + test(hier); +} + +#define TEST_KERNEL_INSTANTIATE(...) 
\ + template __global__ void test_kernel( \ + decltype(cuda::make_hierarchy(__VA_ARGS__))) + +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::block_dims(dim3{})); + +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims<1>(), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims<1>(), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims(dim3{}), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims<1>(), cuda::cluster_dims(dim3{}), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims<1>(), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims<1>(), cuda::block_dims(dim3{})); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims(dim3{}), cuda::block_dims<1>()); +TEST_KERNEL_INSTANTIATE(cuda::grid_dims(dim3{}), cuda::cluster_dims(dim3{}), cuda::block_dims(dim3{})); + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/native_hierarchy_queries.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/native_hierarchy_queries.pass.cpp new file mode 100644 index 00000000000..7ec30378b03 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/native_hierarchy_queries.pass.cpp @@ -0,0 +1,169 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include +#include + +#include "hierarchy_queries.h" + +__device__ void test_thread() +{ + constexpr cuda::std::size_t dext = cuda::std::dynamic_extent; + + // 1. Test cuda::gpu_thread.dims(x) + test_dims(uint3{static_cast(warpSize), 1u, 1u}, cuda::gpu_thread, cuda::warp); + test_dims(blockDim, cuda::gpu_thread, cuda::block); + { + uint3 exp = blockDim; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x *= __clusterDim().x; + exp.y *= __clusterDim().y; + exp.z *= __clusterDim().z; + })) + test_dims(exp, cuda::gpu_thread, cuda::cluster); + } + test_dims({blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z}, cuda::gpu_thread, cuda::grid); + + // 2. Test cuda::gpu_thread.static_dims(x) + test_static_dims(ulonglong3{cuda::std::size_t{32}, 1, 1}, cuda::gpu_thread, cuda::warp); + test_static_dims(ulonglong3{dext, dext, dext}, cuda::gpu_thread, cuda::block); + test_static_dims(ulonglong3{dext, dext, dext}, cuda::gpu_thread, cuda::cluster); + test_static_dims(ulonglong3{dext, dext, dext}, cuda::gpu_thread, cuda::grid); + + // 3. 
Test cuda::gpu_thread.extents(x) + test_extents(cuda::std::extents{}, cuda::gpu_thread, cuda::warp); + test_extents(cuda::std::dims<3, unsigned>{blockDim.x, blockDim.y, blockDim.z}, cuda::gpu_thread, cuda::block); + { + uint3 exp = blockDim; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x *= __clusterDim().x; + exp.y *= __clusterDim().y; + exp.z *= __clusterDim().z; + })) + test_extents(cuda::std::dims<3, unsigned>{exp.x, exp.y, exp.z}, cuda::gpu_thread, cuda::cluster); + } + { + const uint3 exp{blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z}; + test_extents(cuda::std::dims<3, unsigned>{exp.x, exp.y, exp.z}, cuda::gpu_thread, cuda::grid); + } + + // 4. Test cuda::gpu_thread.count(x) + test_count(32, cuda::gpu_thread, cuda::warp); + test_count(cuda::std::size_t{blockDim.z} * blockDim.y * blockDim.x, cuda::gpu_thread, cuda::block); + { + uint3 exp = blockDim; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x *= __clusterDim().x; + exp.y *= __clusterDim().y; + exp.z *= __clusterDim().z; + })) + test_count(cuda::std::size_t{exp.z} * exp.y * exp.x, cuda::gpu_thread, cuda::cluster); + } + { + const uint3 exp{blockDim.x * gridDim.x, blockDim.y * gridDim.y, blockDim.z * gridDim.z}; + test_count(cuda::std::size_t{exp.z} * exp.y * exp.x, cuda::gpu_thread, cuda::grid); + } + + // 5. test cuda::gpu_thread.index(x) + test_index(uint3{cuda::ptx::get_sreg_laneid(), 0, 0}, cuda::gpu_thread, cuda::warp); + test_index(threadIdx, cuda::gpu_thread, cuda::block); + { + uint3 exp = threadIdx; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x += blockDim.x * __clusterRelativeBlockIdx().x; + exp.y += blockDim.y * __clusterRelativeBlockIdx().y; + exp.z += blockDim.z * __clusterRelativeBlockIdx().z; + })) + test_index(exp, cuda::gpu_thread, cuda::cluster); + } + { + const uint3 exp{ + threadIdx.x + blockDim.x * blockIdx.x, + threadIdx.y + blockDim.y * blockIdx.y, + threadIdx.z + blockDim.z * blockIdx.z, + }; + test_index(exp, cuda::gpu_thread, cuda::grid); + } + + // 6. 
Test cuda::gpu_thread.rank(x) + test_rank(cuda::ptx::get_sreg_laneid(), cuda::gpu_thread, cuda::warp); + test_rank((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x, cuda::gpu_thread, cuda::block); + { + cuda::std::size_t exp = 0; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + ({ + exp = (((__clusterRelativeBlockIdx().z * __clusterDim().y * __clusterDim().x) + + __clusterRelativeBlockIdx().y * __clusterDim().x) + + __clusterRelativeBlockIdx().x) + * (blockDim.x * blockDim.y * blockDim.z) + + ((threadIdx.z * blockDim.y * blockDim.x) + threadIdx.y * blockDim.x) + threadIdx.x; + }), + ({ exp = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x; })) + test_rank(exp, cuda::gpu_thread, cuda::cluster); + } + { + const cuda::std::size_t exp = + (blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x) + * (blockDim.x * blockDim.y * blockDim.z) + + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x; + test_rank(exp, cuda::gpu_thread, cuda::grid); + } +} + +#if !_CCCL_COMPILER(NVRTC) +__global__ void test_kernel() +{ + test_thread(); +} + +void test() +{ + int cc_major{}; + assert(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess); + + // thread block clusters require compute capability at least 9.0 + const bool enable_clusters = cc_major >= 9; + + test_kernel<<<1, 128>>>(); + test_kernel<<<128, 1>>>(); + test_kernel<<>>(); + test_kernel<<>>(); + if (enable_clusters) + { + cudaLaunchAttribute attribute[1]{}; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = 4; + attribute[0].val.clusterDim.y = 2; + attribute[0].val.clusterDim.z = 1; + + cudaLaunchConfig_t config{}; + config.gridDim = {12, 10, 3}; + config.blockDim = {2, 8, 4}; + config.attrs = attribute; + config.numAttrs = 1; + + void* pargs[1]{}; + assert(cudaLaunchKernelExC(&config, (const void*) test_kernel, pargs) == cudaSuccess); + } + + assert(cudaDeviceSynchronize() == cudaSuccess); +} +#endif // !_CCCL_COMPILER(NVRTC) + +int main(int, char**) +{ + NV_IF_ELSE_TARGET(NV_IS_HOST, (test();), (test_thread();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/native_hierarchy_query_signatures.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/native_hierarchy_query_signatures.compile.pass.cpp new file mode 100644 index 00000000000..32250331ee1 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/thread_level/native_hierarchy_query_signatures.compile.pass.cpp @@ -0,0 +1,102 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include + +template +__device__ void test_query_signatures(const Level& level) +{ + // 1. Test cuda::thread_level::dims(x) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::thread_level::dims(level))>); + static_assert(noexcept(cuda::thread_level::dims(level))); + + // 2. Test cuda::thread_level::static_dims(x) signature. 
+ static_assert(cuda::std::is_same_v, + decltype(cuda::thread_level::static_dims(level))>); + static_assert(noexcept(cuda::thread_level::static_dims(level))); + + // 3. Test cuda::thread_level::extents(x) signature. + using ExtentsRet = cuda::std::conditional_t, + cuda::std::extents, + cuda::std::dims<3, unsigned>>; + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::thread_level::extents(level))); + + // 4. Test cuda::thread_level::count(x) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::thread_level::count(level))); + + // 5. Test cuda::thread_level::index(x) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::thread_level::index(level))>); + static_assert(noexcept(cuda::thread_level::index(level))); + + // 6. Test cuda::thread_level::rank(x) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::thread_level::rank(level))); +} + +template +__device__ void test_query_as_signatures(const Level& level) +{ + // 1. Test cuda::thread_level::dims(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::thread_level::dims_as(level))>); + static_assert(noexcept(cuda::thread_level::dims_as(level))); + + // 2. Test cuda::thread_level::extents(x) signature. + using ExtentsRet = cuda::std:: + conditional_t, cuda::std::extents, cuda::std::dims<3, T>>; + static_assert(cuda::std::is_same_v(level))>); + static_assert(noexcept(cuda::thread_level::extents_as(level))); + + // 3. Test cuda::thread_level::count(x) signature. + static_assert(cuda::std::is_same_v(level))>); + static_assert(noexcept(cuda::thread_level::count_as(level))); + + // 4. Test cuda::thread_level::index(x) signature. + static_assert( + cuda::std::is_same_v, decltype(cuda::thread_level::index_as(level))>); + static_assert(noexcept(cuda::thread_level::index_as(level))); + + // 5. Test cuda::thread_level::rank(x) signature. + static_assert(cuda::std::is_same_v(level))>); + static_assert(noexcept(cuda::thread_level::rank_as(level))); +} + +template +__device__ void test(const InLevel& in_level) +{ + test_query_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); +} + +__device__ void test() +{ + test(cuda::warp); + test(cuda::block); + test(cuda::cluster); + test(cuda::grid); +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/traits/is_natively_reachable_hierarchy_level_v.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/traits/is_natively_reachable_hierarchy_level_v.compile.pass.cpp new file mode 100644 index 00000000000..cdfffc6d372 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/traits/is_natively_reachable_hierarchy_level_v.compile.pass.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include + +struct MyLevel : cuda::hierarchy_level_base +{}; + +template +inline constexpr bool trait_v = cuda::__is_natively_reachable_hierarchy_level_v; + +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(trait_v); +static_assert(trait_v); +static_assert(trait_v); +static_assert(!trait_v); + +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(trait_v); +static_assert(trait_v); +static_assert(trait_v); +static_assert(!trait_v); + +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(trait_v); +static_assert(trait_v); +static_assert(!trait_v); + +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(trait_v); +static_assert(!trait_v); + +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(!trait_v); +static_assert(!trait_v); + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/warp_level/native_hierarchy_queries.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/warp_level/native_hierarchy_queries.pass.cpp new file mode 100644 index 00000000000..52dbd70c378 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/warp_level/native_hierarchy_queries.pass.cpp @@ -0,0 +1,164 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include +#include + +#include "hierarchy_queries.h" + +__device__ void test_warp() +{ + constexpr cuda::std::size_t dext = cuda::std::dynamic_extent; + + const unsigned count_in_block = (blockDim.x * blockDim.y * blockDim.z + warpSize - 1) / warpSize; + const unsigned rank_in_block = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) / warpSize; + const uint3 dims_in_block{count_in_block, 1, 1}; + const uint3 index_in_block{rank_in_block, 0, 0}; + + // 1. Test cuda::warp.dims(x) + test_dims(dims_in_block, cuda::warp, cuda::block); + { + uint3 exp = dims_in_block; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x *= __clusterDim().x; + exp.y *= __clusterDim().y; + exp.z *= __clusterDim().z; + })) + test_dims(exp, cuda::warp, cuda::cluster); + } + test_dims({count_in_block * gridDim.x, gridDim.y, gridDim.z}, cuda::warp, cuda::grid); + + // 2. Test cuda::warp.static_dims(x) + test_static_dims(ulonglong3{dext, 1, 1}, cuda::warp, cuda::block); + test_static_dims(ulonglong3{dext, dext, dext}, cuda::warp, cuda::cluster); + test_static_dims(ulonglong3{dext, dext, dext}, cuda::warp, cuda::grid); + + // 3. 
Test cuda::warp.extents(x) + test_extents(cuda::std::dims<1, unsigned>{count_in_block}, cuda::warp, cuda::block); + { + uint3 exp = dims_in_block; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x *= __clusterDim().x; + exp.y *= __clusterDim().y; + exp.z *= __clusterDim().z; + })) + test_extents(cuda::std::dims<3, unsigned>{exp.x, exp.y, exp.z}, cuda::warp, cuda::cluster); + } + { + const uint3 exp{count_in_block * gridDim.x, gridDim.y, gridDim.z}; + test_extents(cuda::std::dims<3, unsigned>{exp.x, exp.y, exp.z}, cuda::warp, cuda::grid); + } + + // 4. Test cuda::warp.count(x) + test_count(count_in_block, cuda::warp, cuda::block); + { + uint3 exp = dims_in_block; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x *= __clusterDim().x; + exp.y *= __clusterDim().y; + exp.z *= __clusterDim().z; + })) + test_count(cuda::std::size_t{exp.z} * exp.y * exp.x, cuda::warp, cuda::cluster); + } + { + const uint3 exp{count_in_block * gridDim.x, gridDim.y, gridDim.z}; + test_count(cuda::std::size_t{exp.z} * exp.y * exp.x, cuda::warp, cuda::grid); + } + + // 5. test cuda::warp.index(x) + test_index(index_in_block, cuda::warp, cuda::block); + { + uint3 exp = index_in_block; + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + exp.x += count_in_block * __clusterRelativeBlockIdx().x; + exp.y += __clusterRelativeBlockIdx().y; + exp.z += __clusterRelativeBlockIdx().z; + })) + test_index(exp, cuda::warp, cuda::cluster); + } + { + const uint3 exp{ + rank_in_block + count_in_block * blockIdx.x, + blockIdx.y, + blockIdx.z, + }; + test_index(exp, cuda::warp, cuda::grid); + } + + // 6. Test cuda::warp.rank(x) + test_rank(rank_in_block, cuda::warp, cuda::block); + { + cuda::std::size_t exp = 0; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, + ({ + exp = (__clusterRelativeBlockIdx().z * __clusterDim().y + __clusterRelativeBlockIdx().y) + * __clusterDim().x * count_in_block + + __clusterRelativeBlockIdx().x * count_in_block + rank_in_block; + }), + ({ exp = rank_in_block; })) + test_rank(exp, cuda::warp, cuda::cluster); + } + { + const cuda::std::size_t exp = + (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x * count_in_block + blockIdx.x * count_in_block + rank_in_block; + test_rank(exp, cuda::warp, cuda::grid); + } +} + +#if !_CCCL_COMPILER(NVRTC) +__global__ void test_kernel() +{ + test_warp(); +} + +void test() +{ + int cc_major{}; + assert(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess); + + // thread block clusters require compute capability at least 9.0 + const bool enable_clusters = cc_major >= 9; + + test_kernel<<<1, 128>>>(); + test_kernel<<<128, 1>>>(); + test_kernel<<>>(); + test_kernel<<>>(); + if (enable_clusters) + { + cudaLaunchAttribute attribute[1]{}; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = 4; + attribute[0].val.clusterDim.y = 2; + attribute[0].val.clusterDim.z = 1; + + cudaLaunchConfig_t config{}; + config.gridDim = {12, 10, 3}; + config.blockDim = {2, 8, 4}; + config.attrs = attribute; + config.numAttrs = 1; + + void* pargs[1]{}; + assert(cudaLaunchKernelExC(&config, (const void*) test_kernel, pargs) == cudaSuccess); + } + + assert(cudaDeviceSynchronize() == cudaSuccess); +} +#endif // !_CCCL_COMPILER(NVRTC) + +int main(int, char**) +{ + NV_IF_ELSE_TARGET(NV_IS_HOST, (test();), (test_warp();)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/hierarchy/warp_level/native_hierarchy_query_signatures.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/hierarchy/warp_level/native_hierarchy_query_signatures.compile.pass.cpp new file mode 
100644 index 00000000000..175e73cb915 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/hierarchy/warp_level/native_hierarchy_query_signatures.compile.pass.cpp @@ -0,0 +1,95 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// todo: enable with nvrtc +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include + +template +__device__ void test_query_signatures(const Level& level) +{ + // 1. Test cuda::warp_level::dims(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::warp_level::dims(level))>); + static_assert(noexcept(cuda::warp_level::dims(level))); + + // 2. Test cuda::warp_level::static_dims(x) signature. + static_assert(cuda::std::is_same_v, + decltype(cuda::warp_level::static_dims(level))>); + static_assert(noexcept(cuda::warp_level::static_dims(level))); + + // 3. Test cuda::warp_level::extents(x) signature. + using ExtentsRet = cuda::std::dims<(cuda::std::is_same_v) ? 1 : 3, unsigned>; + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::warp_level::extents(level))); + + // 4. Test cuda::warp_level::count(x) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::warp_level::count(level))); + + // 5. Test cuda::warp_level::index(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::warp_level::index(level))>); + static_assert(noexcept(cuda::warp_level::index(level))); + + // 6. Test cuda::warp_level::rank(x) signature. + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::warp_level::rank(level))); +} + +template +__device__ void test_query_as_signatures(const Level& level) +{ + // 1. Test cuda::warp_level::dims(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::warp_level::dims_as(level))>); + static_assert(noexcept(cuda::warp_level::dims_as(level))); + + // 2. Test cuda::warp_level::extents(x) signature. + using ExtentsRet = cuda::std::dims<(cuda::std::is_same_v) ? 1 : 3, T>; + static_assert(cuda::std::is_same_v(level))>); + static_assert(noexcept(cuda::warp_level::extents_as(level))); + + // 3. Test cuda::warp_level::count(x) signature. + static_assert(cuda::std::is_same_v(level))>); + static_assert(noexcept(cuda::warp_level::count_as(level))); + + // 4. Test cuda::warp_level::index(x) signature. + static_assert(cuda::std::is_same_v, decltype(cuda::warp_level::index_as(level))>); + static_assert(noexcept(cuda::warp_level::index_as(level))); + + // 5. Test cuda::warp_level::rank(x) signature. 
+ static_assert(cuda::std::is_same_v(level))>); + static_assert(noexcept(cuda::warp_level::rank_as(level))); +} + +template +__device__ void test(const InLevel& in_level) +{ + test_query_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); + test_query_as_signatures(in_level); +} + +__device__ void test() +{ + test(cuda::block); + test(cuda::cluster); + test(cuda::grid); +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/support/hierarchy_queries.h b/libcudacxx/test/support/hierarchy_queries.h new file mode 100644 index 00000000000..27ebd65e2c1 --- /dev/null +++ b/libcudacxx/test/support/hierarchy_queries.h @@ -0,0 +1,115 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef SUPPORT_HIERARCHY_QUERIES_H +#define SUPPORT_HIERARCHY_QUERIES_H + +#include +#include +#include +#include + +template +__device__ void test_result(cuda::hierarchy_query_result res, Vec exp) +{ + assert(res.x == static_cast(exp.x)); + assert(res.y == static_cast(exp.y)); + assert(res.z == static_cast(exp.z)); +} + +template +__device__ void test_result(cuda::std::extents res, cuda::std::extents exp) +{ + for (cuda::std::size_t i = 0; i < sizeof...(Exts); ++i) + { + assert(res.extent(i) == static_cast(exp.extent(i))); + } +} + +template +__device__ void test_dims(const uint3 exp, const Level& level, Args... args) +{ + test_result(level.dims(args...), exp); + test_result(level.template dims_as(args...), exp); + test_result(level.template dims_as(args...), exp); + test_result(level.template dims_as(args...), exp); + test_result(level.template dims_as(args...), exp); + test_result(level.template dims_as(args...), exp); + test_result(level.template dims_as(args...), exp); +} + +template +__device__ void test_static_dims(const ulonglong3 exp, Level level, Args... args) +{ + static_assert(level.static_dims(args...).x != 0); + test_result(level.static_dims(args...), exp); +} + +template +__device__ void test_extents(const Exp exp, const Level& level, Args... args) +{ + test_result(level.extents(args...), exp); + test_result(level.template extents_as(args...), exp); + test_result(level.template extents_as(args...), exp); + test_result(level.template extents_as(args...), exp); + test_result(level.template extents_as(args...), exp); + test_result(level.template extents_as(args...), exp); + test_result(level.template extents_as(args...), exp); +} + +template +__device__ void test_count(const cuda::std::size_t exp, const Level& level, Args... args) +{ + assert(level.count(args...) == exp); + assert(level.template count_as(args...) == static_cast(exp)); + assert(level.template count_as(args...) == static_cast(exp)); + assert(level.template count_as(args...) == static_cast(exp)); + assert(level.template count_as(args...) == static_cast(exp)); + assert(level.template count_as(args...) == static_cast(exp)); + assert(level.template count_as(args...) 
== static_cast(exp)); +} + +template +__device__ void test_index(const uint3 exp, const Level& level, Args... args) +{ + test_result(level.index(args...), exp); + test_result(level.template index_as(args...), exp); + test_result(level.template index_as(args...), exp); + test_result(level.template index_as(args...), exp); + test_result(level.template index_as(args...), exp); + test_result(level.template index_as(args...), exp); + test_result(level.template index_as(args...), exp); +} + +template +__device__ void test_rank(const cuda::std::size_t exp, const Level& level, Args... args) +{ + assert(level.rank(args...) == exp); + assert(level.template rank_as(args...) == static_cast(exp)); + assert(level.template rank_as(args...) == static_cast(exp)); + assert(level.template rank_as(args...) == static_cast(exp)); + assert(level.template rank_as(args...) == static_cast(exp)); + assert(level.template rank_as(args...) == static_cast(exp)); + assert(level.template rank_as(args...) == static_cast(exp)); +} + +template +__device__ constexpr cuda::std::size_t mul_static_extents(Args... args) +{ + if (((args == cuda::std::dynamic_extent) || ...)) + { + return cuda::std::dynamic_extent; + } + else + { + return (cuda::std::size_t{1} * ... * args); + } +} + +#endif // SUPPORT_HIERARCHY_QUERIES_H From 1ef85d46ff98c9b449d89a226f5b456518a9bd88 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Thu, 18 Dec 2025 23:21:51 -0600 Subject: [PATCH 22/56] Use vectorized tuning for triad benchmark for dtypes of size 2 (#7019) --- .../dispatch/tuning/tuning_transform.cuh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_transform.cuh b/cub/cub/device/dispatch/tuning/tuning_transform.cuh index 368cc6504a6..df8e391488b 100644 --- a/cub/cub/device/dispatch/tuning/tuning_transform.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_transform.cuh @@ -326,6 +326,15 @@ struct tuning_vec<1200, StoreSize> static constexpr int items_per_thread = 8; }; +// manually tuned triad on A100 +template +struct tuning_vec<800, StoreSize, LoadSize0, LoadSizes...> +{ + static constexpr int block_threads = 128; + static constexpr int vec_size = 4; + static constexpr int items_per_thread = 16; +}; + template int{max_smem_per_block}; - static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams || !can_memcpy_all_inputs; + + // on Ampere, the vectorized kernel performs better for 1 and 2 byte values + static constexpr bool use_vector_kernel_on_ampere = + ((size_of> < 4) && ...) 
&& sizeof...(RandomAccessIteratorsIn) > 1 + && size_of> < 4; + + static constexpr bool fallback_to_vectorized = + exhaust_smem || no_input_streams || !can_memcpy_all_inputs || use_vector_kernel_on_ampere; public: static constexpr auto algorithm = From 00a1b95bd4b0b43afc702a906e23b5fc02335e3e Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Thu, 18 Dec 2025 22:26:56 -0800 Subject: [PATCH 23/56] [libcu++] Fix synchronous resource adapter property passing (#6976) * Fix synchronous resource adapter property passing * Hide pinned pool on older CUDA versions * Workaround MSVC bug * Missing maybe_unused --- .../legacy_pinned_memory_resource.h | 8 +- .../synchronous_resource_adapter.h | 15 +++- .../__utility/__basic_any/virtual_functions.h | 25 +++++-- libcudacxx/include/cuda/memory_resource | 1 + .../memory_resource/synchronous_adapter.cu | 73 +++++++++++++++++++ .../cuda/memory_resource/test_resource.cuh | 2 + 6 files changed, 109 insertions(+), 15 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/synchronous_adapter.cu diff --git a/libcudacxx/include/cuda/__memory_resource/legacy_pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/legacy_pinned_memory_resource.h index ef541915471..a43ccb24800 100644 --- a/libcudacxx/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/legacy_pinned_memory_resource.h @@ -61,7 +61,7 @@ class legacy_pinned_memory_resource //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. //! @return Pointer to the newly allocated memory [[nodiscard]] _CCCL_HOST_API void* - allocate_sync(const size_t __bytes, const size_t __alignment = ::cuda::mr::default_cuda_malloc_host_alignment) + allocate_sync(const size_t __bytes, const size_t __alignment = ::cuda::mr::default_cuda_malloc_alignment) { // We need to ensure that the provided alignment matches the minimal provided alignment if (!__is_valid_alignment(__alignment)) @@ -83,7 +83,7 @@ class legacy_pinned_memory_resource _CCCL_HOST_API void deallocate_sync( void* __ptr, const size_t, - [[maybe_unused]] const size_t __alignment = ::cuda::mr::default_cuda_malloc_host_alignment) noexcept + [[maybe_unused]] const size_t __alignment = ::cuda::mr::default_cuda_malloc_alignment) noexcept { // We need to ensure that the provided alignment matches the minimal provided alignment _CCCL_ASSERT(__is_valid_alignment(__alignment), @@ -121,8 +121,8 @@ class legacy_pinned_memory_resource //! 
@brief Checks whether the passed in alignment is valid _CCCL_HOST_API static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept { - return __alignment <= ::cuda::mr::default_cuda_malloc_host_alignment - && (::cuda::mr::default_cuda_malloc_host_alignment % __alignment == 0); + return __alignment <= ::cuda::mr::default_cuda_malloc_alignment + && (::cuda::mr::default_cuda_malloc_alignment % __alignment == 0); } using default_queries = ::cuda::mr::properties_list<::cuda::mr::device_accessible, ::cuda::mr::host_accessible>; diff --git a/libcudacxx/include/cuda/__memory_resource/synchronous_resource_adapter.h b/libcudacxx/include/cuda/__memory_resource/synchronous_resource_adapter.h index e4c59dbf2d8..634ad7f7b33 100644 --- a/libcudacxx/include/cuda/__memory_resource/synchronous_resource_adapter.h +++ b/libcudacxx/include/cuda/__memory_resource/synchronous_resource_adapter.h @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -48,7 +49,9 @@ _CCCL_CONCEPT __has_member_deallocate = _CCCL_REQUIRES_EXPR( //! @note This adapter takes ownership of the contained resource. //! @tparam _Resource The type of the resource to be adapted template -struct synchronous_resource_adapter : ::cuda::mr::__copy_default_queries<_Resource> +struct synchronous_resource_adapter + : ::cuda::mr::__copy_default_queries<_Resource> + , ::cuda::forward_property, _Resource> { _CCCL_HOST_API synchronous_resource_adapter(const _Resource& __resource) noexcept : __resource(__resource) @@ -107,10 +110,14 @@ struct synchronous_resource_adapter : ::cuda::mr::__copy_default_queries<_Resour } #endif // _CCCL_STD_VER <= 2017 - template - friend constexpr void get_property(const synchronous_resource_adapter& __res, _Property __prop) noexcept + _CCCL_HOST_API _Resource& upstream_resource() noexcept { - __res.__resource.get_property(__prop); + return __resource; + } + + _CCCL_HOST_API const _Resource& upstream_resource() const noexcept + { + return __resource; } private: diff --git a/libcudacxx/include/cuda/__utility/__basic_any/virtual_functions.h b/libcudacxx/include/cuda/__utility/__basic_any/virtual_functions.h index 27036114500..6f093c8f53d 100644 --- a/libcudacxx/include/cuda/__utility/__basic_any/virtual_functions.h +++ b/libcudacxx/include/cuda/__utility/__basic_any/virtual_functions.h @@ -69,9 +69,12 @@ _CCCL_NODEBUG_API auto __c_style_cast(_Src* __ptr) noexcept -> _DstPtr return (_DstPtr) __ptr; // NOLINT(cppcoreguidelines-pro-type-cstyle-cast) } -template -[[nodiscard]] _CCCL_API auto __override_fn_([[maybe_unused]] ::cuda::std::__maybe_const<_IsConst, void>* __pv, - [[maybe_unused]] _Args... __args) noexcept(_IsNothrow) -> _Ret +// Helper function to not use a function pointer as a template parameter, which breaks MSVC in some cases. +template +[[nodiscard]] _CCCL_API auto __override_fn_dispatch_impl( + [[maybe_unused]] _FnType __fn, + [[maybe_unused]] ::cuda::std::__maybe_const<_IsConst, void>* __pv, + [[maybe_unused]] _Args... __args) noexcept(_IsNothrow) -> _Ret { using __value_type _CCCL_NODEBUG_ALIAS = ::cuda::std::__maybe_const<_IsConst, _Tp>; @@ -81,22 +84,30 @@ template ) + else if constexpr (::cuda::std::is_member_function_pointer_v<_FnType>) { // _Fn may be a pointer to a member function of a private base of _Tp. So // after static_cast-ing to _Tp*, we need to use a C-style cast to get a // pointer to the correct base class. 
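The change in this hunk implements the "Workaround MSVC bug" item from the commit message: the member/function pointer is forwarded into a helper as an ordinary argument instead of being referred to as a non-type template parameter throughout the dispatch body. A minimal standalone sketch of that pattern (names and signatures are illustrative, not the library's):

#include <type_traits>
#include <utility>

// Helper: the pointer arrives as a plain value, so member-pointer vs. free-function
// dispatch can branch on FnType without naming a non-type template parameter.
template <class T, class FnType, class Ret, class... Args>
Ret dispatch_impl(FnType fn, void* pv, Args... args)
{
  T& obj = *static_cast<T*>(pv);
  if constexpr (std::is_member_function_pointer_v<FnType>)
  {
    return (obj.*fn)(std::forward<Args>(args)...);
  }
  else
  {
    return (*fn)(obj, std::forward<Args>(args)...);
  }
}

// Entry point: the non-type template parameter is materialized only once, at the
// boundary, and is immediately handed off as a value to the helper above.
template <class T, auto Fn, class Ret, class... Args>
Ret dispatch(void* pv, Args... args)
{
  return dispatch_impl<T, decltype(Fn), Ret, Args...>(Fn, pv, std::forward<Args>(args)...);
}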
- using __class_type = ::cuda::std::__maybe_const<_IsConst, __class_of>; + using __class_type = ::cuda::std::__maybe_const<_IsConst, __class_of<_FnType>>; __class_type& __obj = *::cuda::__c_style_cast<__class_type*>(static_cast<__value_type*>(__pv)); - return (__obj.*_Fn)(static_cast<_Args&&>(__args)...); + return (__obj.*__fn)(static_cast<_Args&&>(__args)...); } else { __value_type& __obj = *static_cast<__value_type*>(__pv); - return (*_Fn)(__obj, static_cast<_Args&&>(__args)...); + return (*__fn)(__obj, static_cast<_Args&&>(__args)...); } } +template +[[nodiscard]] _CCCL_API auto __override_fn_([[maybe_unused]] ::cuda::std::__maybe_const<_IsConst, void>* __pv, + [[maybe_unused]] _Args... __args) noexcept(_IsNothrow) -> _Ret +{ + return __override_fn_dispatch_impl<_Tp, decltype(_Fn), _Ret, _IsConst, _IsNothrow, _Args...>( + _Fn, __pv, static_cast<_Args&&>(__args)...); +} + _CCCL_DIAG_POP template diff --git a/libcudacxx/include/cuda/memory_resource b/libcudacxx/include/cuda/memory_resource index 6b6c9911bbc..fb9394930c8 100644 --- a/libcudacxx/include/cuda/memory_resource +++ b/libcudacxx/include/cuda/memory_resource @@ -37,5 +37,6 @@ #include #include #include +#include #endif //_CCCL_BEGIN_NAMESPACE_CUDA diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/synchronous_adapter.cu b/libcudacxx/test/libcudacxx/cuda/memory_resource/synchronous_adapter.cu new file mode 100644 index 00000000000..4afb7b0c8e5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/synchronous_adapter.cu @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#include "test_resource.cuh" + +template +constexpr bool same_default_queries = + cuda::std::is_same_v::default_queries, + typename Resource::default_queries>; + +template +constexpr bool passed_property = + cuda::mr::synchronous_resource_with, Property> + == cuda::mr::synchronous_resource_with; + +template +constexpr bool same_properties = + passed_property && passed_property + && passed_property && passed_property; + +C2H_CCCLRT_TEST("synchronous_resource_adapter", "[memory_resource]") +{ + cuda::stream stream{cuda::device_ref{0}}; + + SECTION("Test wrapping a resource") + { + auto pool = cuda::device_default_memory_pool(cuda::device_ref{0}); + cuda::mr::synchronous_resource_adapter adapter{pool}; + auto* ptr = adapter.allocate(stream, 1024, 128); + CCCLRT_CHECK(ptr != nullptr); + CCCLRT_CHECK(pool.attribute(cuda::memory_pool_attributes::used_mem_current) > 0); + adapter.deallocate(stream, ptr, 1024, 128); + CCCLRT_CHECK(pool.attribute(cuda::memory_pool_attributes::used_mem_current) == 0); + } + SECTION("Test wrapping a synchronous resource") + { + cuda::mr::synchronous_resource_adapter adapter{ + cuda::mr::legacy_pinned_memory_resource{}}; + auto* ptr = adapter.allocate(stream, 1024, 128); + CCCLRT_CHECK(ptr != nullptr); + adapter.deallocate(stream, ptr, 1024, 128); + } + SECTION("test property passing through") + { +#if _CCCL_CTK_AT_LEAST(12, 6) + STATIC_CHECK(same_properties); +#endif // _CCCL_CTK_AT_LEAST(12, 6) + STATIC_CHECK(same_properties); + STATIC_CHECK(same_properties); + STATIC_CHECK(same_properties); + } + SECTION("test default queries") + { +#if _CCCL_CTK_AT_LEAST(12, 6) + STATIC_CHECK(same_default_queries); +#endif // _CCCL_CTK_AT_LEAST(12, 6) + STATIC_CHECK(same_default_queries); + STATIC_CHECK(same_default_queries); + STATIC_CHECK(same_default_queries); + } +} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/test_resource.cuh b/libcudacxx/test/libcudacxx/cuda/memory_resource/test_resource.cuh index aa3afc58f9e..ed1b9e5a1d1 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/test_resource.cuh +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/test_resource.cuh @@ -220,6 +220,8 @@ struct test_resource { return self.data; } + + using default_queries = cuda::mr::properties_list<>; }; using big_resource = test_resource; From adc23f533f4d3661cf0419f53fadc323a3315d4e Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Thu, 18 Dec 2025 22:31:55 -0800 Subject: [PATCH 24/56] [libcu++] Remove _view from the shared memory getter name (#6997) * Remove _view from the shared memory getter * Forgot about cudax --- cudax/test/launch/launch_smoke.cu | 4 +- .../include/cuda/__launch/configuration.h | 135 ++++++++++++------ .../ccclrt/launch/dynamic_shared_memory.cu | 6 +- .../cuda/ccclrt/launch/launch_smoke.cu | 4 +- 4 files changed, 98 insertions(+), 51 deletions(-) diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu index 5ade1c0cf03..efffdf44be0 100644 --- a/cudax/test/launch/launch_smoke.cu +++ b/cudax/test/launch/launch_smoke.cu @@ -88,7 +88,7 @@ struct dynamic_smem_single template __device__ void operator()(Config config) { - decltype(auto) dynamic_smem = cuda::dynamic_shared_memory_view(config); + decltype(auto) dynamic_smem = cuda::dynamic_shared_memory(config); static_assert(::cuda::std::is_same_v); CUDAX_REQUIRE(::cuda::device::is_object_from(dynamic_smem, ::cuda::device::address_space::shared)); 
kernel_run_proof = true; @@ -101,7 +101,7 @@ struct dynamic_smem_span template __device__ void operator()(Config config, int size) { - auto dynamic_smem = cuda::dynamic_shared_memory_view(config); + auto dynamic_smem = cuda::dynamic_shared_memory(config); static_assert(decltype(dynamic_smem)::extent == Extent); static_assert(::cuda::std::is_same_v); CUDAX_REQUIRE(dynamic_smem.size() == size); diff --git a/libcudacxx/include/cuda/__launch/configuration.h b/libcudacxx/include/cuda/__launch/configuration.h index 16e2511ddf0..d81a69ab4b5 100644 --- a/libcudacxx/include/cuda/__launch/configuration.h +++ b/libcudacxx/include/cuda/__launch/configuration.h @@ -196,7 +196,7 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024; * This type can be constructed with dynamic_shared_memory helper function. * * When launch configuration contains this option, that configuration can be - * then passed to dynamic_shared_memory_view to get the view_type over the + * then passed to dynamic_shared_memory to get the view_type over the * dynamic shared memory. It is also possible to obtain that memory through * the original extern __shared__ variable[] declaration. * @@ -213,14 +213,14 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024; * template * __global__ void kernel(Configuration conf) * { - * auto dynamic_shared = cuda::dynamic_shared_memory_view(conf); + * auto dynamic_shared = cuda::dynamic_shared_memory(conf); * dynamic_shared[0] = 1; * } * * void kernel_launch(cuda::stream_ref stream) { * auto dims = cuda::make_hierarchy(cuda::block<128>(), cuda::grid(4)); * auto conf = cuda::make_configuration(dims, - * dynamic_shared_memory()); + * cuda::dynamic_shared_memory()); * * cuda::launch(stream, conf, kernel); * } @@ -239,7 +239,7 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024; * per block */ template -class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory +class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory_option : __dyn_smem_option_base<_Tp> , public __detail::launch_option { @@ -258,48 +258,11 @@ class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory using typename __base_type::value_type; //!< Value type of the dynamic //!< shared memory elements. using typename __base_type::view_type; //!< The view type returned by the - //!< cuda::dynamic_shared_memory_view(config). + //!< cuda::dynamic_shared_memory(config). 
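To make the rename concrete, here is a usage sketch assembled from the documentation block above and the tests touched by this patch (illustrative only, not part of the diff; the int[] element type, the element count of 64, and the explicit kernel<decltype(conf)> instantiation are assumptions):

// Kernel side: the query is now spelled cuda::dynamic_shared_memory(conf)
// (previously cuda::dynamic_shared_memory_view). It returns view_type, i.e.
// T& for a single object or a cuda::std::span for array element types.
template <typename Configuration>
__global__ void kernel(Configuration conf)
{
  auto dynamic_shared = cuda::dynamic_shared_memory(conf);
  dynamic_shared[0] = 1;
}

// Host side: the option now comes from the cuda::dynamic_shared_memory<T>()
// factory functions added further down, instead of constructing the option type directly.
void kernel_launch(cuda::stream_ref stream)
{
  auto dims = cuda::make_hierarchy(cuda::block_dims<128>(), cuda::grid_dims(4));
  auto conf = cuda::make_configuration(dims, cuda::dynamic_shared_memory<int[]>(64));
  cuda::launch(stream, conf, kernel<decltype(conf)>);
}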
static constexpr bool is_relevant_on_device = true; static constexpr __detail::launch_option_kind kind = __detail::launch_option_kind::dynamic_shared_memory; - _CCCL_HIDE_FROM_ABI constexpr dynamic_shared_memory() noexcept = default; - - _CCCL_HOST_API constexpr dynamic_shared_memory(non_portable_t) noexcept - : __non_portable_{true} - {} - - _CCCL_TEMPLATE(class _Tp2 = _Tp) - _CCCL_REQUIRES((!::cuda::std::is_unbounded_array_v<_Tp2>) ) - _CCCL_HOST_API constexpr dynamic_shared_memory() noexcept - { - static_assert(sizeof(_Tp2) <= __max_portable_dyn_smem_size, "portable dynamic shared memory limit exceeded"); - } - - _CCCL_TEMPLATE(class _Tp2 = _Tp) - _CCCL_REQUIRES((!::cuda::std::is_unbounded_array_v<_Tp2>) ) - _CCCL_HOST_API constexpr dynamic_shared_memory(non_portable_t) noexcept - : __non_portable_{true} - {} - - _CCCL_TEMPLATE(class _Tp2 = _Tp) - _CCCL_REQUIRES(::cuda::std::is_unbounded_array_v<_Tp2>) - _CCCL_HOST_API constexpr dynamic_shared_memory(::cuda::std::size_t __n) - : __base_type{__n} - { - if (__n * sizeof(value_type) > __max_portable_dyn_smem_size) - { - ::cuda::std::__throw_invalid_argument("portable dynamic shared memory limit exceeded"); - } - } - - _CCCL_TEMPLATE(class _Tp2 = _Tp) - _CCCL_REQUIRES(::cuda::std::is_unbounded_array_v<_Tp2>) - _CCCL_HOST_API constexpr dynamic_shared_memory(::cuda::std::size_t __n, non_portable_t) noexcept - : __base_type{__n} - , __non_portable_{true} - {} - //! @brief Gets the size of the dynamic shared memory in bytes. [[nodiscard]] _CCCL_API constexpr ::cuda::std::size_t size_bytes() const noexcept { @@ -328,11 +291,32 @@ class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory return view_type{__ptr, __base_type::__n_}; } } + + // Helper function to access private constructors + static constexpr dynamic_shared_memory_option __create(bool __non_portable = false) noexcept + { + return dynamic_shared_memory_option{__non_portable}; + } + + static constexpr dynamic_shared_memory_option __create(::cuda::std::size_t __n, bool __non_portable = false) noexcept + { + return dynamic_shared_memory_option{__n, __non_portable}; + } + +private: + _CCCL_HOST_API constexpr dynamic_shared_memory_option(bool __non_portable = false) noexcept + : __non_portable_{__non_portable} + {} + + _CCCL_HOST_API constexpr dynamic_shared_memory_option(::cuda::std::size_t __n, bool __non_portable = false) noexcept + : __base_type{__n} + , __non_portable_{__non_portable} + {} }; template [[nodiscard]] ::cudaError_t __apply_launch_option( - const dynamic_shared_memory<_Tp>& __opt, ::CUlaunchConfig& __config, ::CUfunction __kernel) noexcept + const dynamic_shared_memory_option<_Tp>& __opt, ::CUlaunchConfig& __config, ::CUfunction __kernel) noexcept { ::cudaError_t __status = ::cudaSuccess; @@ -389,6 +373,69 @@ template return ::cudaSuccess; } +/** + * @brief Function that creates dynamic_shared_memory_option for non-unbounded array types + * + * @tparam _Tp Type intended to be stored in dynamic shared memory (must not be an unbounded array) + * @return dynamic_shared_memory_option<_Tp> instance + */ +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES((!::cuda::std::is_unbounded_array_v<_Tp>) ) +[[nodiscard]] _CCCL_HOST_API constexpr dynamic_shared_memory_option<_Tp> dynamic_shared_memory() noexcept +{ + static_assert(sizeof(_Tp) <= __max_portable_dyn_smem_size, "portable dynamic shared memory limit exceeded"); + return dynamic_shared_memory_option<_Tp>::__create(false); +} + +/** + * @brief Function that creates dynamic_shared_memory_option for non-unbounded array types with 
non-portable flag + * + * @tparam _Tp Type intended to be stored in dynamic shared memory (must not be an unbounded array) + * @param __non_portable Flag indicating non-portable size + * @return dynamic_shared_memory_option<_Tp> instance + */ +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES((!::cuda::std::is_unbounded_array_v<_Tp>) ) +[[nodiscard]] _CCCL_HOST_API constexpr dynamic_shared_memory_option<_Tp> dynamic_shared_memory(non_portable_t) noexcept +{ + return dynamic_shared_memory_option<_Tp>::__create(true); +} + +/** + * @brief Function that creates dynamic_shared_memory_option for unbounded array types + * + * @tparam _Tp Unbounded array type + * @param __n Number of elements in the dynamic shared memory + * @return dynamic_shared_memory_option<_Tp> instance + */ +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(::cuda::std::is_unbounded_array_v<_Tp>) +[[nodiscard]] _CCCL_HOST_API constexpr dynamic_shared_memory_option<_Tp> dynamic_shared_memory(::cuda::std::size_t __n) +{ + using value_type = typename dynamic_shared_memory_option<_Tp>::value_type; + if (__n * sizeof(value_type) > __max_portable_dyn_smem_size) + { + ::cuda::std::__throw_invalid_argument("portable dynamic shared memory limit exceeded"); + } + return dynamic_shared_memory_option<_Tp>::__create(__n, false); +} + +/** + * @brief Function that creates dynamic_shared_memory_option for unbounded array types with non-portable flag + * + * @tparam _Tp Unbounded array type + * @param __n Number of elements in the dynamic shared memory + * @param __non_portable Flag indicating non-portable size + * @return dynamic_shared_memory_option<_Tp> instance + */ +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(::cuda::std::is_unbounded_array_v<_Tp>) +[[nodiscard]] _CCCL_HOST_API constexpr dynamic_shared_memory_option<_Tp> +dynamic_shared_memory(::cuda::std::size_t __n, non_portable_t) noexcept +{ + return dynamic_shared_memory_option<_Tp>::__create(__n, true); +} + /** * @brief Launch option specifying launch priority * @@ -733,7 +780,7 @@ template # if _CCCL_CUDA_COMPILATION() template -_CCCL_DEVICE_API decltype(auto) dynamic_shared_memory_view(const kernel_config<_Dims, _Opts...>& __config) noexcept +_CCCL_DEVICE_API decltype(auto) dynamic_shared_memory(const kernel_config<_Dims, _Opts...>& __config) noexcept { auto& __opt = __detail::find_option_in_tuple<__detail::launch_option_kind::dynamic_shared_memory>(__config.options); using _Opt = ::cuda::std::remove_reference_t; diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/dynamic_shared_memory.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/dynamic_shared_memory.cu index c0eaab93c05..4b373a5b665 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/dynamic_shared_memory.cu +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/dynamic_shared_memory.cu @@ -25,10 +25,10 @@ struct TestKernel template __device__ void operator()(const Config& config) { - static_assert(cuda::std::is_same_v); - static_assert(noexcept(cuda::dynamic_shared_memory_view(config))); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::dynamic_shared_memory(config))); - write_smem(cuda::dynamic_shared_memory_view(config)); + write_smem(cuda::dynamic_shared_memory(config)); } __device__ void write_smem(T& view) diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu index 232df2f0a96..c757347b8de 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu +++ 
b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu @@ -82,7 +82,7 @@ struct dynamic_smem_single template __device__ void operator()(Config config) { - decltype(auto) dynamic_smem = cuda::dynamic_shared_memory_view(config); + decltype(auto) dynamic_smem = cuda::dynamic_shared_memory(config); static_assert(::cuda::std::is_same_v); CCCLRT_REQUIRE_DEVICE(::cuda::device::is_object_from(dynamic_smem, ::cuda::device::address_space::shared)); kernel_run_proof = true; @@ -95,7 +95,7 @@ struct dynamic_smem_span template __device__ void operator()(Config config, int size) { - auto dynamic_smem = cuda::dynamic_shared_memory_view(config); + auto dynamic_smem = cuda::dynamic_shared_memory(config); static_assert(decltype(dynamic_smem)::extent == Extent); static_assert(::cuda::std::is_same_v); CCCLRT_REQUIRE_DEVICE(dynamic_smem.size() == size); From 33aa5425bfca70ac0f3255bc068a5462c164d49c Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Thu, 18 Dec 2025 22:32:36 -0800 Subject: [PATCH 25/56] [thrust] Ignore CUDA free errors in thrust memory resource (#7002) * Ignore CUDA free errors in thrust memory resource * Add a comment --- thrust/thrust/system/cuda/memory_resource.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/thrust/thrust/system/cuda/memory_resource.h b/thrust/thrust/system/cuda/memory_resource.h index e052aec340d..f435539fa15 100644 --- a/thrust/thrust/system/cuda/memory_resource.h +++ b/thrust/thrust/system/cuda/memory_resource.h @@ -67,12 +67,9 @@ class cuda_memory_resource final : public mr::memory_resource void do_deallocate(Pointer p, [[maybe_unused]] std::size_t bytes, [[maybe_unused]] std::size_t alignment) override { - cudaError_t status = Dealloc(thrust::detail::pointer_traits::get(p)); - - if (status != cudaSuccess) - { - thrust::cuda_cub::throw_on_error(status, "CUDA free failed"); - } + // We skip error checking here, we shouldn't throw in deallocate in case this is called in a destructor or after + // main exits and CUDA calls can start returning errors about CUDA being cleaned up. 
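A short illustration of the failure mode the comment above guards against (editor's sketch, not part of the patch): a deallocation that runs during static destruction, after the CUDA runtime has already started shutting down.

#include <thrust/device_vector.h>

// A global (or function-local static) device_vector is destroyed only after main()
// returns. Its storage is released through do_deallocate above at a point where
// cudaFree may already report an error such as cudaErrorCudartUnloading; letting an
// exception escape a destructor there would call std::terminate, so the status is
// ignored instead.
thrust::device_vector<int> g_cache(1024);

int main()
{
  g_cache[0] = 42;
  return 0;
} // g_cache's destructor runs during static destruction, after main() has returned.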
+ [[maybe_unused]] auto status = Dealloc(thrust::detail::pointer_traits::get(p)); } }; From 262b7183adc7fe8be6ddb6d82ebadd23a709bdef Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Fri, 19 Dec 2025 01:34:23 -0800 Subject: [PATCH 26/56] [libcu++] Correctly handle extended lambda in cuda::launch (#6987) * Don't set current device in CUDA 13 and handle extended lambda * Add extended lambda test * Compiler workarounds * Waive extended lambda test on NVRTC * Apply suggestion from @davebayer --------- Co-authored-by: David Bayer <48736217+davebayer@users.noreply.github.com> --- libcudacxx/include/cuda/__launch/launch.h | 6 +- .../cuda/ccclrt/common/host_device.cuh | 2 +- .../libcudacxx/cuda/ccclrt/common/testing.cuh | 22 +----- .../libcudacxx/cuda/ccclrt/common/utility.cuh | 23 +++++- .../ccclrt/launch/extended_lambda.pass.cpp | 46 +++++++++++ .../cuda/ccclrt/launch/launch_smoke.cu | 77 +++++++++---------- .../cuda/ccclrt/stream/stream_smoke.cu | 2 +- .../resources/common_tests.cuh | 1 + 8 files changed, 111 insertions(+), 68 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/ccclrt/launch/extended_lambda.pass.cpp diff --git a/libcudacxx/include/cuda/__launch/launch.h b/libcudacxx/include/cuda/__launch/launch.h index ed30b9163a5..dd6e8085e06 100644 --- a/libcudacxx/include/cuda/__launch/launch.h +++ b/libcudacxx/include/cuda/__launch/launch.h @@ -191,7 +191,11 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, auto __combined = __conf.combine_with_default(__kernel); if constexpr (::cuda::std::is_invocable_v<_Kernel, kernel_config<_Dimensions, _Config...>, - ::cuda::std::decay_t>...>) + ::cuda::std::decay_t>...> +# if _CCCL_CUDA_COMPILER(NVCC) + && !__nv_is_extended_device_lambda_closure_type(_Kernel) +# endif + ) { auto __launcher = __kernel_launcher>...>; diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/common/host_device.cuh b/libcudacxx/test/libcudacxx/cuda/ccclrt/common/host_device.cuh index 424cd7fe024..22f8d599f4d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/common/host_device.cuh +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/common/host_device.cuh @@ -13,7 +13,7 @@ #include -#include "utility.cuh" +#include "testing.cuh" template void __global__ lambda_launcher(const Dims dims, const Lambda lambda) diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/common/testing.cuh b/libcudacxx/test/libcudacxx/cuda/ccclrt/common/testing.cuh index fdec338ad36..8cf7bc9b7f9 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/common/testing.cuh +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/common/testing.cuh @@ -20,31 +20,11 @@ #include // IWYU pragma: keep #include +#include "utility.cuh" #include #define CUDART(call) REQUIRE((call) == cudaSuccess) -__device__ inline void ccclrt_require_impl( - bool condition, const char* condition_text, const char* filename, unsigned int linenum, const char* funcname) -{ - if (!condition) - { - // TODO do warp aggregate prints for easier readability? 
- printf("%s:%u: %s: block: [%d,%d,%d], thread: [%d,%d,%d] Condition `%s` failed.\n", - filename, - linenum, - funcname, - blockIdx.x, - blockIdx.y, - blockIdx.z, - threadIdx.x, - threadIdx.y, - threadIdx.z, - condition_text); - __trap(); - } -} - // There is a problem with clang-cuda and nv/target, but we don't need the device side macros yet, // disable them for now #if _CCCL_CUDA_COMPILER(CLANG) diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/common/utility.cuh b/libcudacxx/test/libcudacxx/cuda/ccclrt/common/utility.cuh index d7c7e477042..14bb07ad6e9 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/common/utility.cuh +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/common/utility.cuh @@ -22,7 +22,26 @@ #include // IWYU pragma: keep (needed for placement new) -#include "testing.cuh" +__device__ inline void ccclrt_require_impl( + bool condition, const char* condition_text, const char* filename, unsigned int linenum, const char* funcname) +{ + if (!condition) + { + // TODO do warp aggregate prints for easier readability? + printf("%s:%u: %s: block: [%d,%d,%d], thread: [%d,%d,%d] Condition `%s` failed.\n", + filename, + linenum, + funcname, + blockIdx.x, + blockIdx.y, + blockIdx.z, + threadIdx.x, + threadIdx.y, + threadIdx.z, + condition_text); + __trap(); + } +} namespace { @@ -157,7 +176,7 @@ void launch_kernel_single_thread(cuda::stream_ref stream, Fn fn, Args... args) { cuda::__ensure_current_context guard(stream); kernel_launcher<<<1, 1, 0, stream.get()>>>(fn, args...); - CUDART(cudaGetLastError()); + assert(cudaGetLastError() == cudaSuccess); } } // namespace test } // namespace diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/extended_lambda.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/extended_lambda.pass.cpp new file mode 100644 index 00000000000..755cd6d81bf --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/extended_lambda.pass.cpp @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// ADDITIONAL_COMPILE_FLAGS: --extended-lambda +// UNSUPPORTED: nvrtc + +#include +#include +#include + +#include "../common/utility.cuh" + +__host__ void test_extended_lambda() +{ + cuda::stream stream{cuda::devices[0]}; + test::pinned i(0); + auto config = cuda::block_dims<32>() & cuda::grid_dims<1>(); + auto assign_42_lambda = [] __device__(int* pi) { + *pi = 42; + }; + cuda::launch(stream, config, assign_42_lambda, i.get()); + stream.sync(); + assert(*i == 42); + + auto assign_1337_lambda = [] __device__(auto config, int* pi) { + static_assert(config.dims.count(cuda::gpu_thread, cuda::block) == 32); + static_assert(config.dims.count(cuda::block) == 1); + *pi = 1337; + }; + cuda::launch(stream, config, assign_1337_lambda, config, i.get()); + stream.sync(); + assert(*i == 1337); +} + +int main(int, char**) +{ + NV_IF_TARGET(NV_IS_HOST, test_extended_lambda();) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu index c757347b8de..ceec4861af7 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu @@ -215,20 +215,6 @@ void launch_smoke_test(cudaStream_t dst) } } - /* Comment out for now until I figure how to enable extended lambda for only this file - // Lambda - { - cuda::launch(dst, cuda::block_dims<256>() & cuda::grid_dims(1), - [] __device__(auto config) { - if (config.dims.rank(cuda::gpu_thread, cuda::block) == 0) { - printf("Hello from the GPU\n"); - kernel_run_proof = true; - } - }); - check_kernel_run(dst); - } - */ - // Dynamic shared memory option { auto config = cuda::block_dims<32>() & cuda::grid_dims<1>(); @@ -264,20 +250,25 @@ void launch_smoke_test(cudaStream_t dst) } } -C2H_TEST("Launch smoke stream", "[launch]") +C2H_CCCLRT_TEST("Launch smoke stream", "[launch]") { // Use raw stream to make sure it can be implicitly converted on call to // launch cudaStream_t stream; - CUDART(cudaStreamCreate(&stream)); + { + ::cuda::__ensure_current_context guard(cuda::device_ref{0}); + CUDART(cudaStreamCreate(&stream)); + } launch_smoke_test(stream); - CUDART(cudaStreamSynchronize(stream)); - CUDART(cudaStreamDestroy(stream)); + { + ::cuda::__ensure_current_context guard(cuda::device_ref{0}); + CUDART(cudaStreamSynchronize(stream)); + CUDART(cudaStreamDestroy(stream)); + } } -#endif // !_CCCL_CUDA_COMPILER(CLANG) template struct kernel_with_default_config @@ -300,42 +291,44 @@ struct kernel_with_default_config } }; -/* Comment out for now until I figure how to enable extended lambda for only this file -void test_default_config() { - cuda::stream stream{cuda::device_ref{0}}; - auto grid = cuda::grid_dims(4); - auto block = cuda::block_dims<256>; - - auto verify_lambda = [] __device__(auto config) { +struct verify_callable +{ + template + __device__ void operator()(Config config) + { static_assert(config.dims.count(cuda::gpu_thread, cuda::block) == 256); CCCLRT_REQUIRE(config.dims.count(cuda::block) == 4); cooperative_groups::this_grid().sync(); - }; + } +}; + +C2H_CCCLRT_TEST("Launch with default config", "") +{ + cuda::stream stream{cuda::device_ref{0}}; + auto grid = cuda::grid_dims(4); + auto block = cuda::block_dims<256>; - SECTION("Combine with empty") { - kernel_with_default_config kernel{ - cuda::make_config(block, grid, cuda::cooperative_launch())}; + SECTION("Combine with empty") + { + 
kernel_with_default_config kernel{cuda::make_config(block, grid, cuda::cooperative_launch())}; static_assert(cuda::__is_kernel_config); static_assert(cuda::__kernel_has_default_config); - cuda::launch(stream, cuda::make_config(), kernel, verify_lambda); + cuda::launch(stream, cuda::make_config(), kernel, verify_callable{}); stream.sync(); } - SECTION("Combine with no overlap") { + SECTION("Combine with no overlap") + { kernel_with_default_config kernel{cuda::make_config(block)}; - cuda::launch(stream, cuda::make_config(grid, cuda::cooperative_launch()), - kernel, verify_lambda); + cuda::launch(stream, cuda::make_config(grid, cuda::cooperative_launch()), kernel, verify_callable{}); stream.sync(); } - SECTION("Combine with overlap") { - kernel_with_default_config kernel{ - cuda::make_config(cuda::block_dims<1>, cuda::cooperative_launch())}; - cuda::launch(stream, - cuda::make_config(block, grid, cuda::cooperative_launch()), - kernel, verify_lambda); + SECTION("Combine with overlap") + { + kernel_with_default_config kernel{cuda::make_config(cuda::block_dims<1>(), cuda::cooperative_launch())}; + cuda::launch(stream, cuda::make_config(block, grid, cuda::cooperative_launch()), kernel, verify_callable{}); stream.sync(); } } -C2H_TEST("Launch with default config", "") { test_default_config(); } -*/ +#endif // !_CCCL_CUDA_COMPILER(CLANG) diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/stream/stream_smoke.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/stream/stream_smoke.cu index c42bf45b643..04a361749dc 100644 --- a/libcudacxx/test/libcudacxx/cuda/ccclrt/stream/stream_smoke.cu +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/stream/stream_smoke.cu @@ -13,7 +13,7 @@ #include #include -#include +#include C2H_CCCLRT_TEST("Can create a stream and launch work into it", "[stream]") { diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/common_tests.cuh b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/common_tests.cuh index b6617bf3088..ed2eaf022f9 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/common_tests.cuh +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/resources/common_tests.cuh @@ -10,6 +10,7 @@ #pragma once +#include #include template From 6402bc6a6c98ed003fec2c03ed0db9162cc9c377 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Fri, 19 Dec 2025 02:11:35 -0800 Subject: [PATCH 27/56] the `` header must be included when using `_CCCL_THROW`, regardless of exception support (#7028) Co-authored-by: David Bayer <48736217+davebayer@users.noreply.github.com> --- docs/cccl/development/macro.rst | 4 ++++ libcudacxx/include/cuda/__driver/driver_api.h | 2 ++ libcudacxx/include/cuda/__tma/make_tma_descriptor.h | 2 ++ 3 files changed, 8 insertions(+) diff --git a/docs/cccl/development/macro.rst b/docs/cccl/development/macro.rst index 686a1ea50cc..1f61f2670a1 100644 --- a/docs/cccl/development/macro.rst +++ b/docs/cccl/development/macro.rst @@ -360,6 +360,10 @@ CUDA doesn't support exceptions in device code, however, sometimes we need to wr *Note*: The ``_CCCL_CATCH`` clause must always introduce a named variable, like: ``_CCCL_CATCH(const exception_type& var)``. +.. note:: + + ``_CCCL_THROW`` requires to include the ```` header, regardless exceptions are enabled or not. + Example: .. 
code-block:: c++ diff --git a/libcudacxx/include/cuda/__driver/driver_api.h b/libcudacxx/include/cuda/__driver/driver_api.h index 287a7f8b103..272408b4052 100644 --- a/libcudacxx/include/cuda/__driver/driver_api.h +++ b/libcudacxx/include/cuda/__driver/driver_api.h @@ -35,6 +35,8 @@ # include # endif +# include + # include # include diff --git a/libcudacxx/include/cuda/__tma/make_tma_descriptor.h b/libcudacxx/include/cuda/__tma/make_tma_descriptor.h index dc75f058713..5ccae27ccb2 100644 --- a/libcudacxx/include/cuda/__tma/make_tma_descriptor.h +++ b/libcudacxx/include/cuda/__tma/make_tma_descriptor.h @@ -33,6 +33,8 @@ # include # include +# include + # include # include From 5546b87bd747e324b6f779ea53119632d9736288 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 19 Dec 2025 11:26:48 +0100 Subject: [PATCH 28/56] Error out when nvrtcc cannot parse cuda_thread_count (#7035) --- libcudacxx/test/utils/nvidia/nvrtc/nvrtcc_common.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libcudacxx/test/utils/nvidia/nvrtc/nvrtcc_common.h b/libcudacxx/test/utils/nvidia/nvrtc/nvrtcc_common.h index f4df7674e25..5c4abb3a86d 100644 --- a/libcudacxx/test/utils/nvidia/nvrtc/nvrtcc_common.h +++ b/libcudacxx/test/utils/nvidia/nvrtc/nvrtcc_common.h @@ -95,6 +95,9 @@ static int parse_int_assignment(const std::string& input, std::string var, int d return std::stoi(match[1].str(), nullptr); } + fprintf(stderr, "ERROR: Could not find an integer literal for '%s' on line '%s':\r\n", var.c_str(), line.c_str()); + exit(1); + return def; } From 58aba1d75211ee21b562faa627138598c533f663 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 19 Dec 2025 11:38:56 +0100 Subject: [PATCH 29/56] Allow all public headers to be included with host compilers only (#7012) --- .../LibcudacxxPublicHeaderTestingHost.cmake | 46 ++++++++--- libcudacxx/include/cuda/__container/buffer.h | 78 ++++++++++--------- .../cuda/__container/heterogeneous_iterator.h | 46 ++++++----- .../__container/uninitialized_async_buffer.h | 36 +++++---- .../include/cuda/__device/arch_traits.h | 36 ++++----- libcudacxx/include/cuda/__event/timed_event.h | 3 - .../cuda/__functional/for_each_canceled.h | 12 +-- libcudacxx/include/cuda/__fwd/devices.h | 3 + .../include/cuda/__hierarchy/block_level.h | 23 ++++-- .../include/cuda/__hierarchy/cluster_level.h | 23 ++++-- .../include/cuda/__hierarchy/dimensions.h | 12 ++- .../cuda/__hierarchy/get_launch_dimensions.h | 20 +++-- .../include/cuda/__hierarchy/grid_level.h | 12 ++- .../cuda/__hierarchy/hierarchy_dimensions.h | 52 +++++++------ .../cuda/__hierarchy/hierarchy_level_base.h | 40 +++++----- .../cuda/__hierarchy/hierarchy_levels.h | 18 +++-- .../cuda/__hierarchy/hierarchy_query_result.h | 18 +++-- .../cuda/__hierarchy/level_dimensions.h | 30 +++---- .../__hierarchy/native_hierarchy_level_base.h | 52 +++++++------ .../include/cuda/__hierarchy/thread_level.h | 23 ++++-- libcudacxx/include/cuda/__hierarchy/traits.h | 22 +++--- .../include/cuda/__hierarchy/warp_level.h | 31 +++++--- .../__memcpy_async/dispatch_memcpy_async.h | 16 ++-- .../__memcpy_async/is_local_smem_barrier.h | 2 +- .../cuda/__memcpy_async/memcpy_completion.h | 4 +- .../__memcpy_async/try_get_barrier_handle.h | 2 +- .../cuda/__memory_pool/device_memory_pool.h | 21 +++-- .../cuda/__memory_pool/memory_pool_base.h | 59 +++++++------- .../cuda/__memory_pool/pinned_memory_pool.h | 35 ++++----- .../cuda/__memory_resource/any_resource.h | 30 +++---- .../__memory_resource/get_memory_resource.h | 
22 +++--- .../cuda/__memory_resource/get_property.h | 16 ++-- .../legacy_managed_memory_resource.h | 30 +++---- .../legacy_pinned_memory_resource.h | 29 ++++--- .../cuda/__memory_resource/properties.h | 14 ++-- .../include/cuda/__memory_resource/resource.h | 32 ++++---- .../cuda/__memory_resource/shared_resource.h | 30 ++++--- .../synchronous_resource_adapter.h | 22 +++--- .../include/cuda/__stream/internal_streams.h | 10 ++- .../include/cuda/__stream/launch_transform.h | 34 ++++---- libcudacxx/include/cuda/pipeline | 6 +- 41 files changed, 594 insertions(+), 456 deletions(-) diff --git a/libcudacxx/cmake/LibcudacxxPublicHeaderTestingHost.cmake b/libcudacxx/cmake/LibcudacxxPublicHeaderTestingHost.cmake index 26e3c1fe160..d37367332d5 100644 --- a/libcudacxx/cmake/LibcudacxxPublicHeaderTestingHost.cmake +++ b/libcudacxx/cmake/LibcudacxxPublicHeaderTestingHost.cmake @@ -6,6 +6,7 @@ # Meta target for all configs' header builds: add_custom_target(libcudacxx.test.public_headers_host_only) +add_custom_target(libcudacxx.test.public_headers_host_only_with_ctk) # Grep all public headers file( @@ -13,18 +14,8 @@ file( LIST_DIRECTORIES false RELATIVE "${libcudacxx_SOURCE_DIR}/include" CONFIGURE_DEPENDS + "${libcudacxx_SOURCE_DIR}/include/cuda/*" "${libcudacxx_SOURCE_DIR}/include/cuda/std/*" - # Add some files we expect to work in host only compilation - "${libcudacxx_SOURCE_DIR}/include/cuda/bit" - "${libcudacxx_SOURCE_DIR}/include/cuda/cmath" - "${libcudacxx_SOURCE_DIR}/include/cuda/functional" - "${libcudacxx_SOURCE_DIR}/include/cuda/iterator" - "${libcudacxx_SOURCE_DIR}/include/cuda/mdspan" - "${libcudacxx_SOURCE_DIR}/include/cuda/memory" - "${libcudacxx_SOURCE_DIR}/include/cuda/numeric" - "${libcudacxx_SOURCE_DIR}/include/cuda/type_traits" - "${libcudacxx_SOURCE_DIR}/include/cuda/utility" - "${libcudacxx_SOURCE_DIR}/include/cuda/version" ) set(public_host_header_cxx_compile_options) @@ -63,6 +54,38 @@ function(libcudacxx_create_public_header_test_host header_name headertest_src) ) endfunction() +function( + libcudacxx_create_public_header_test_host_with_ctk + header_name + headertest_src +) + # Create the default target for that file + add_library( + public_headers_host_only_with_ctk_${header_name} + SHARED + "${headertest_src}.cpp" + ) + cccl_configure_target(public_headers_host_only_with_ctk_${header_name}) + target_compile_definitions( + public_headers_host_only_with_ctk_${header_name} + PRIVATE # + ${public_host_header_cxx_compile_definitions} + _CCCL_HEADER_TEST + ) + target_compile_options( + public_headers_host_only_with_ctk_${header_name} + PRIVATE ${public_host_header_cxx_compile_options} + ) + target_link_libraries( + public_headers_host_only_with_ctk_${header_name} + PUBLIC libcudacxx.compiler_interface CUDA::cudart + ) + add_dependencies( + libcudacxx.test.public_headers_host_only_with_ctk + public_headers_host_only_with_ctk_${header_name} + ) +endfunction() + function(libcudacxx_add_public_headers_host_only header) # ${header} contains the "/" from the subfolder, replace by "_" for actual names string(REPLACE "/" "_" header_name "${header}") @@ -76,6 +99,7 @@ function(libcudacxx_add_public_headers_host_only header) # Create the default target for that file libcudacxx_create_public_header_test_host(${header_name} ${headertest_src}) + libcudacxx_create_public_header_test_host_with_ctk(${header_name} ${headertest_src}) endfunction() foreach (header IN LISTS public_headers_host_only) diff --git a/libcudacxx/include/cuda/__container/buffer.h b/libcudacxx/include/cuda/__container/buffer.h 
index 3564e2d8806..c3b26795c76 100644 --- a/libcudacxx/include/cuda/__container/buffer.h +++ b/libcudacxx/include/cuda/__container/buffer.h @@ -21,35 +21,37 @@ # pragma system_header #endif // no system header -#if _CCCL_CUDA_COMPILATION() -# include -#endif // _CCCL_CUDA_COMPILATION() - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#if _CCCL_HAS_CTK() + +# if _CCCL_CUDA_COMPILATION() +# include +# endif // _CCCL_CUDA_COMPILATION() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include //! @file The \c buffer class provides a container of contiguous memory _CCCL_BEGIN_NAMESPACE_CUDA @@ -309,7 +311,7 @@ class buffer __buf_.size()); } -#ifndef _CCCL_DOXYGEN_INVOKED // doxygen conflates the overloads +# ifndef _CCCL_DOXYGEN_INVOKED // doxygen conflates the overloads _CCCL_TEMPLATE(class _Range, class _Resource, class _Env = ::cuda::std::execution::env<>) _CCCL_REQUIRES( ::cuda::mr::synchronous_resource<::cuda::std::decay_t<_Resource>> _CCCL_AND __compatible_range<_Range> @@ -332,7 +334,7 @@ class buffer __unwrapped_begin(), __buf_.size()); } -#endif // _CCCL_DOXYGEN_INVOKED +# endif // _CCCL_DOXYGEN_INVOKED //! @} //! @addtogroup iterators @@ -447,7 +449,7 @@ class buffer return __buf_.data(); } -#ifndef _CCCL_DOXYGEN_INVOKED +# ifndef _CCCL_DOXYGEN_INVOKED //! @brief Returns a pointer to the first element of the buffer. If the buffer //! is empty, the returned pointer will be null. [[nodiscard]] _CCCL_HIDE_FROM_ABI pointer __unwrapped_begin() noexcept @@ -477,7 +479,7 @@ class buffer { return __buf_.data() + __buf_.size(); } -#endif // _CCCL_DOXYGEN_INVOKED +# endif // _CCCL_DOXYGEN_INVOKED //! @} @@ -683,13 +685,13 @@ __fill_n(cuda::stream_ref __stream, _Tp* __first, ::cuda::std::size_t __count, c } else { -#if _CCCL_CUDA_COMPILATION() +# if _CCCL_CUDA_COMPILATION() ::cuda::__ensure_current_context __guard(__stream); ::cub::DeviceTransform::Fill(__first, __count, __value, __stream.get()); -#else // ^^^ _CCCL_CUDA_COMPILATION() ^^^ / vvv !_CCCL_CUDA_COMPILATION() vvv +# else // ^^^ _CCCL_CUDA_COMPILATION() ^^^ / vvv !_CCCL_CUDA_COMPILATION() vvv static_assert(sizeof(_Tp) <= 4, "CUDA compiler is required to initialize an async_buffer with elements larger than 4 bytes"); -#endif // ^^^ !_CCCL_CUDA_COMPILATION() ^^^ +# endif // ^^^ !_CCCL_CUDA_COMPILATION() ^^^ } } } @@ -886,6 +888,8 @@ auto make_buffer(stream_ref __stream, _Resource&& __mr, _Range&& __range, const } _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif //_CUDA___CONTAINER_BUFFER_H diff --git a/libcudacxx/include/cuda/__container/heterogeneous_iterator.h b/libcudacxx/include/cuda/__container/heterogeneous_iterator.h index e33c40f2016..9a62fc277fb 100644 --- a/libcudacxx/include/cuda/__container/heterogeneous_iterator.h +++ b/libcudacxx/include/cuda/__container/heterogeneous_iterator.h @@ -21,17 +21,19 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#if _CCCL_HAS_CTK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include //! 
@file The \c heterogeneous_iterator class is an iterator that provides typed execution space safety. _CCCL_BEGIN_NAMESPACE_CUDA @@ -290,7 +292,7 @@ class heterogeneous_iterator return __temp; } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document +# ifndef _CCCL_DOXYGEN_INVOKED // Do not document //! @brief Advance a \c heterogeneous_iterator //! @param __count The number of elements to advance. //! @param __other A heterogeneous_iterator. @@ -301,7 +303,7 @@ class heterogeneous_iterator __other += __count; return __other; } -#endif // _CCCL_DOXYGEN_INVOKED +# endif // _CCCL_DOXYGEN_INVOKED //! @brief Advance a \c heterogeneous_iterator by the negative value of \p __count //! @param __count The number of elements to advance. @@ -330,7 +332,7 @@ class heterogeneous_iterator return static_cast(this->__ptr_ - __other.__ptr_); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document +# ifndef _CCCL_DOXYGEN_INVOKED // Do not document //! @brief Equality comparison between two heterogeneous_iterator //! @param __lhs A heterogeneous_iterator. //! @param __rhs Another heterogeneous_iterator. @@ -340,7 +342,7 @@ class heterogeneous_iterator { return __lhs.__ptr_ == __rhs.__ptr_; } -# if _CCCL_STD_VER <= 2017 +# if _CCCL_STD_VER <= 2017 //! @brief Inequality comparison between two heterogeneous_iterator //! @param __lhs A heterogeneous_iterator. //! @param __rhs Another heterogeneous_iterator. @@ -350,15 +352,15 @@ class heterogeneous_iterator { return __lhs.__ptr_ != __rhs.__ptr_; } -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_STD_VER <= 2017 -# if _LIBCUDACXX_HAS_SPACESHIP_OPERATOR() +# if _LIBCUDACXX_HAS_SPACESHIP_OPERATOR() [[nodiscard]] _CCCL_API friend constexpr ::cuda::std::strong_ordering operator<=>(const heterogeneous_iterator& __lhs, const heterogeneous_iterator& __rhs) noexcept { return __lhs.__ptr_ <=> __rhs.__ptr_; } -# else // ^^^ _LIBCUDACXX_HAS_SPACESHIP_OPERATOR() ^^^ / vvv !_LIBCUDACXX_HAS_SPACESHIP_OPERATOR() vvv +# else // ^^^ _LIBCUDACXX_HAS_SPACESHIP_OPERATOR() ^^^ / vvv !_LIBCUDACXX_HAS_SPACESHIP_OPERATOR() vvv //! @brief Less than relation between two heterogeneous_iterator //! @param __lhs A heterogeneous_iterator. //! @param __rhs Another heterogeneous_iterator. @@ -399,8 +401,8 @@ class heterogeneous_iterator { return __lhs.__ptr_ >= __rhs.__ptr_; } -# endif // !_LIBCUDACXX_HAS_SPACESHIP_OPERATOR() -#endif // _CCCL_DOXYGEN_INVOKED +# endif // !_LIBCUDACXX_HAS_SPACESHIP_OPERATOR() +# endif // _CCCL_DOXYGEN_INVOKED _CCCL_API constexpr pointer __unwrap() const noexcept { @@ -431,6 +433,8 @@ struct pointer_traits<::cuda::heterogeneous_iterator<_Tp, _Properties...>> _CCCL_END_NAMESPACE_CUDA_STD -#include +# include + +#endif // _CCCL_HAS_CTK() #endif //__CUDAX__CONTAINERS_HETEROGENEOUS_ITERATOR_CUH diff --git a/libcudacxx/include/cuda/__container/uninitialized_async_buffer.h b/libcudacxx/include/cuda/__container/uninitialized_async_buffer.h index cf49467b64d..6f00ef61b76 100644 --- a/libcudacxx/include/cuda/__container/uninitialized_async_buffer.h +++ b/libcudacxx/include/cuda/__container/uninitialized_async_buffer.h @@ -21,19 +21,21 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#if _CCCL_HAS_CTK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include //! @file //! 
The \c __uninitialized_async_buffer class provides a typed buffer allocated @@ -138,7 +140,7 @@ class __uninitialized_async_buffer return {__self.__get_data(), __self.size()}; } -#ifndef _CCCL_DOXYGEN_INVOKED +# ifndef _CCCL_DOXYGEN_INVOKED // This is needed to ensure that we do not do a deep copy in // __replace_allocation struct __fake_resource_ref @@ -179,7 +181,7 @@ class __uninitialized_async_buffer _CCCL_REQUIRES(::cuda::std::__is_included_in_v<_Property, _Properties...>) _CCCL_HIDE_FROM_ABI friend constexpr void get_property(const __fake_resource_ref&, _Property) noexcept {} }; -#endif // _CCCL_DOXYGEN_INVOKED +# endif // _CCCL_DOXYGEN_INVOKED public: using value_type = _Tp; @@ -411,6 +413,8 @@ template using uninitialized_async_device_buffer = __uninitialized_async_buffer<_Tp, ::cuda::mr::device_accessible>; _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif //__CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H diff --git a/libcudacxx/include/cuda/__device/arch_traits.h b/libcudacxx/include/cuda/__device/arch_traits.h index 1d952458992..2da1882cd66 100644 --- a/libcudacxx/include/cuda/__device/arch_traits.h +++ b/libcudacxx/include/cuda/__device/arch_traits.h @@ -21,17 +21,15 @@ # pragma system_header #endif // no system header -#if _CCCL_HAS_CTK() - -# include -# include -# include -# include -# include -# include -# include +#include +#include +#include +#include +#include +#include +#include -# include +#include _CCCL_BEGIN_NAMESPACE_CUDA @@ -507,7 +505,11 @@ template <> case arch_id::sm_121a: return ::cuda::arch_traits(); default: +#if _CCCL_HAS_CTK() ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture"); +#else // ^^^ _CCCL_HAS_CTK() ^^^ / vvv !_CCCL_HAS_CTK() vvv + ::cuda::__throw_cuda_error(/*cudaErrorInvalidValue*/ 1, "Traits requested for an unknown architecture"); +#endif // ^^^ !_CCCL_HAS_CTK() ^^^ break; } } @@ -522,7 +524,7 @@ template <> _CCCL_END_NAMESPACE_CUDA -# if _CCCL_CUDA_COMPILATION() +#if _CCCL_CUDA_COMPILATION() _CCCL_BEGIN_NAMESPACE_CUDA_DEVICE @@ -535,19 +537,17 @@ _CCCL_BEGIN_NAMESPACE_CUDA_DEVICE template [[nodiscard]] _CCCL_DEVICE_API inline _CCCL_TARGET_CONSTEXPR ::cuda::arch_traits_t current_arch_traits() noexcept { -# if _CCCL_DEVICE_COMPILATION() +# if _CCCL_DEVICE_COMPILATION() return ::cuda::arch_traits_for(::cuda::device::current_arch_id<_Dummy>()); -# else // ^^^ _CCCL_DEVICE_COMPILATION() ^^^ / vvv !_CCCL_DEVICE_COMPILATION() vvv +# else // ^^^ _CCCL_DEVICE_COMPILATION() ^^^ / vvv !_CCCL_DEVICE_COMPILATION() vvv return {}; -# endif // ^^^ !_CCCL_DEVICE_COMPILATION() ^^^ +# endif // ^^^ !_CCCL_DEVICE_COMPILATION() ^^^ } _CCCL_END_NAMESPACE_CUDA_DEVICE -# endif // _CCCL_CUDA_COMPILATION - -# include +#endif // _CCCL_CUDA_COMPILATION -#endif // _CCCL_HAS_CTK() +#include #endif // _CUDA___DEVICE_ARCH_TRAITS_H diff --git a/libcudacxx/include/cuda/__event/timed_event.h b/libcudacxx/include/cuda/__event/timed_event.h index 603c140fb7c..a313f5bacff 100644 --- a/libcudacxx/include/cuda/__event/timed_event.h +++ b/libcudacxx/include/cuda/__event/timed_event.h @@ -11,9 +11,6 @@ #ifndef _CUDA___EVENT_TIMED_EVENT_H #define _CUDA___EVENT_TIMED_EVENT_H -#include -// cuda_runtime_api needs to come first - #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) diff --git a/libcudacxx/include/cuda/__functional/for_each_canceled.h b/libcudacxx/include/cuda/__functional/for_each_canceled.h index 54a97ad0290..b230911e5e8 100644 --- a/libcudacxx/include/cuda/__functional/for_each_canceled.h 
+++ b/libcudacxx/include/cuda/__functional/for_each_canceled.h @@ -21,14 +21,14 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include +#if _CCCL_CUDA_COMPILATION() -#include +# include +# include +# include +# include -#if _CCCL_CUDA_COMPILATION() +# include # include diff --git a/libcudacxx/include/cuda/__fwd/devices.h b/libcudacxx/include/cuda/__fwd/devices.h index 5e77301b2e6..3158b66ec2e 100644 --- a/libcudacxx/include/cuda/__fwd/devices.h +++ b/libcudacxx/include/cuda/__fwd/devices.h @@ -27,10 +27,13 @@ _CCCL_BEGIN_NAMESPACE_CUDA +#if _CCCL_HAS_CTK() class __physical_device; class device_ref; template <::cudaDeviceAttr _Attr> struct __dev_attr; +#endif // _CCCL_HAS_CTK() + struct arch_traits_t; class compute_capability; enum class arch_id : int; diff --git a/libcudacxx/include/cuda/__hierarchy/block_level.h b/libcudacxx/include/cuda/__hierarchy/block_level.h index cfa000df0d2..81e1790d93f 100644 --- a/libcudacxx/include/cuda/__hierarchy/block_level.h +++ b/libcudacxx/include/cuda/__hierarchy/block_level.h @@ -21,14 +21,16 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -43,6 +45,8 @@ struct block_level : __native_hierarchy_level_base using __base_type = __native_hierarchy_level_base; using __base_type::count_as; using __base_type::extents_as; + +# if _CCCL_CUDA_COMPILATION() using __base_type::index_as; using __base_type::rank_as; @@ -130,12 +134,15 @@ struct block_level : __native_hierarchy_level_base const auto __idx = index_as<_Tp>(__level, __hier); return static_cast<_Tp>((__idx.z * __dims.y + __idx.y) * __dims.x + __idx.x); } +# endif // _CCCL_CUDA_COMPILATION() }; _CCCL_GLOBAL_CONSTANT block_level block; _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_BLOCK_LEVEL_H diff --git a/libcudacxx/include/cuda/__hierarchy/cluster_level.h b/libcudacxx/include/cuda/__hierarchy/cluster_level.h index 70ef8fdce89..72a40127fbb 100644 --- a/libcudacxx/include/cuda/__hierarchy/cluster_level.h +++ b/libcudacxx/include/cuda/__hierarchy/cluster_level.h @@ -21,14 +21,16 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -42,6 +44,8 @@ struct cluster_level : __native_hierarchy_level_base using __base_type = __native_hierarchy_level_base; using __base_type::extents_as; + +# if _CCCL_CUDA_COMPILATION() using __base_type::index_as; // interactions with grid level @@ -63,12 +67,15 @@ struct cluster_level : __native_hierarchy_level_base NV_IF_TARGET(NV_PROVIDES_SM_90, (__idx = ::__clusterIdx();)) return {static_cast<_Tp>(__idx.x), static_cast<_Tp>(__idx.y), static_cast<_Tp>(__idx.z)}; } +# endif // _CCCL_CUDA_COMPILATION() }; _CCCL_GLOBAL_CONSTANT cluster_level cluster; _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_CLUSTER_LEVEL_H diff --git a/libcudacxx/include/cuda/__hierarchy/dimensions.h b/libcudacxx/include/cuda/__hierarchy/dimensions.h index 395c4c6e996..97151555b33 100644 --- a/libcudacxx/include/cuda/__hierarchy/dimensions.h +++ b/libcudacxx/include/cuda/__hierarchy/dimensions.h @@ -21,10 +21,12 @@ # pragma system_header #endif // no system header 
-#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -166,6 +168,8 @@ template } // namespace __detail _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_DIMENSIONS_H diff --git a/libcudacxx/include/cuda/__hierarchy/get_launch_dimensions.h b/libcudacxx/include/cuda/__hierarchy/get_launch_dimensions.h index ee3d07ee8ae..173c07413c1 100644 --- a/libcudacxx/include/cuda/__hierarchy/get_launch_dimensions.h +++ b/libcudacxx/include/cuda/__hierarchy/get_launch_dimensions.h @@ -21,14 +21,16 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -85,6 +87,8 @@ constexpr auto _CCCL_HOST get_launch_dimensions(const hierarchy_dimensions<_Leve _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_GET_LAUNCH_DIMENSIONS_H diff --git a/libcudacxx/include/cuda/__hierarchy/grid_level.h b/libcudacxx/include/cuda/__hierarchy/grid_level.h index 9cd6a111709..3ab82979802 100644 --- a/libcudacxx/include/cuda/__hierarchy/grid_level.h +++ b/libcudacxx/include/cuda/__hierarchy/grid_level.h @@ -21,10 +21,12 @@ # pragma system_header #endif // no system header -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -39,6 +41,8 @@ _CCCL_GLOBAL_CONSTANT grid_level grid; _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_GRID_LEVEL_H diff --git a/libcudacxx/include/cuda/__hierarchy/hierarchy_dimensions.h b/libcudacxx/include/cuda/__hierarchy/hierarchy_dimensions.h index 1d375843d83..0542e84abaf 100644 --- a/libcudacxx/include/cuda/__hierarchy/hierarchy_dimensions.h +++ b/libcudacxx/include/cuda/__hierarchy/hierarchy_dimensions.h @@ -21,22 +21,24 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include +#if _CCCL_HAS_CTK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -254,7 +256,7 @@ struct __hierarchy_extents_helper }; template -[[nodiscard]] _CCCL_DEVICE constexpr auto __static_index_hint(const dimensions<_Tp, _Extents...>& __dims, ::dim3 __index) +[[nodiscard]] _CCCL_DEVICE constexpr auto __static_index_hint(const dimensions<_Tp, _Extents...>&, ::dim3 __index) { using _HintedIndexT = dimensions<_Tp, (_Extents == 1 ? 
0 : ::cuda::std::dynamic_extent)...>; return _HintedIndexT(__index.x, __index.y, __index.z); @@ -376,10 +378,10 @@ struct hierarchy_dimensions : levels(__ls) {} -#if !defined(_CCCL_NO_THREE_WAY_COMPARISON) && !_CCCL_COMPILER(MSVC, <, 19, 39) && !_CCCL_COMPILER(GCC, <, 12) +# if !defined(_CCCL_NO_THREE_WAY_COMPARISON) && !_CCCL_COMPILER(MSVC, <, 19, 39) && !_CCCL_COMPILER(GCC, <, 12) [[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr bool operator==(const hierarchy_dimensions&) const noexcept = default; -#else // ^^^ !_CCCL_NO_THREE_WAY_COMPARISON ^^^ / vvv - // _CCCL_NO_THREE_WAY_COMPARISON vvv +# else // ^^^ !_CCCL_NO_THREE_WAY_COMPARISON ^^^ / vvv + // _CCCL_NO_THREE_WAY_COMPARISON vvv [[nodiscard]] _CCCL_API friend constexpr bool operator==(const hierarchy_dimensions& __left, const hierarchy_dimensions& __right) noexcept { @@ -391,7 +393,7 @@ struct hierarchy_dimensions { return __left.levels != __right.levels; } -#endif // _CCCL_NO_THREE_WAY_COMPARISON +# endif // _CCCL_NO_THREE_WAY_COMPARISON private: // This being static is a bit of a hack to make extents_type working without @@ -829,12 +831,12 @@ struct hierarchy_dimensions } } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document +# ifndef _CCCL_DOXYGEN_INVOKED // Do not document constexpr hierarchy_dimensions combine([[maybe_unused]] __empty_hierarchy __empty) const { return *this; } -#endif // _CCCL_DOXYGEN_INVOKED +# endif // _CCCL_DOXYGEN_INVOKED }; // TODO consider having LUnit optional argument for template argument deduction @@ -914,6 +916,8 @@ constexpr auto hierarchy_add_level(const hierarchy_dimensions<_Unit, _Levels...> } _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_HIERARCHY_DIMENSIONS_H diff --git a/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h b/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h index ccf8e319fb7..4bf91c3454b 100644 --- a/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h +++ b/libcudacxx/include/cuda/__hierarchy/hierarchy_level_base.h @@ -21,19 +21,21 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#if _CCCL_HAS_CTK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -153,7 +155,7 @@ struct hierarchy_level_base return _Level::template count_as<__default_1d_query_type<_InLevel>>(__level, __hier); } -#if _CCCL_CUDA_COMPILATION() +# if _CCCL_CUDA_COMPILATION() _CCCL_TEMPLATE(class _InLevel, class _Hierarchy) _CCCL_REQUIRES(__is_hierarchy_level_v<_InLevel> _CCCL_AND __is_hierarchy_v<_Hierarchy>) [[nodiscard]] _CCCL_DEVICE_API static constexpr auto index(const _InLevel& __level, const _Hierarchy& __hier) noexcept @@ -168,7 +170,7 @@ struct hierarchy_level_base { return _Level::template rank_as<__default_1d_query_type<_InLevel>>(__level, __hier); } -#endif // _CCCL_CUDA_COMPILATION() +# endif // _CCCL_CUDA_COMPILATION() _CCCL_TEMPLATE(class _Tp, class _InLevel, class _Hierarchy) _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_hierarchy_level_v<_InLevel> _CCCL_AND @@ -216,7 +218,7 @@ struct hierarchy_level_base return __count_as_impl<_Tp>(__level, __hier); } -#if _CCCL_CUDA_COMPILATION() +# if _CCCL_CUDA_COMPILATION() _CCCL_TEMPLATE(class _Tp, class _InLevel, class _Hierarchy) _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> _CCCL_AND __is_hierarchy_level_v<_InLevel> 
_CCCL_AND __is_hierarchy_v<_Hierarchy>) @@ -283,7 +285,7 @@ struct hierarchy_level_base } return __ret; } -#endif // _CCCL_CUDA_COMPILATION() +# endif // _CCCL_CUDA_COMPILATION() private: template @@ -334,6 +336,8 @@ struct hierarchy_level_base _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_HIERARCHY_LEVEL_BASE_H diff --git a/libcudacxx/include/cuda/__hierarchy/hierarchy_levels.h b/libcudacxx/include/cuda/__hierarchy/hierarchy_levels.h index a629cf938a9..61b75536f08 100644 --- a/libcudacxx/include/cuda/__hierarchy/hierarchy_levels.h +++ b/libcudacxx/include/cuda/__hierarchy/hierarchy_levels.h @@ -21,13 +21,15 @@ # pragma system_header #endif // no system header -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include -#include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -119,6 +121,7 @@ namespace __detail template struct __dims_helper; +# if _CCCL_CUDA_COMPILATION() template struct __dims_helper<_Level, _Level> { @@ -188,6 +191,7 @@ struct __dims_helper NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, (return __clusterIdx();), (return ::dim3(0, 0, 0);)); } }; +# endif // _CCCL_CUDA_COMPILATION() // Seems like a compiler bug, where NODISCARD is marked as ignored due to void // return type, while its not possible to ever have void return type here @@ -412,6 +416,8 @@ _CCCL_DEVICE auto index(const _Unit&, const _Level&) } // namespace hierarchy _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_HIERARCHY_LEVELS_H diff --git a/libcudacxx/include/cuda/__hierarchy/hierarchy_query_result.h b/libcudacxx/include/cuda/__hierarchy/hierarchy_query_result.h index 0192198ba03..dd73dce727d 100644 --- a/libcudacxx/include/cuda/__hierarchy/hierarchy_query_result.h +++ b/libcudacxx/include/cuda/__hierarchy/hierarchy_query_result.h @@ -21,13 +21,15 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -144,6 +146,8 @@ struct hierarchy_query_result _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_HIERARCHY_QUERY_RESULT_H diff --git a/libcudacxx/include/cuda/__hierarchy/level_dimensions.h b/libcudacxx/include/cuda/__hierarchy/level_dimensions.h index ec4fcd38d9f..42a76be3a68 100644 --- a/libcudacxx/include/cuda/__hierarchy/level_dimensions.h +++ b/libcudacxx/include/cuda/__hierarchy/level_dimensions.h @@ -21,15 +21,17 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -140,10 +142,10 @@ struct level_dimensions _CCCL_API constexpr level_dimensions() : dims(){}; -#if !defined(_CCCL_NO_THREE_WAY_COMPARISON) && !_CCCL_COMPILER(MSVC, <, 19, 39) && !_CCCL_COMPILER(GCC, <, 12) +# if !defined(_CCCL_NO_THREE_WAY_COMPARISON) && !_CCCL_COMPILER(MSVC, <, 19, 39) && !_CCCL_COMPILER(GCC, <, 12) [[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr bool operator==(const level_dimensions&) const noexcept = default; -#else // ^^^ !_CCCL_NO_THREE_WAY_COMPARISON ^^^ / vvv - // _CCCL_NO_THREE_WAY_COMPARISON vvv +# else // ^^^ !_CCCL_NO_THREE_WAY_COMPARISON ^^^ / vvv + // _CCCL_NO_THREE_WAY_COMPARISON vvv [[nodiscard]] _CCCL_API friend constexpr bool 
operator==(const level_dimensions& __left, const level_dimensions& __right) noexcept { @@ -155,7 +157,7 @@ struct level_dimensions { return __left.dims != __right.dims; } -#endif // _CCCL_NO_THREE_WAY_COMPARISON +# endif // _CCCL_NO_THREE_WAY_COMPARISON }; /** @@ -234,6 +236,8 @@ _CCCL_API constexpr auto block_dims(_Dims __dims) noexcept } _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_LEVEL_DIMENSIONS_H diff --git a/libcudacxx/include/cuda/__hierarchy/native_hierarchy_level_base.h b/libcudacxx/include/cuda/__hierarchy/native_hierarchy_level_base.h index 2d97796a7a4..fed901e297f 100644 --- a/libcudacxx/include/cuda/__hierarchy/native_hierarchy_level_base.h +++ b/libcudacxx/include/cuda/__hierarchy/native_hierarchy_level_base.h @@ -21,33 +21,40 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#if _CCCL_HAS_CTK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA // cudafe++ makes the queries (that are device only) return void when compiling for host, which causes host compilers // to warn about applying [[nodiscard]] to a function that returns void. _CCCL_DIAG_PUSH -#if _CCCL_CUDA_COMPILER(NVCC) +# if _CCCL_CUDA_COMPILER(NVCC) _CCCL_DIAG_SUPPRESS_GCC("-Wattributes") _CCCL_DIAG_SUPPRESS_CLANG("-Wignored-attributes") _CCCL_DIAG_SUPPRESS_NVHPC(nodiscard_doesnt_apply) -#endif // _CCCL_CUDA_COMPILER(NVCC) +# endif // _CCCL_CUDA_COMPILER(NVCC) template struct __native_hierarchy_level_base : hierarchy_level_base<_Level> { + template + using __default_md_query_type = unsigned; + template + using __default_1d_query_type = ::cuda::std::size_t; + using __base_type = hierarchy_level_base<_Level>; using __base_type::count; using __base_type::count_as; @@ -55,18 +62,13 @@ struct __native_hierarchy_level_base : hierarchy_level_base<_Level> using __base_type::dims_as; using __base_type::extents; using __base_type::extents_as; + using __base_type::static_dims; + +# if _CCCL_CUDA_COMPILATION() using __base_type::index; using __base_type::index_as; using __base_type::rank; using __base_type::rank_as; - using __base_type::static_dims; - - template - using __default_md_query_type = unsigned; - template - using __default_1d_query_type = ::cuda::std::size_t; - -#if _CCCL_CUDA_COMPILATION() _CCCL_TEMPLATE(class _InLevel) _CCCL_REQUIRES(__is_native_hierarchy_level_v<_InLevel>) @@ -187,7 +189,7 @@ struct __native_hierarchy_level_base : hierarchy_level_base<_Level> return __ret; } -#endif // _CCCL_CUDA_COMPILATION() +# endif // _CCCL_CUDA_COMPILATION() }; _CCCL_DIAG_POP @@ -198,6 +200,8 @@ struct __native_hierarchy_level_base : hierarchy_level_base +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_NATIVE_HIERARCHY_LEVEL_BASE_H diff --git a/libcudacxx/include/cuda/__hierarchy/thread_level.h b/libcudacxx/include/cuda/__hierarchy/thread_level.h index f8beda576d1..ad117180262 100644 --- a/libcudacxx/include/cuda/__hierarchy/thread_level.h +++ b/libcudacxx/include/cuda/__hierarchy/thread_level.h @@ -21,14 +21,16 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -42,6 +44,8 @@ struct thread_level : __native_hierarchy_level_base using 
__base_type = __native_hierarchy_level_base; using __base_type::extents_as; + +# if _CCCL_CUDA_COMPILATION() using __base_type::index_as; using __base_type::rank_as; @@ -85,12 +89,15 @@ struct thread_level : __native_hierarchy_level_base { return static_cast<_Tp>(::cuda::ptx::get_sreg_laneid()); } +# endif // _CCCL_CUDA_COMPILATION() }; _CCCL_GLOBAL_CONSTANT thread_level gpu_thread; _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_THREAD_LEVEL_H diff --git a/libcudacxx/include/cuda/__hierarchy/traits.h b/libcudacxx/include/cuda/__hierarchy/traits.h index 1e64e0aaa77..d59dd6fd193 100644 --- a/libcudacxx/include/cuda/__hierarchy/traits.h +++ b/libcudacxx/include/cuda/__hierarchy/traits.h @@ -21,15 +21,17 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -114,6 +116,8 @@ using __next_hierarchy_level_t = typename __next_hierarchy_level<_Level, _Hierar _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_TRAITS_H diff --git a/libcudacxx/include/cuda/__hierarchy/warp_level.h b/libcudacxx/include/cuda/__hierarchy/warp_level.h index 6e1d9170391..bfbfff2af66 100644 --- a/libcudacxx/include/cuda/__hierarchy/warp_level.h +++ b/libcudacxx/include/cuda/__hierarchy/warp_level.h @@ -21,17 +21,19 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#if _CCCL_HAS_CTK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -41,6 +43,8 @@ struct warp_level : __native_hierarchy_level_base using __base_type = __native_hierarchy_level_base; using __base_type::extents_as; + +# if _CCCL_CUDA_COMPILATION() using __base_type::index_as; _CCCL_TEMPLATE(class _Tp) @@ -56,12 +60,15 @@ struct warp_level : __native_hierarchy_level_base { return {static_cast<_Tp>(gpu_thread.rank(block) / 32), 0, 0}; } +# endif // _CCCL_CUDA_COMPILATION() }; _CCCL_GLOBAL_CONSTANT warp_level warp; _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___HIERARCHY_WARP_LEVEL_H diff --git a/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h b/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h index 8fdfae61c6e..5a281a1f1c2 100644 --- a/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h +++ b/libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h @@ -55,8 +55,8 @@ template <::cuda::std::size_t _Align, typename _Group> char* __dest_char, char const* __src_char, ::cuda::std::size_t __size, - ::cuda::std::uint32_t __allowed_completions, - ::cuda::std::uint64_t* __bar_handle) + [[maybe_unused]] ::cuda::std::uint32_t __allowed_completions, + [[maybe_unused]] ::cuda::std::uint64_t* __bar_handle) { ::cuda::__cp_async_fallback_mechanism<_Align>(__group, __dest_char, __src_char, __size); return __completion_mechanism::__sync; @@ -68,8 +68,8 @@ template <::cuda::std::size_t _Align, typename _Group> char* __dest_char, char const* __src_char, ::cuda::std::size_t __size, - ::cuda::std::uint32_t __allowed_completions, - ::cuda::std::uint64_t* __bar_handle) + [[maybe_unused]] ::cuda::std::uint32_t __allowed_completions, + [[maybe_unused]] ::cuda::std::uint64_t* 
__bar_handle) { #if __cccl_ptx_isa >= 800 NV_IF_TARGET( @@ -114,8 +114,8 @@ template <::cuda::std::size_t _Align, typename _Group> char* __dest_char, char const* __src_char, ::cuda::std::size_t __size, - ::cuda::std::uint32_t __allowed_completions, - ::cuda::std::uint64_t* __bar_handle) + [[maybe_unused]] ::cuda::std::uint32_t __allowed_completions, + [[maybe_unused]] ::cuda::std::uint64_t* __bar_handle) { NV_IF_ELSE_TARGET( NV_IS_DEVICE, @@ -149,8 +149,8 @@ template <::cuda::std::size_t _Align, typename _Group> _Group const& __group, char* __dest_char, char const* __src_char, - ::cuda::std::size_t __size, - ::cuda::std::uint32_t __allowed_completions) + [[maybe_unused]] ::cuda::std::size_t __size, + [[maybe_unused]] ::cuda::std::uint32_t __allowed_completions) { _CCCL_ASSERT(!(__allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx)), "Cannot allow mbarrier_complete_tx completion mechanism when not passing a barrier. "); diff --git a/libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h b/libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h index 7c9f69bf8d9..994d557f435 100644 --- a/libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h +++ b/libcudacxx/include/cuda/__memcpy_async/is_local_smem_barrier.h @@ -38,7 +38,7 @@ template > -_CCCL_API inline bool __is_local_smem_barrier(barrier<_Sco, _CompF>& __barrier) +_CCCL_API inline bool __is_local_smem_barrier([[maybe_unused]] barrier<_Sco, _CompF>& __barrier) { NV_IF_ELSE_TARGET( NV_IS_DEVICE, diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h index a9441d0d40e..cd0dcccfc9d 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_completion.h @@ -107,8 +107,8 @@ struct __memcpy_completion_impl } template - [[nodiscard]] _CCCL_API inline static async_contract_fulfillment __defer_non_smem_barrier( - __completion_mechanism __cm, _Group const& __group, ::cuda::std::size_t __size, barrier<_Sco, _CompF>& __barrier) + [[nodiscard]] _CCCL_API inline static async_contract_fulfillment + __defer_non_smem_barrier(__completion_mechanism __cm, _Group const&, ::cuda::std::size_t, barrier<_Sco, _CompF>&) { // Overload for non-smem barriers. switch (__cm) diff --git a/libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h b/libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h index 4fe9b4767b8..439a331c598 100644 --- a/libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h +++ b/libcudacxx/include/cuda/__memcpy_async/try_get_barrier_handle.h @@ -36,7 +36,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA //! @brief __try_get_barrier_handle returns barrier handle of block-scoped barriers and a nullptr otherwise. 
template -_CCCL_API inline ::cuda::std::uint64_t* __try_get_barrier_handle(barrier<_Sco, _CompF>& __barrier) +_CCCL_API inline ::cuda::std::uint64_t* __try_get_barrier_handle(barrier<_Sco, _CompF>&) { return nullptr; } diff --git a/libcudacxx/include/cuda/__memory_pool/device_memory_pool.h b/libcudacxx/include/cuda/__memory_pool/device_memory_pool.h index 9cc447088df..1c8838b0e2b 100644 --- a/libcudacxx/include/cuda/__memory_pool/device_memory_pool.h +++ b/libcudacxx/include/cuda/__memory_pool/device_memory_pool.h @@ -21,18 +21,15 @@ # pragma system_header #endif // no system header -#if _CCCL_CUDA_COMPILER(CLANG) -# include -# include -#endif // _CCCL_CUDA_COMPILER(CLANG) +#if _CCCL_HAS_CTK() -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include -#include +# include //! @file //! The \c device_memory_pool class provides an asynchronous memory resource @@ -161,6 +158,8 @@ static_assert(::cuda::mr::resource_with +# include + +#endif // _CCCL_HAS_CTK() #endif //_CUDA___MEMORY_RESOURCE_DEVICE_MEMORY_POOL_H diff --git a/libcudacxx/include/cuda/__memory_pool/memory_pool_base.h b/libcudacxx/include/cuda/__memory_pool/memory_pool_base.h index d5de7937780..01db9b8fe3b 100644 --- a/libcudacxx/include/cuda/__memory_pool/memory_pool_base.h +++ b/libcudacxx/include/cuda/__memory_pool/memory_pool_base.h @@ -21,23 +21,20 @@ # pragma system_header #endif // no system header -#if _CCCL_CUDA_COMPILER(CLANG) -# include -# include -#endif // _CCCL_CUDA_COMPILER(CLANG) - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#if _CCCL_HAS_CTK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -199,9 +196,9 @@ inline void __verify_device_supports_export_handle_type( return; } if (__location.type != ::CU_MEM_LOCATION_TYPE_DEVICE -#if _CCCL_CTK_AT_LEAST(12, 6) +# if _CCCL_CTK_AT_LEAST(12, 6) && __location.type != ::CU_MEM_LOCATION_TYPE_HOST_NUMA -#endif +# endif ) { ::cuda::__throw_cuda_error(::cudaErrorNotSupported, @@ -222,17 +219,17 @@ __get_default_memory_pool(const CUmemLocation __location, [[maybe_unused]] const auto __device = __location.type == ::CU_MEM_LOCATION_TYPE_DEVICE ? 
__location.id : 0; ::cuda::__verify_device_supports_stream_ordered_allocations(__device); -#if _CCCL_CTK_AT_LEAST(13, 0) +# if _CCCL_CTK_AT_LEAST(13, 0) ::cudaMemPool_t __pool = ::cuda::__driver::__getDefaultMemPool(__location, __allocation_type); if (::cuda::memory_pool_attributes::release_threshold(__pool) == 0) { ::cuda::memory_pool_attributes::release_threshold.set(__pool, ::cuda::std::numeric_limits::max()); } -#else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv +# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv _CCCL_ASSERT(__location.type == ::CU_MEM_LOCATION_TYPE_DEVICE, "Before CUDA 13 only device memory pools have a default"); ::cudaMemPool_t __pool = ::cuda::__driver::__deviceGetDefaultMemPool(__device); -#endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ +# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ return __pool; } @@ -294,27 +291,27 @@ struct memory_pool_properties __pool_properties.handleTypes = ::CUmemAllocationHandleType(__properties.allocation_handle_type); __pool_properties.location = __location; -#if _CCCL_CTK_AT_LEAST(12, 2) +# if _CCCL_CTK_AT_LEAST(12, 2) if (__properties.max_pool_size != 0) { -# if _CCCL_CTK_AT_LEAST(13, 0) +# if _CCCL_CTK_AT_LEAST(13, 0) if (__allocation_type == ::CU_MEM_ALLOCATION_TYPE_MANAGED) { ::cuda::std::__throw_invalid_argument("Max pool size is not supported for managed memory pools"); } -# endif // _CCCL_CTK_AT_LEAST(13, 0) +# endif // _CCCL_CTK_AT_LEAST(13, 0) if (__properties.initial_pool_size > __properties.max_pool_size) { ::cuda::std::__throw_invalid_argument("Initial pool size must be less than the max pool size"); } } __pool_properties.maxSize = __properties.max_pool_size; -#else +# else if (__properties.max_pool_size != 0) { ::cuda::std::__throw_invalid_argument("Max pool size is not supported on this CUDA version"); } -#endif // _CCCL_CTK_AT_LEAST(12, 2) +# endif // _CCCL_CTK_AT_LEAST(12, 2) if (__properties.initial_pool_size > __properties.release_threshold) { @@ -627,18 +624,20 @@ class __memory_pool_base return __pool_ == __rhs.__pool_; } -#if _CCCL_STD_VER <= 2017 +# if _CCCL_STD_VER <= 2017 //! @brief Inequality comparison with another __memory_pool_base. //! @returns true if underlying \c cudaMemPool_t are not equal. [[nodiscard]] _CCCL_HOST_API bool operator!=(__memory_pool_base const& __rhs) const noexcept { return __pool_ != __rhs.__pool_; } -#endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_STD_VER <= 2017 }; _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___MEMORY_RESOURCE_MEMORY_POOL_BASE_H diff --git a/libcudacxx/include/cuda/__memory_pool/pinned_memory_pool.h b/libcudacxx/include/cuda/__memory_pool/pinned_memory_pool.h index 89318fc5920..38b9702e656 100644 --- a/libcudacxx/include/cuda/__memory_pool/pinned_memory_pool.h +++ b/libcudacxx/include/cuda/__memory_pool/pinned_memory_pool.h @@ -21,25 +21,22 @@ # pragma system_header #endif // no system header -#if _CCCL_CUDA_COMPILER(CLANG) -# include -# include -#endif // _CCCL_CUDA_COMPILER(CLANG) +#if _CCCL_HAS_CTK() -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include -#include +# include //! @file //! The \c pinned_memory_resource class provides a memory resource that //! allocates pinned memory. 
_CCCL_BEGIN_NAMESPACE_CUDA -#if _CCCL_CTK_AT_LEAST(12, 6) +# if _CCCL_CTK_AT_LEAST(12, 6) static ::cudaMemPool_t __get_default_host_pinned_pool(); @@ -110,7 +107,7 @@ struct pinned_memory_pool : pinned_memory_pool_ref { using reference_type = pinned_memory_pool_ref; -# if _CCCL_CTK_AT_LEAST(13, 0) +# if _CCCL_CTK_AT_LEAST(13, 0) //! @brief Constructs a \c pinned_memory_pool with optional properties. //! Properties include the initial pool size and the release threshold. If the //! pool size grows beyond the release threshold, unused memory held by the @@ -129,7 +126,7 @@ struct pinned_memory_pool : pinned_memory_pool_ref { enable_access_from(cuda::devices); } -# endif // _CCCL_CTK_AT_LEAST(13, 0) +# endif // _CCCL_CTK_AT_LEAST(13, 0) //! @brief Constructs a \c pinned_memory_pool with the specified NUMA node id //! and optional properties. Properties include the initial pool size and the @@ -190,7 +187,7 @@ static_assert(::cuda::mr::resource_with +# include + +#endif // _CCCL_HAS_CTK() #endif //_CUDA___MEMORY_RESOURCE_PINNED_MEMORY_POOL_H diff --git a/libcudacxx/include/cuda/__memory_resource/any_resource.h b/libcudacxx/include/cuda/__memory_resource/any_resource.h index 97f4e370034..949000c91a2 100644 --- a/libcudacxx/include/cuda/__memory_resource/any_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/any_resource.h @@ -21,19 +21,21 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA_MR -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document this +# ifndef _CCCL_DOXYGEN_INVOKED // Do not document this template using __property_result_t _CCCL_NODEBUG_ALIAS = ::cuda::std::__type_call1< // @@ -376,7 +378,7 @@ synchronous_resource_ref<_Properties...> __as_resource_ref(resource_ref<_Propert return __mr; } -#else // ^^^ !_CCCL_DOXYGEN_INVOKED ^^^ / vvv _CCCL_DOXYGEN_INVOKED vvv +# else // ^^^ !_CCCL_DOXYGEN_INVOKED ^^^ / vvv _CCCL_DOXYGEN_INVOKED vvv enum class _ResourceKind { @@ -821,7 +823,7 @@ using synchronous_resource_ref = basic_resource_ref<_ResourceKind::_Synchronous, template using resource_ref = basic_resource_ref<_ResourceKind::_Asynchronous, _Properties...>; -#endif // _CCCL_DOXYGEN_INVOKED +# endif // _CCCL_DOXYGEN_INVOKED //! @rst //! .. _cudax-memory-resource-make-any-resource: @@ -877,6 +879,8 @@ auto make_any_resource(_Args&&... 
__args) -> any_resource<_Properties...> _CCCL_END_NAMESPACE_CUDA_MR -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___MEMORY_RESOURCE_ANY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/get_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/get_memory_resource.h index f9a2bd5c1ea..3c984f86df3 100644 --- a/libcudacxx/include/cuda/__memory_resource/get_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/get_memory_resource.h @@ -21,15 +21,17 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA_MR @@ -77,6 +79,8 @@ _CCCL_GLOBAL_CONSTANT auto get_memory_resource = get_memory_resource_t{}; _CCCL_END_NAMESPACE_CUDA_MR -#include +# include + +#endif // _CCCL_HAS_CTK() #endif //_CUDA__MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/get_property.h b/libcudacxx/include/cuda/__memory_resource/get_property.h index 4de47677cd9..9623d4f6210 100644 --- a/libcudacxx/include/cuda/__memory_resource/get_property.h +++ b/libcudacxx/include/cuda/__memory_resource/get_property.h @@ -21,12 +21,14 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -148,6 +150,8 @@ using forward_property = __forward_property::__fn<_Derived, _Upstream>; _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif //_CUDA___MEMORY_RESOURCE_GET_PROPERTY_H diff --git a/libcudacxx/include/cuda/__memory_resource/legacy_managed_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/legacy_managed_memory_resource.h index 8db6cffbf54..fe3c7af1686 100644 --- a/libcudacxx/include/cuda/__memory_resource/legacy_managed_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/legacy_managed_memory_resource.h @@ -21,20 +21,18 @@ # pragma system_header #endif // no system header -#if _CCCL_CUDA_COMPILER(CLANG) -# include -#endif // _CCCL_CUDA_COMPILER(CLANG) +#if _CCCL_HAS_CTK() -#include -#include -#include -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include +# include +# include +# include -#include +# include //! @file //! The \c managed_memory_resource class provides a memory resource that allocates managed memory. @@ -107,7 +105,7 @@ class legacy_managed_memory_resource { return __flags_ == __other.__flags_; } -#if _CCCL_STD_VER <= 2017 +# if _CCCL_STD_VER <= 2017 //! @brief Inequality comparison with another \c managed_memory_resource. //! @param __other The other \c managed_memory_resource. //! @return Whether both \c managed_memory_resource were constructed with different flags. @@ -115,7 +113,7 @@ class legacy_managed_memory_resource { return __flags_ != __other.__flags_; } -#endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_STD_VER <= 2017 //! 
@brief Enables the \c device_accessible property _CCCL_HOST_API friend constexpr void @@ -143,6 +141,8 @@ static_assert(::cuda::mr::synchronous_resource_with +# include + +#endif // _CCCL_HAS_CTK() #endif //_CUDA___MEMORY_RESOURCE_LEGACY_MANAGED_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/legacy_pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/legacy_pinned_memory_resource.h index a43ccb24800..911dafb428e 100644 --- a/libcudacxx/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/legacy_pinned_memory_resource.h @@ -21,20 +21,17 @@ # pragma system_header #endif // no system header -#if _CCCL_CUDA_COMPILER(CLANG) -# include -# include -#endif // _CCCL_CUDA_COMPILER(CLANG) +#if _CCCL_HAS_CTK() -#include -#include -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include +# include +# include -#include +# include //! @file //! The \c legacy_pinned_memory_resource class provides a memory resource that allocates pinned memory. @@ -99,7 +96,7 @@ class legacy_pinned_memory_resource { return true; } -#if _CCCL_STD_VER <= 2017 +# if _CCCL_STD_VER <= 2017 //! @brief Equality comparison with another \c legacy_pinned_memory_resource. //! @param __other The other \c legacy_pinned_memory_resource. //! @return Whether both \c legacy_pinned_memory_resource were constructed with different flags. @@ -107,7 +104,7 @@ class legacy_pinned_memory_resource { return false; } -#endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_STD_VER <= 2017 //! @brief Enables the \c device_accessible property _CCCL_HOST_API friend constexpr void @@ -136,6 +133,8 @@ static_assert(::cuda::mr::synchronous_resource_with +# include + +#endif // _CCCL_HAS_CTK() #endif //_CUDA___MEMORY_RESOURCE_LEGACY_PINNED_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/properties.h b/libcudacxx/include/cuda/__memory_resource/properties.h index c6f8ade529b..6eac8634755 100644 --- a/libcudacxx/include/cuda/__memory_resource/properties.h +++ b/libcudacxx/include/cuda/__memory_resource/properties.h @@ -21,11 +21,13 @@ # pragma system_header #endif // no system header -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA_MR @@ -125,6 +127,8 @@ struct __memory_accessability_from_properties _CCCL_END_NAMESPACE_CUDA_MR -#include +# include + +#endif // _CCCL_HAS_CTK() #endif //_CUDA___MEMORY_RESOURCE_PROPERTIES_H diff --git a/libcudacxx/include/cuda/__memory_resource/resource.h b/libcudacxx/include/cuda/__memory_resource/resource.h index 3e169651a95..c9ffeacfc45 100644 --- a/libcudacxx/include/cuda/__memory_resource/resource.h +++ b/libcudacxx/include/cuda/__memory_resource/resource.h @@ -21,19 +21,21 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#if _CCCL_HAS_CTK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA_MR @@ -130,6 +132,8 @@ _CCCL_CONCEPT __non_polymorphic_resources = _CCCL_REQUIRES_EXPR((_Resource, _Oth _CCCL_END_NAMESPACE_CUDA_MR -#include +# include + +#endif // _CCCL_HAS_CTK() #endif //_CUDA___MEMORY_RESOURCE_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/shared_resource.h 
b/libcudacxx/include/cuda/__memory_resource/shared_resource.h index fe01c2e798f..3bfc7d590eb 100644 --- a/libcudacxx/include/cuda/__memory_resource/shared_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/shared_resource.h @@ -21,19 +21,22 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#if _CCCL_HAS_CTK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA_MR + //! @rst //! .. _cudax-memory-resource-shared-resource: //! @@ -254,8 +257,11 @@ auto make_shared_resource(_Args&&... __args) -> shared_resource<_Resource> "_Resource does not satisfy the cuda::mr::synchronous_resource concept"); return shared_resource<_Resource>{::cuda::std::in_place_type<_Resource>, ::cuda::std::forward<_Args>(__args)...}; } + _CCCL_END_NAMESPACE_CUDA_MR -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___MEMORY_RESOURCE_SHARED_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/synchronous_resource_adapter.h b/libcudacxx/include/cuda/__memory_resource/synchronous_resource_adapter.h index 634ad7f7b33..65eecbcbe0f 100644 --- a/libcudacxx/include/cuda/__memory_resource/synchronous_resource_adapter.h +++ b/libcudacxx/include/cuda/__memory_resource/synchronous_resource_adapter.h @@ -21,13 +21,15 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() -#include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA_MR @@ -103,12 +105,12 @@ struct synchronous_resource_adapter return __resource == __rhs.__resource; } -#if _CCCL_STD_VER <= 2017 +# if _CCCL_STD_VER <= 2017 [[nodiscard]] _CCCL_HOST_API bool operator!=(const synchronous_resource_adapter& __rhs) const noexcept { return __resource != __rhs.__resource; } -#endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_STD_VER <= 2017 _CCCL_HOST_API _Resource& upstream_resource() noexcept { @@ -138,6 +140,8 @@ _CCCL_HOST_API decltype(auto) __adapt_if_synchronous(_Resource&& __resource) noe } _CCCL_END_NAMESPACE_CUDA_MR -#include +# include + +#endif // _CCCL_HAS_CTK() #endif //_CUDA___MEMORY_RESOURCE_SYNCHRONOUS_RESOURCE_ADAPTER_H diff --git a/libcudacxx/include/cuda/__stream/internal_streams.h b/libcudacxx/include/cuda/__stream/internal_streams.h index e4f74c9b9b2..6cce1db7c2c 100644 --- a/libcudacxx/include/cuda/__stream/internal_streams.h +++ b/libcudacxx/include/cuda/__stream/internal_streams.h @@ -21,11 +21,11 @@ # pragma system_header #endif // no system header -#include +#if _CCCL_HAS_CTK() -#include +# include -#include +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -44,6 +44,8 @@ inline ::cuda::stream_ref __cccl_allocation_stream() _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA___STREAM_INTERNAL_STREAMS_H diff --git a/libcudacxx/include/cuda/__stream/launch_transform.h b/libcudacxx/include/cuda/__stream/launch_transform.h index d0dbfb7851f..cfe87646650 100644 --- a/libcudacxx/include/cuda/__stream/launch_transform.h +++ b/libcudacxx/include/cuda/__stream/launch_transform.h @@ -21,20 +21,22 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include +#if _CCCL_HAS_CTK() + +# include +# include +# include +# include +# include +# include +# include +# include +# include 
+# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA namespace __detail @@ -188,6 +190,8 @@ using transformed_device_argument_t _CCCL_NODEBUG_ALIAS = _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() #endif // _CUDA__STREAM_LAUNCH_TRANSFORM_H diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index 9d4b990f55d..ef4655f5e3c 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -468,15 +468,15 @@ _CCCL_API inline pipeline make_pipeline() } template -_CCCL_API inline void pipeline_consumer_wait_prior(pipeline& __pipeline) +_CCCL_API inline void pipeline_consumer_wait_prior([[maybe_unused]] pipeline& __pipeline) { NV_IF_TARGET(NV_PROVIDES_SM_80, ::cuda::device::__pipeline_consumer_wait<_Prior>(__pipeline); __pipeline.__tail = __pipeline.__head - _Prior;) } template -_CCCL_API inline void -pipeline_producer_commit([[maybe_unused]] pipeline& __pipeline, barrier<_Scope>& __barrier) +_CCCL_API inline void pipeline_producer_commit([[maybe_unused]] pipeline& __pipeline, + [[maybe_unused]] barrier<_Scope>& __barrier) { NV_IF_TARGET(NV_PROVIDES_SM_80, ((void) __memcpy_completion_impl::__defer( From e80cee274de0fc8af366c71218006631c43d5297 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 19 Dec 2025 08:30:57 -0500 Subject: [PATCH 30/56] [cuda.compute]: Fixes and updates to benchmarks (#6999) * Move algorithm cache to a central registry * Update select benchmark * Update merge_sort benchmark --------- Co-authored-by: Ashwin Srinath --- .../benchmarks/compute/bench_merge_sort.py | 20 +-- .../benchmarks/compute/bench_reduce.py | 20 +-- .../benchmarks/compute/bench_scan.py | 24 +--- .../benchmarks/compute/bench_select.py | 127 ++++++++++++++++++ .../compute/bench_three_way_partition.py | 15 +-- .../benchmarks/compute/bench_transform.py | 35 +---- .../benchmarks/compute/bench_zip_iterator.py | 20 +-- .../cuda_cccl/benchmarks/compute/conftest.py | 6 +- python/cuda_cccl/cuda/compute/__init__.py | 4 +- python/cuda_cccl/cuda/compute/_caching.py | 28 ++++ 10 files changed, 187 insertions(+), 112 deletions(-) create mode 100644 python/cuda_cccl/benchmarks/compute/bench_select.py diff --git a/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py b/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py index f638f9e74b4..608af19a4f3 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py +++ b/python/cuda_cccl/benchmarks/compute/bench_merge_sort.py @@ -91,10 +91,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_merge_sort, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -115,10 +112,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_merge_sort, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -141,10 +135,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_merge_sort, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -167,7 +158,4 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - 
fixture(cuda.compute.make_merge_sort, run) - else: - fixture(run) + fixture(run) diff --git a/python/cuda_cccl/benchmarks/compute/bench_reduce.py b/python/cuda_cccl/benchmarks/compute/bench_reduce.py index 46952a29e78..8bf483702a3 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_reduce.py +++ b/python/cuda_cccl/benchmarks/compute/bench_reduce.py @@ -91,10 +91,7 @@ def run(): reduce_pointer(input_array, build_only=(bench_fixture == "compile_benchmark")) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -109,10 +106,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -127,10 +121,7 @@ def run(): reduce_struct(input_array, build_only=(bench_fixture == "compile_benchmark")) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -141,10 +132,7 @@ def run(): reduce_pointer_custom_op(input_array, build_only=False) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) def bench_reduce_pointer_single_phase(benchmark, size): diff --git a/python/cuda_cccl/benchmarks/compute/bench_scan.py b/python/cuda_cccl/benchmarks/compute/bench_scan.py index 6cd6b7a62bf..95f4e892c06 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_scan.py +++ b/python/cuda_cccl/benchmarks/compute/bench_scan.py @@ -108,13 +108,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - if scan_type == "exclusive": - fixture(cuda.compute.make_exclusive_scan, run) - else: - fixture(cuda.compute.make_inclusive_scan, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("scan_type", ["exclusive", "inclusive"]) @@ -145,13 +139,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - if scan_type == "exclusive": - fixture(cuda.compute.make_exclusive_scan, run) - else: - fixture(cuda.compute.make_inclusive_scan, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("scan_type", ["exclusive", "inclusive"]) @@ -171,13 +159,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - if scan_type == "exclusive": - fixture(cuda.compute.make_exclusive_scan, run) - else: - fixture(cuda.compute.make_inclusive_scan, run) - else: - fixture(run) + fixture(run) def scan_pointer_single_phase(input_array, build_only, scan_type): diff --git a/python/cuda_cccl/benchmarks/compute/bench_select.py b/python/cuda_cccl/benchmarks/compute/bench_select.py new file mode 100644 index 00000000000..7f3f38609f6 --- /dev/null +++ b/python/cuda_cccl/benchmarks/compute/bench_select.py @@ -0,0 +1,127 @@ +import cupy as cp +import numpy as np +import pytest + +import cuda.compute +from cuda.compute import ( + CacheModifiedInputIterator, + gpu_struct, +) + + +def select_pointer(inp, out, num_selected, build_only): + size = len(inp) + + 
def even_op(x): + return x % 2 == 0 + + selector = cuda.compute.make_select(inp, out, num_selected, even_op) + if not build_only: + temp_bytes = selector(None, inp, out, num_selected, size) + temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + selector(temp_storage, inp, out, num_selected, size) + + cp.cuda.runtime.deviceSynchronize() + + +def select_iterator(size, d_in, out, num_selected, build_only): + d_in_iter = CacheModifiedInputIterator(d_in, modifier="stream") + + def less_than_50(x): + return x < 50 + + selector = cuda.compute.make_select(d_in_iter, out, num_selected, less_than_50) + if not build_only: + temp_bytes = selector(None, d_in_iter, out, num_selected, size) + temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + selector(temp_storage, d_in_iter, out, num_selected, size) + + cp.cuda.runtime.deviceSynchronize() + + +@gpu_struct +class Point: + x: np.int32 + y: np.int32 + + +def select_struct(inp, out, num_selected, build_only): + size = len(inp) + + def in_first_quadrant(p: Point) -> np.uint8: + return (p.x > 50) and (p.y > 50) + + selector = cuda.compute.make_select(inp, out, num_selected, in_first_quadrant) + if not build_only: + temp_bytes = selector(None, inp, out, num_selected, size) + temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + selector(temp_storage, inp, out, num_selected, size) + + cp.cuda.runtime.deviceSynchronize() + + +def select_stateful(inp, out, num_selected, threshold_state, build_only): + size = len(inp) + + def threshold_select(x): + return x > threshold_state[0] + + selector = cuda.compute.make_select(inp, out, num_selected, threshold_select) + if not build_only: + temp_bytes = selector(None, inp, out, num_selected, size) + temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + selector(temp_storage, inp, out, num_selected, size) + + cp.cuda.runtime.deviceSynchronize() + + +@pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) +def bench_select_pointer(bench_fixture, request, size): + actual_size = 100 if bench_fixture == "compile_benchmark" else size + inp = cp.random.randint(0, 100, actual_size, dtype=np.int32) + out = cp.empty_like(inp) + num_selected = cp.empty(2, dtype=np.uint64) + + def run(): + select_pointer( + inp, out, num_selected, build_only=(bench_fixture == "compile_benchmark") + ) + + fixture = request.getfixturevalue(bench_fixture) + fixture(run) + + +@pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) +def bench_select_iterator(bench_fixture, request, size): + actual_size = 100 if bench_fixture == "compile_benchmark" else size + d_in = cp.random.randint(0, 100, actual_size, dtype=np.int32) + out = cp.empty(actual_size, dtype=np.int32) + num_selected = cp.empty(2, dtype=np.uint64) + + def run(): + select_iterator( + actual_size, + d_in, + out, + num_selected, + build_only=(bench_fixture == "compile_benchmark"), + ) + + fixture = request.getfixturevalue(bench_fixture) + fixture(run) + + +@pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) +def bench_select_struct(bench_fixture, request, size): + actual_size = 100 if bench_fixture == "compile_benchmark" else size + inp = cp.random.randint(0, 100, (actual_size, 2), dtype=np.int32).view(Point.dtype) + out = cp.empty_like(inp) + num_selected = cp.empty(2, dtype=np.uint64) + + def run(): + select_struct( + inp, out, num_selected, build_only=(bench_fixture == "compile_benchmark") + ) + + fixture = request.getfixturevalue(bench_fixture) + fixture(run) diff --git 
a/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py b/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py index e88b377f138..a9bef4e27a7 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py +++ b/python/cuda_cccl/benchmarks/compute/bench_three_way_partition.py @@ -167,10 +167,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_three_way_partition, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -192,10 +189,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_three_way_partition, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -218,10 +212,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_three_way_partition, run) - else: - fixture(run) + fixture(run) def three_way_partition_pointer_single_phase(inp): diff --git a/python/cuda_cccl/benchmarks/compute/bench_transform.py b/python/cuda_cccl/benchmarks/compute/bench_transform.py index e17044afd00..79c1cce4e77 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_transform.py +++ b/python/cuda_cccl/benchmarks/compute/bench_transform.py @@ -117,10 +117,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_unary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -135,10 +132,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_unary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -154,10 +148,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_unary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -174,10 +165,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_binary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -192,10 +180,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_binary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -212,10 +197,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_binary_transform, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -231,7 +213,4 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_binary_transform, run) - else: - fixture(run) + fixture(run) diff --git a/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py 
b/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py index 41387086a9e..c9e27dcd5ae 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py +++ b/python/cuda_cccl/benchmarks/compute/bench_zip_iterator.py @@ -106,10 +106,7 @@ def run(): reduce_zip_array(input_array, build_only=(bench_fixture == "compile_benchmark")) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -122,10 +119,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -138,10 +132,7 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_reduce_into, run) - else: - fixture(run) + fixture(run) @pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) @@ -163,7 +154,4 @@ def run(): ) fixture = request.getfixturevalue(bench_fixture) - if bench_fixture == "compile_benchmark": - fixture(cuda.compute.make_binary_transform, run) - else: - fixture(run) + fixture(run) diff --git a/python/cuda_cccl/benchmarks/compute/conftest.py b/python/cuda_cccl/benchmarks/compute/conftest.py index 0e4f9c73829..981b203b7b4 100644 --- a/python/cuda_cccl/benchmarks/compute/conftest.py +++ b/python/cuda_cccl/benchmarks/compute/conftest.py @@ -1,5 +1,7 @@ import pytest +import cuda.compute + @pytest.fixture(params=[True, False]) def build_only(request): @@ -13,11 +15,11 @@ def size(request): @pytest.fixture def compile_benchmark(benchmark): - def run_compile_benchmark(algorithm, function): + def run_compile_benchmark(function): def setup(): # This function is called once before the benchmark runs # to set up the environment. - algorithm.cache_clear() + cuda.compute.clear_all_caches() benchmark.pedantic( function, diff --git a/python/cuda_cccl/cuda/compute/__init__.py b/python/cuda_cccl/cuda/compute/__init__.py index 71f0ad70f4b..854beee17ba 100644 --- a/python/cuda_cccl/cuda/compute/__init__.py +++ b/python/cuda_cccl/cuda/compute/__init__.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from ._caching import clear_all_caches from .algorithms import ( DoubleBuffer, SortOrder, @@ -49,13 +50,13 @@ __all__ = [ "binary_transform", + "clear_all_caches", "CacheModifiedInputIterator", "ConstantIterator", "CountingIterator", "DiscardIterator", "DoubleBuffer", "exclusive_scan", - "select", "gpu_struct", "histogram_even", "inclusive_scan", @@ -81,6 +82,7 @@ "ReverseIterator", "segmented_reduce", "segmented_sort", + "select", "SortOrder", "TransformIterator", "TransformOutputIterator", diff --git a/python/cuda_cccl/cuda/compute/_caching.py b/python/cuda_cccl/cuda/compute/_caching.py index 0443f38c0ea..ad675c05341 100644 --- a/python/cuda_cccl/cuda/compute/_caching.py +++ b/python/cuda_cccl/cuda/compute/_caching.py @@ -10,6 +10,9 @@ except ImportError: from cuda.core.experimental import Device +# Central registry of all algorithm caches +_cache_registry: dict[str, object] = {} + def cache_with_key(key): """ @@ -21,6 +24,9 @@ def cache_with_key(key): ----- The CUDA compute capability of the current device is appended to the cache key returned by `key`. 
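# A minimal sketch, not part of the diff above: it only assumes the
# cache_with_key decorator, the central _cache_registry, and clear_all_caches
# introduced in this patch. `build_my_algorithm` and `_my_key` are hypothetical
# stand-ins for a real builder and its key function, not actual cuda.compute APIs.
from cuda.compute._caching import cache_with_key  # private module added here
import cuda.compute

def _my_key(d_in, d_out):
    # Key on dtypes only; the decorator appends the current device's
    # compute capability to whatever this returns.
    return (d_in.dtype.str, d_out.dtype.str)

@cache_with_key(_my_key)
def build_my_algorithm(d_in, d_out):
    ...  # expensive build/compile step, cached per key

# Calls with equal keys reuse the cached build. Clearing the central registry,
# as the compile_benchmark fixture in conftest.py now does in its setup step,
# forces recompilation on the next invocation.
cuda.compute.clear_all_caches()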
+ + The decorated function is automatically registered in the central + cache registry for easy cache management. """ def deco(func): @@ -39,11 +45,33 @@ def cache_clear(): cache.clear() inner.cache_clear = cache_clear + + # Register the cache in the central registry + cache_name = func.__qualname__ + _cache_registry[cache_name] = inner + return inner return deco +def clear_all_caches(): + """ + Clear all algorithm caches. + + This function clears all cached algorithm build results, forcing + recompilation on the next invocation. Useful for benchmarking + compilation time. + + Example + ------- + >>> import cuda.compute + >>> cuda.compute.clear_all_caches() + """ + for cached_func in _cache_registry.values(): + cached_func.cache_clear() + + class CachableFunction: """ A type that wraps a function and provides custom comparison From d91b7112f79282569f54b1ecb9be6bf982778768 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 19 Dec 2025 09:44:36 -0500 Subject: [PATCH 31/56] Support operations with side-effects (state) in `cuda.compute` (#7008) * Move algorithm cache to a central registry * Add bench_select.py * Add tests for stateful select and transform * For the purposes of caching, hash DeviceArrayLike objects by pointer, shape, and dtype * Update select benchmark * Bump numba-cuda dependency to 0.23.0 * Add select example * Lint * Remove duplicate cache registry --------- Co-authored-by: Ashwin Srinath --- .../benchmarks/compute/bench_select.py | 21 +++ python/cuda_cccl/cuda/compute/_caching.py | 40 +++++- python/cuda_cccl/pyproject.toml | 5 +- .../select/select_with_side_effect.py | 48 +++++++ python/cuda_cccl/tests/compute/test_select.py | 123 ++++++++++++++++++ .../cuda_cccl/tests/compute/test_transform.py | 24 ++++ 6 files changed, 251 insertions(+), 10 deletions(-) create mode 100644 python/cuda_cccl/tests/compute/examples/select/select_with_side_effect.py diff --git a/python/cuda_cccl/benchmarks/compute/bench_select.py b/python/cuda_cccl/benchmarks/compute/bench_select.py index 7f3f38609f6..8d284c63a74 100644 --- a/python/cuda_cccl/benchmarks/compute/bench_select.py +++ b/python/cuda_cccl/benchmarks/compute/bench_select.py @@ -125,3 +125,24 @@ def run(): fixture = request.getfixturevalue(bench_fixture) fixture(run) + + +@pytest.mark.parametrize("bench_fixture", ["compile_benchmark", "benchmark"]) +def bench_select_stateful(bench_fixture, request, size): + actual_size = 100 if bench_fixture == "compile_benchmark" else size + inp = cp.random.randint(0, 100, actual_size, dtype=np.int32) + out = cp.empty_like(inp) + num_selected = cp.empty(2, dtype=np.uint64) + threshold_state = cp.array([50], dtype=np.int32) + + def run(): + select_stateful( + inp, + out, + num_selected, + threshold_state, + build_only=(bench_fixture == "compile_benchmark"), + ) + + fixture = request.getfixturevalue(bench_fixture) + fixture(run) diff --git a/python/cuda_cccl/cuda/compute/_caching.py b/python/cuda_cccl/cuda/compute/_caching.py index ad675c05341..7c6bcd0ec02 100644 --- a/python/cuda_cccl/cuda/compute/_caching.py +++ b/python/cuda_cccl/cuda/compute/_caching.py @@ -10,6 +10,7 @@ except ImportError: from cuda.core.experimental import Device + # Central registry of all algorithm caches _cache_registry: dict[str, object] = {} @@ -55,6 +56,33 @@ def cache_clear(): return deco +def _hash_device_array_like(value): + # hash based on pointer, shape, and dtype + ptr = value.__cuda_array_interface__["data"][0] + shape = value.__cuda_array_interface__["shape"] + dtype = 
value.__cuda_array_interface__["typestr"] + return hash((ptr, shape, dtype)) + + +def _make_hashable(value): + import numba.cuda.dispatcher + + from .typing import DeviceArrayLike + + if isinstance(value, numba.cuda.dispatcher.CUDADispatcher): + return CachableFunction(value.py_func) + elif isinstance(value, DeviceArrayLike): + return _hash_device_array_like(value) + elif isinstance(value, (list, tuple)): + return tuple(_make_hashable(v) for v in value) + elif isinstance(value, dict): + return tuple( + sorted((_make_hashable(k), _make_hashable(v)) for k, v in value.items()) + ) + else: + return id(value) + + def clear_all_caches(): """ Clear all algorithm caches. @@ -83,8 +111,6 @@ class CachableFunction: """ def __init__(self, func): - import numba.cuda.dispatcher - self._func = func closure = func.__closure__ if func.__closure__ is not None else [] @@ -92,16 +118,16 @@ def __init__(self, func): # if any of the contents is a numba.cuda.dispatcher.CUDADispatcher # use the function for caching purposes: for cell in closure: - if isinstance(cell.cell_contents, numba.cuda.dispatcher.CUDADispatcher): - contents.append(CachableFunction(cell.cell_contents.py_func)) - else: - contents.append(cell.cell_contents) + contents.append(_make_hashable(cell.cell_contents)) self._identity = ( func.__name__, func.__code__.co_code, func.__code__.co_consts, tuple(contents), - tuple(func.__globals__.get(name, None) for name in func.__code__.co_names), + tuple( + _make_hashable(func.__globals__.get(name, None)) + for name in func.__code__.co_names + ), ) def __eq__(self, other): diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index e414dfda371..a401fbe4bcd 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -31,7 +31,6 @@ dependencies = [ "numpy", "cuda-pathfinder>=1.2.3", "cuda-core", - "numba-cuda>=0.20.0,!=0.21.2", "typing_extensions", ] @@ -42,12 +41,12 @@ readme = { file = "README.md", content-type = "text/markdown" } cu12 = [ "cuda-bindings>=12.9.1,<13.0.0", "cuda-toolkit[nvrtc,nvjitlink,cudart,nvcc]==12.*", - "numba-cuda[cu12]>=0.20.0,!=0.21.2", + "numba-cuda[cu12]>=0.23.0", ] cu13 = [ "cuda-bindings>=13.0.0,<14.0.0", "cuda-toolkit[nvrtc,nvjitlink,cudart,nvcc,nvvm]==13.*", - "numba-cuda[cu13]>=0.20.0,!=0.21.2", + "numba-cuda[cu13]>=0.23.0", ] test-cu12 = [ # an undocumented way to inherit the dependencies of the cu12 extra. diff --git a/python/cuda_cccl/tests/compute/examples/select/select_with_side_effect.py b/python/cuda_cccl/tests/compute/examples/select/select_with_side_effect.py new file mode 100644 index 00000000000..a39f28d3bb4 --- /dev/null +++ b/python/cuda_cccl/tests/compute/examples/select/select_with_side_effect.py @@ -0,0 +1,48 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
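# A small sketch, not part of the new example file below (the CuPy array is
# purely illustrative): it shows the identity that _hash_device_array_like
# derives from the CUDA Array Interface, namely pointer, shape, and typestr.
# Two stateful ops capturing the same device array therefore hash identically,
# while a different allocation, shape, or dtype yields a new key and a rebuild.
import cupy as cp
import numpy as np

state = cp.zeros(1, dtype=np.int32)
cai = state.__cuda_array_interface__
key = (cai["data"][0], cai["shape"], cai["typestr"])  # (pointer, shape, dtype)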
+# +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# example-begin +import cupy as cp +from numba import cuda as numba_cuda + +from cuda.compute.algorithms import select + +# Create input data: values 0 to 99 +d_in = cp.arange(100, dtype=cp.int32) +d_out = cp.empty_like(d_in) +d_num_selected = cp.empty(1, dtype=cp.uint64) + +# Counter for rejected items (side effect state) +reject_count = cp.zeros(1, dtype=cp.int32) + + +# Define condition that counts rejected items as a side effect +def count_rejects(x): + if x % 2 == 0: + return True + else: + numba_cuda.atomic.add(reject_count, 0, 1) + return False + + +# Execute select - selects even numbers, counts rejections +select(d_in, d_out, d_num_selected, count_rejects, len(d_in)) + +# Get results +num_selected = int(d_num_selected.get()[0]) +num_rejected = int(reject_count.get()[0]) +result = d_out[:num_selected].get() + +print(f"Selected {num_selected} items (values % 2 == 0)") +print(f"Rejected {num_rejected} items (values % 2 != 0)") +print(f"First 5 selected: {result[:5]}") +# Output: +# Selected 50 items (even numbers) +# Rejected 50 items (odd numbers) +# First 5 selected: [0 2 4 6 8] +# example-end + +assert num_selected == 50 # Even numbers +assert num_rejected == 50 # Odd numbers diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py index b6b8f311f53..e0c2e8f2c14 100644 --- a/python/cuda_cccl/tests/compute/test_select.py +++ b/python/cuda_cccl/tests/compute/test_select.py @@ -420,3 +420,126 @@ def condition(pair): expected_count = np.sum(h_sums < 70) assert num_selected == expected_count + + +def test_select_stateful_threshold(): + """Test stateful select that uses state for threshold""" + num_items = 1000 + h_in = random_array(num_items, np.int32, max_value=100) + + # Create device state containing threshold value + threshold_value = 50 + threshold_state = cp.array([threshold_value], dtype=np.int32) + + # Define condition that references state as closure + def threshold_select(x): + return x > threshold_state[0] + + d_in = cp.asarray(h_in) + d_out = cp.empty_like(d_in) + d_num_selected = cp.empty(2, dtype=np.uint64) + + cuda.compute.select( + d_in, + d_out, + d_num_selected, + threshold_select, + num_items, + ) + + # Check selected output + num_selected = int(d_num_selected[0].get()) + got = d_out.get()[:num_selected] + + # Verify all output values are > threshold + assert np.all(got > threshold_value) + + # Verify we got the expected number of items + expected_selected = h_in[h_in > threshold_value] + expected_count = len(expected_selected) + + assert num_selected == expected_count + + # Verify exact results + assert np.array_equal(got, expected_selected) + + +def test_select_stateful_atomic(): + """Test stateful select with atomic operations to count rejected items""" + from numba import cuda as numba_cuda + + num_items = 1000 + h_in = random_array(num_items, np.int32, max_value=100) + + # Create device state for counting rejected items + reject_counter = cp.zeros(1, dtype=np.int32) + + # Define condition that references state as closure + def count_rejects(x): + if x > 50: + return True + else: + numba_cuda.atomic.add(reject_counter, 0, 1) + return False + + d_in = cp.asarray(h_in) + d_out = cp.empty_like(d_in) + d_num_selected = cp.empty(2, dtype=np.uint64) + + cuda.compute.select( + d_in, + d_out, + d_num_selected, + count_rejects, + num_items, + ) + + # Check selected output + num_selected = int(d_num_selected[0].get()) + got = d_out.get()[:num_selected] + + 
# Verify all output values are > 50 + assert np.all(got > 50) + + # Verify we got the expected number of items + expected_selected = h_in[h_in > 50] + expected_count = len(expected_selected) + + assert num_selected == expected_count + + # Verify exact results + assert np.array_equal(got, expected_selected) + + # Verify state contains count of rejected items + rejected_count = int(reject_counter[0].get()) + expected_rejected = len(h_in[h_in <= 50]) + assert rejected_count == expected_rejected, ( + f"Expected {expected_rejected} rejections, got {rejected_count}" + ) + + +def test_select_with_side_effect_counting_rejects(): + """Select with side effect that counts rejected items""" + from numba import cuda as numba_cuda + + d_in = cp.arange(100, dtype=np.int32) + d_out = cp.empty_like(d_in) + d_num_selected = cp.empty(1, dtype=np.uint64) + + reject_count = cp.zeros(1, dtype=np.int32) + + # Define condition that references state as closure + def count_rejects(x): + if x >= 50: + return True + else: + numba_cuda.atomic.add(reject_count, 0, 1) + return False + + cuda.compute.select(d_in, d_out, d_num_selected, count_rejects, len(d_in)) + + num_selected = int(d_num_selected.get()[0]) + num_rejected = int(reject_count.get()[0]) + + assert num_selected == 50 # Values 50-99 + assert num_rejected == 50 # Values 0-49 diff --git a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py index dc3aceabcb3..a4712e46249 100644 --- a/python/cuda_cccl/tests/compute/test_transform.py +++ b/python/cuda_cccl/tests/compute/test_transform.py @@ -380,3 +380,27 @@ def add_vectors(v1: Vec2D, v2: Vec2D) -> Vec2D: np.testing.assert_equal(result["x"], h_in1["x"] + h_in2["x"]) np.testing.assert_equal(result["y"], h_in1["y"] + h_in2["y"]) + + +def test_unary_transform_stateful_counting(): + """Test unary_transform with state that counts even numbers.""" + from numba import cuda as numba_cuda + + d_in = cp.arange(100, dtype=np.int32) + d_out = cp.empty_like(d_in) + + even_count = cp.zeros(1, dtype=np.int32) + + # Define op that references state as closure + def count_evens(x): + if x % 2 == 0: + numba_cuda.atomic.add(even_count, 0, 1) + return x * 2 + + cuda.compute.unary_transform(d_in, d_out, count_evens, len(d_in)) + + expected_output = cp.arange(100, dtype=np.int32) * 2 + np.testing.assert_array_equal(d_out.get(), expected_output.get()) + + num_evens = int(even_count.get()[0]) + assert num_evens == 50 # 0, 2, 4, ..., 98 From c40c68d8a39392e29e6ea899219bca7323df867e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 19 Dec 2025 16:11:15 +0100 Subject: [PATCH 32/56] Fix `cuda::memcpy async` edge cases and add more tests (#6608) --- .../asynchronous_operations/memcpy_async.rst | 54 +++++- .../cp_async_bulk_shared_global.h | 10 +- .../include/cuda/__memcpy_async/elect_one.h | 25 ++- .../cuda/__memcpy_async/group_traits.h | 61 +++++++ libcudacxx/include/cuda/pipeline | 10 ++ .../cuda/memcpy_async/group_memcpy_async.h | 100 +++++++---- .../group_memcpy_async_16b.pass.cpp | 6 +- .../group_memcpy_async_32b.pass.cpp | 6 +- .../group_memcpy_async_64b.pass.cpp | 6 +- .../memcpy_async/memcpy_async_block.pass.cpp | 155 ++++++++++++++++++ 10 files changed, 381 insertions(+), 52 deletions(-) create mode 100644 libcudacxx/include/cuda/__memcpy_async/group_traits.h create mode 100644 libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_block.pass.cpp diff --git a/docs/libcudacxx/extended_api/asynchronous_operations/memcpy_async.rst 
b/docs/libcudacxx/extended_api/asynchronous_operations/memcpy_async.rst
index e2d4cf261ab..565e501efe7 100644
--- a/docs/libcudacxx/extended_api/asynchronous_operations/memcpy_async.rst
+++ b/docs/libcudacxx/extended_api/asynchronous_operations/memcpy_async.rst
@@ -67,13 +67,13 @@
 memory location pointed to by ``source`` to the memory location pointed to by
 ``destination``. Both objects are reinterpreted as arrays of ``unsigned char``.
 
-1. Binds the asynchronous copy completion to ``cuda::barrier`` and
+1. Non-group version. Binds the asynchronous copy completion to ``cuda::barrier`` and
    issues the copy in the current thread.
-2. Binds the asynchronous copy completion to ``cuda::barrier`` and
+2. Group version. Binds the asynchronous copy completion to ``cuda::barrier`` and
    cooperatively issues the copy across all threads in ``group``.
-3. Binds the asynchronous copy completion to ``cuda::pipeline`` and
+3. Non-group version. Binds the asynchronous copy completion to ``cuda::pipeline`` and
    issues the copy in the current thread.
-4. Binds the asynchronous copy completion to ``cuda::pipeline`` and
+4. Group version. Binds the asynchronous copy completion to ``cuda::pipeline`` and
    cooperatively issues the copy across all threads in ``group``.
 5. 5-8: convenience wrappers using ``cuda::annotated_ptr`` where ``Sync`` is either
    ``cuda::barrier`` or ``cuda::pipeline``.
@@ -91,14 +91,25 @@ namely:
   the behavior is undefined.
 - If the objects are not of `TriviallyCopyable `_ type
   the program is ill-formed, no diagnostic required.
+
+Additionally:
+
 - If *Shape* is :ref:`cuda::aligned_size_t `, ``source`` and ``destination`` are both
   required to be aligned on ``cuda::aligned_size_t::align``, else the behavior is undefined.
 - If ``cuda::pipeline`` is in a *quitted state* (see :ref:`cuda::pipeline::quit `),
   the behavior is undefined.
- - For cooperative variants, if the parameters are not the same across all threads in ``group``, the behavior is
-   undefined.
+ - For cooperative overloads (with a group parameter),
+   if the parameters are not the same across all threads in ``group``,
+   or not all threads represented by ``group`` call the overload, the behavior is undefined.
+ - The group of a cooperative overload can also represent a partition of the active threads calling the overload,
+   in which case a copy is cooperatively issued per partition of the active threads described by ``group``.
+   For example, if ``group`` is a ``cooperative_groups::thread_block_tile<32, ...>``
+   and the overload is called with 128 threads active, 4 copies will be issued, one cooperatively per warp.
+ - If a non-group overload is called with multiple threads active,
+   each thread issues its own copy and thus must have different arguments and the copies must not overlap.
+
 Template Parameters
 -------------------
 
@@ -133,6 +144,26 @@ Parameters
   * - ``pipeline``
     - The pipeline object used to wait on the copy completion.
 
+Related traits
+--------------
+
+.. code:: cuda
+
+   template
+   constexpr inline bool is_thread_block_group_v;
+
+This trait is ``true`` if ``Group`` represents the full CUDA thread block.
+For example, ``cooperative_groups::thread_block`` satisfies this trait.
+Users are encouraged to specialize this trait for their own groups.
+
+.. code:: cuda
+
+   template
+   constexpr inline bool is_warp_group_v = false;
+
+This trait is ``true`` if ``Group`` represents a full CUDA warp.
+For example, ``cooperative_groups::thread_block_tile<32, ...>`` satisfies this trait.
+Users are encouraged to specialize this trait for their own groups.
 
 Implementation notes
 --------------------
@@ -143,6 +174,9 @@ via the ``cp.async.bulk`` instruction to perform the copy if:
 - the data is aligned to 16 bytes,
 - the source is global memory,
 - the destination is shared memory.
+Additionally, the cooperative overload (taking a group) can generate more efficient code
+if the group satisfies the trait ``cuda::is_thread_block_group_v`` or ``cuda::is_warp_group_v``.
+In those cases, a uniform data path is generated for the bulk copy and thread peeling is avoided.
 
 On Ampere+ GPUs, the ``cp.async`` instruction may be used to perform the copy if:
 - the data is aligned to at least 4 bytes,
@@ -233,4 +267,12 @@ a custom group can be defined like:
     }
   };
 
+  template <>
+  inline constexpr bool cuda::is_thread_block_group_v = true;
+
+Such a group will emit the least amount of code when used with ``cuda::memcpy_async``,
+since the ``thread_rank()`` is easily computed (because the block is 1D)
+and we declared the group as representing the whole thread block,
+which allows emitting a uniform data path on Hopper+ GPUs in certain conditions.
+
 `See it on Godbolt `__
diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h
index 6e43d77ccdd..e8a1f2cec9d 100644
--- a/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h
+++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h
@@ -25,8 +25,8 @@
 #if _CCCL_CUDA_COMPILATION()
 # if __cccl_ptx_isa >= 800
+# include
 # include
-# include
 # include
 # include
 # include
@@ -39,12 +39,6 @@ _CCCL_BEGIN_NAMESPACE_CUDA
 
-template
-[[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE bool __elect_from_group(const _Group& __g) noexcept
-{
-  return __g.thread_rank() == 0;
-}
-
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
 
 template
 inline _CCCL_DEVICE void __cp_async_bulk_shared_global_and_expect_tx(
@@ -53,7 +47,7 @@ inline _CCCL_DEVICE void __cp_async_bulk_shared_global_and_expect_tx(
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
   NV_IF_ELSE_TARGET(
     NV_PROVIDES_SM_90,
-    (if (__elect_from_group(__g)) {
+    (if (::cuda::device::__group_elect_one(__g)) {
       ::cuda::ptx::cp_async_bulk(
         ::cuda::std::conditional_t<__cccl_ptx_isa >= 860, ::cuda::ptx::space_shared_t, ::cuda::ptx::space_cluster_t>{},
         ::cuda::ptx::space_global,
diff --git a/libcudacxx/include/cuda/__memcpy_async/elect_one.h b/libcudacxx/include/cuda/__memcpy_async/elect_one.h
index 46e2686a5cc..ee6dc1c6a34 100644
--- a/libcudacxx/include/cuda/__memcpy_async/elect_one.h
+++ b/libcudacxx/include/cuda/__memcpy_async/elect_one.h
@@ -22,10 +22,13 @@
 # pragma system_header
 #endif // no system header
 
+#include
 #include
 #include
 
+#if _CCCL_CUDA_COMPILATION()
+
 _CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
 
 //! Elects a single leader thread from a one dimensional thread block.
For SM90+ will use ptx::elect_sync() etc., @@ -39,12 +42,30 @@ _CCCL_BEGIN_NAMESPACE_CUDA_DEVICE NV_PROVIDES_SM_90, (const auto tid = threadIdx.x; // const auto warp_id = tid / 32; - const auto uniform_warp_id = __shfl_sync(~0, warp_id, 0); // broadcast from lane 0 - return uniform_warp_id == 0 && cuda::ptx::elect_sync(~0); // elect a leader thread among warp 0 + const auto uniform_warp_id = ::__shfl_sync(~0, warp_id, 0); // broadcast from lane 0 + return uniform_warp_id == 0 && ::cuda::ptx::elect_sync(~0); // elect a leader thread among warp 0 ), (return threadIdx.x == 0;)); } +template +[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool __group_elect_one(const _Group& __g) noexcept +{ + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + if constexpr (is_thread_block_group_v<_Group>) { + // cooperative groups maps a multidimensional thread id into the thread rank the same way as warps do + const unsigned __tid = __g.thread_rank(); + const unsigned __warp_id = __tid / 32; + const unsigned __uniform_warp_id = ::__shfl_sync(~0, __warp_id, 0); // broadcast from lane 0 + return __uniform_warp_id == 0 && ::cuda::ptx::elect_sync(~0); // elect a leader thread among warp 0 + } else if constexpr (is_warp_group_v<_Group>) { return ::cuda::ptx::elect_sync(~0); })); + + return __g.thread_rank() == 0; +} + +#endif // _CCCL_CUDA_COMPILATION() + _CCCL_END_NAMESPACE_CUDA_DEVICE #include diff --git a/libcudacxx/include/cuda/__memcpy_async/group_traits.h b/libcudacxx/include/cuda/__memcpy_async/group_traits.h new file mode 100644 index 00000000000..e984dd4ca22 --- /dev/null +++ b/libcudacxx/include/cuda/__memcpy_async/group_traits.h @@ -0,0 +1,61 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___MEMCPY_ASYNC_GROUP_TRAITS_H_ +#define _CUDA___MEMCPY_ASYNC_GROUP_TRAITS_H_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +// forward declare cooperative groups types. we cannot include since it does not work with NVHPC +namespace cooperative_groups +{ +namespace __v1 +{ +class thread_block; + +template +class thread_block_tile; +} // namespace __v1 +using namespace __v1; +} // namespace cooperative_groups + +_CCCL_BEGIN_NAMESPACE_CUDA + +//! Trait to detect whether a group represents a CUDA thread block, for example: ``cooperative_groups::thread_block``. +template +inline constexpr bool is_thread_block_group_v = false; + +template <> +inline constexpr bool is_thread_block_group_v<::cooperative_groups::thread_block> = true; + +//! Trait to detect whether a group represents a CUDA warp, for example: +//! ``cooperative_groups::thread_block_tile<32, ...>``. 
+template +inline constexpr bool is_warp_group_v = false; + +template +inline constexpr bool is_warp_group_v<::cooperative_groups::thread_block_tile<32, _Parent>> = true; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___MEMCPY_ASYNC_GROUP_TRAITS_H_ diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index ef4655f5e3c..e100d6aa3b5 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -135,6 +135,11 @@ public: return __released; } + _CCCL_API inline bool __is_active() const + { + return __active; + } + _CCCL_API inline void producer_acquire() { barrier<_Scope>& __stage_barrier = __shared_state_get_stage(__head)->__consumed; @@ -487,6 +492,11 @@ template _CCCL_API inline async_contract_fulfillment __memcpy_async_pipeline( _Group const& __group, _Tp* __destination, _Tp const* __source, _Size __size, pipeline<_Scope>& __pipeline) { + if constexpr (_Scope != thread_scope_thread) + { + _CCCL_ASSERT(__pipeline.__is_active(), "The pipeline used for memcpy_async must be active (not quitted)"); + } + // 1. Set the completion mechanisms that can be used. // // Do not (yet) allow async_bulk_group completion. Do not allow diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async.h b/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async.h index 2ea2236a3ed..f1eef0def2d 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async.h +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async.h @@ -11,6 +11,7 @@ // UNSUPPORTED: pre-sm-70 #include +#include #include @@ -18,11 +19,14 @@ namespace cg = cooperative_groups; +inline constexpr int thread_block_size = 64; + template struct storage { // A prime to avoid accidental alignment of the size with smaller element types. 
- constexpr static int size = 61; + constexpr static int size = 67; + static_assert(size >= thread_block_size); __host__ __device__ storage(T val = 0) { @@ -65,10 +69,10 @@ struct storage }; #if !TEST_COMPILER(NVRTC) && !TEST_COMPILER(CLANG) -static_assert(std::is_trivially_copy_constructible>::value, ""); -static_assert(std::is_trivially_copy_constructible>::value, ""); -static_assert(std::is_trivially_copy_constructible>::value, ""); -static_assert(std::is_trivially_copy_constructible>::value, ""); +static_assert(cuda::std::is_trivially_copy_constructible_v>, ""); +static_assert(cuda::std::is_trivially_copy_constructible_v>, ""); +static_assert(cuda::std::is_trivially_copy_constructible_v>, ""); +static_assert(cuda::std::is_trivially_copy_constructible_v>, ""); #endif template __device__ __noinline__ void test_fully_specialized() { - SourceSelector source_sel; - typename DestSelector::template offsetted dest_sel; + SourceSelector, constructor_initializer> source_sel; + typename DestSelector, constructor_initializer>::template offsetted + dest_sel; BarrierSelector, constructor_initializer> bar_sel; - __shared__ T* source; - __shared__ T* dest; + __shared__ storage* source; + __shared__ storage* dest; __shared__ cuda::barrier* bar; - source = source_sel.construct(static_cast(12)); - dest = dest_sel.construct(static_cast(0)); - bar = bar_sel.construct(4); - + source = source_sel.construct(static_cast>(12)); + dest = dest_sel.construct(static_cast>(0)); + bar = bar_sel.construct(+thread_block_size); assert(*source == 12); assert(*dest == 0); - cuda::memcpy_async(cg::this_thread_block(), dest, source, sizeof(T), *bar); - + // test normal version + cuda::memcpy_async(cg::this_thread_block(), dest, source, sizeof(storage), *bar); bar->arrive_and_wait(); - - assert(*source == 12); assert(*dest == 12); + // prepare source if (cg::this_thread_block().thread_rank() == 0) { *source = 24; } cg::this_thread_block().sync(); + assert(*source == 24); - cuda::memcpy_async(cg::this_thread_block(), static_cast(dest), static_cast(source), sizeof(T), *bar); + // test void* overload and use just warp 1 for copy + auto warps = cg::tiled_partition<32>(cg::this_thread_block()); + if (warps.meta_group_rank() == 1) + { + assert(threadIdx.x >= 32 && threadIdx.x < 64); + static_assert(thread_block_size >= 64); + cuda::memcpy_async(warps, static_cast(dest), static_cast(source), sizeof(storage), *bar); + } + bar->arrive_and_wait(); + assert(*dest == 24); + // prepare source + if (cg::this_thread_block().thread_rank() == 0) + { + *source = 48; + } + cg::this_thread_block().sync(); + assert(*source == 48); + + // use 2 groups of 4 threads to copy 8 items each, but spread them 16 bytes + auto tiled_groups = cg::tiled_partition<4>(cg::this_thread_block()); + if (threadIdx.x < 8) + { + static_assert(thread_block_size >= 8); + printf("%u copying 8 items at meta group rank %u\n", threadIdx.x, tiled_groups.meta_group_rank()); + cuda::memcpy_async( + tiled_groups, + &dest->data[tiled_groups.meta_group_rank() * 16], + &source->data[tiled_groups.meta_group_rank() * 16], + sizeof(T) * 8, + *bar); + } bar->arrive_and_wait(); - assert(*source == 24); - assert(*dest == 24); + for (int i = 0; i < 8; ++i) + { + assert(dest->data[i + 0] == static_cast(48 + (i + 0))); // 8 copied items from first group + assert(dest->data[i + 8] == static_cast(24 + (i + 8))); // 8 untouched items between + assert(dest->data[i + 16] == static_cast(48 + (i + 16))); // 8 copied items from second group + } + for (int i = 24; i < storage::size; ++i) + { + 
assert(dest->data[i] == static_cast(24 + i)); // untouched items afterwards + } } struct completion { - __host__ __device__ void operator()() const {} + __device__ void operator()() const {} }; template class SourceSelector, template class DestSelector, template class BarrierSelector> -__host__ __device__ __noinline__ void test_select_scope() +__device__ __noinline__ void test_select_scope() { test_fully_specialized(); test_fully_specialized(); @@ -138,19 +180,17 @@ __host__ __device__ __noinline__ void test_select_scope() } template class SourceSelector, template class DestSelector> -__host__ __device__ __noinline__ void test_select_barrier() +__device__ __noinline__ void test_select_barrier() { - NV_IF_TARGET(NV_IS_DEVICE, - (test_select_scope(); - test_select_scope();)) + test_select_scope(); + test_select_scope(); } template class SourceSelector> -__host__ __device__ __noinline__ void test_select_destination() +__device__ __noinline__ void test_select_destination() { - NV_IF_TARGET(NV_IS_DEVICE, - (test_select_barrier(); - test_select_barrier();)) + test_select_barrier(); + test_select_barrier(); } template diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_16b.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_16b.pass.cpp index 810f8d41eb1..d6b09c3e7d5 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_16b.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_16b.pass.cpp @@ -14,9 +14,11 @@ int main(int argc, char** argv) { - NV_IF_TARGET(NV_IS_HOST, cuda_thread_count = 4;) + // important: `cuda__thread__count =` (typo on purpose) needs to be followed by an integer literal, otherwise nvrtcc + // cannot regex-match it + NV_IF_TARGET(NV_IS_HOST, (cuda_thread_count = 64;), (assert(blockDim.x == thread_block_size);)) - test_select_source>(); + test_select_source(); return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_32b.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_32b.pass.cpp index 4d7640b06ad..b10f40020dc 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_32b.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_32b.pass.cpp @@ -14,9 +14,11 @@ int main(int argc, char** argv) { - NV_IF_TARGET(NV_IS_HOST, cuda_thread_count = 4;) + // important: `cuda__thread__count =` (typo on purpose) needs to be followed by an integer literal, otherwise nvrtcc + // cannot regex-match it + NV_IF_TARGET(NV_IS_HOST, (cuda_thread_count = 64;), (assert(blockDim.x == thread_block_size);)) - test_select_source>(); + test_select_source(); return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_64b.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_64b.pass.cpp index 1af6076f0db..14411378ae5 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_64b.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/group_memcpy_async_64b.pass.cpp @@ -14,9 +14,11 @@ int main(int argc, char** argv) { - NV_IF_TARGET(NV_IS_HOST, cuda_thread_count = 4;) + // important: `cuda__thread__count =` (typo on purpose) needs to be followed by an integer literal, otherwise nvrtcc + // cannot regex-match it + NV_IF_TARGET(NV_IS_HOST, (cuda_thread_count = 64;), (assert(blockDim.x == thread_block_size);)) - test_select_source>(); + test_select_source(); return 0; } diff --git 
a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_block.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_block.pass.cpp new file mode 100644 index 00000000000..5ca713f6851 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_block.pass.cpp @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 + +// clang-cuda < 20 errors out with "fatal error: error in backend: Cannot cast between two non-generic address spaces" +// XFAIL: clang-14 && !nvcc +// XFAIL: clang-15 && !nvcc +// XFAIL: clang-16 && !nvcc +// XFAIL: clang-17 && !nvcc +// XFAIL: clang-18 && !nvcc +// XFAIL: clang-19 && !nvcc + +#include + +#include "cuda_space_selector.h" + +inline constexpr int thread_block_size = 64; + +template class SourceSelector, + template class DestSelector, + template class BarrierSelector, + cuda::thread_scope BarrierScope, + typename... CompletionF> +__device__ __noinline__ void test_fully_specialized() +{ + // these tests focus on non-trivial thread ids and concurrent calls in the presence of other threads + + struct data_t + { + T data[thread_block_size]; + }; + + SourceSelector source_sel; + typename DestSelector::template offsetted dest_sel; + BarrierSelector, constructor_initializer> bar_sel; + + data_t* source = source_sel.construct(); + data_t* dest = dest_sel.construct(); + cuda::barrier* bar = bar_sel.construct(+thread_block_size); + + // init SMEM + source->data[threadIdx.x] = 12; + dest->data[threadIdx.x] = 0; + __barrier_sync(0); + + // single thread + if (threadIdx.x == 0) + { + cuda::memcpy_async(dest->data, source->data, sizeof(T), *bar); + } + bar->arrive_and_wait(); + assert(dest->data[threadIdx.x] == (threadIdx.x == 0 ? 12 : 0)); // 12 0 0 0 0 0 ... + + // void* overload, single thread, different id, two calls with 3 and 2 items + source->data[threadIdx.x] = 24; + __barrier_sync(0); + + if (threadIdx.x == 42) + { + static_assert(42 < thread_block_size); + cuda::memcpy_async(static_cast(dest->data), static_cast(source->data), sizeof(T) * 3, *bar); + cuda::memcpy_async(static_cast(dest->data + 3), static_cast(source->data + 3), sizeof(T) * 2, *bar); + } + bar->arrive_and_wait(); + assert(dest->data[threadIdx.x] == (threadIdx.x < 5 ? 24 : 0)); // 24 24 24 24 24 0 0 0 ... 
+ + // use 3 threads to perform 8 copies of 2 items each, spaced at 4 bytes in the destination + source->data[threadIdx.x] = 48; + __barrier_sync(0); + + if (threadIdx.x < 3) + { + static_assert(thread_block_size >= 64); + cuda::memcpy_async(dest->data + threadIdx.x * 4, source->data + threadIdx.x * 2, sizeof(T) * 2, *bar); + } + bar->arrive_and_wait(); + + assert(dest->data[0] == 48); + assert(dest->data[1] == 48); + assert(dest->data[2] == 24); + assert(dest->data[3] == 24); + assert(dest->data[4] == 48); + assert(dest->data[5] == 48); + assert(dest->data[6] == 0); + assert(dest->data[7] == 0); + assert(dest->data[8] == 48); + assert(dest->data[9] == 48); + if (threadIdx.x >= 10) + { + assert(dest->data[threadIdx.x] == 0); + } +} + +struct completion +{ + __device__ void operator()() const {} +}; + +template class SourceSelector, + template class DestSelector, + template class BarrierSelector> +__device__ __noinline__ void test_select_scope() +{ + test_fully_specialized(); + test_fully_specialized(); + test_fully_specialized(); + // Test one of the scopes with a non-default completion. Testing them all would make this test take twice as much time + // to compile. Selected block scope because the block scope barrier with the default completion has a special path, so + // this tests both that the API entrypoints accept barriers with arbitrary completion function, and that the + // synchronization mechanism detects it correctly. + test_fully_specialized(); +} + +template class SourceSelector, template class DestSelector> +__device__ __noinline__ void test_select_barrier() +{ + test_select_scope(); + test_select_scope(); +} + +template class SourceSelector> +__device__ __noinline__ void test_select_destination() +{ + test_select_barrier(); + test_select_barrier(); +} + +template +__device__ __noinline__ void test_select_source() +{ + test_select_destination(); + test_select_destination(); +} + +int main(int argc, char** argv) +{ + // important: `cuda__thread__count =` (typo on purpose) needs to be followed by an integer literal, otherwise nvrtcc + // cannot regex-match it + NV_IF_TARGET(NV_IS_HOST, (cuda_thread_count = 64;), ({ + assert(blockDim.x == thread_block_size); + test_select_source(); + })) + + return 0; +} From 16bdfbfda86ce4a0ef7fdbf94df29884c5696b4b Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 19 Dec 2025 13:09:25 -0500 Subject: [PATCH 33/56] Explicitly set `CCCL_TOPLEVEL_PROJECT` to `OFF` when needed (#7016) --- CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3aa0ca30b72..cbfe2dc57ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,8 +9,15 @@ endif() # Determine whether CCCL is the top-level project or included into # another project via add_subdirectory() -if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") - set(CCCL_TOPLEVEL_PROJECT ON) + +if (NOT DEFINED CCCL_TOPLEVEL_PROJECT) + # DO NOT REMOVE THE FOLLOWING LINE + set(CCCL_TOPLEVEL_PROJECT OFF) + # REQUIRED FOR CCCL TO WORK VIA CPM WITH INSTALL RULES + + if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") + set(CCCL_TOPLEVEL_PROJECT ON) + endif() endif() # Enable CXX so CMake can configure install paths From 11d32ec1fe5f85129e5138dfd03918631b3a908e Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Fri, 19 Dec 2025 11:22:33 -0800 Subject: [PATCH 34/56] [libcu++] Add explicit alignment specification in buffer (#7005) * Add explicit alignment specification in buffer * Fix shared resource test * Missed 
alignment in deallocate --- .../include/cuda/__container/uninitialized_async_buffer.h | 6 +++--- .../test/libcudacxx/cuda/memory_resource/shared_resource.cu | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcudacxx/include/cuda/__container/uninitialized_async_buffer.h b/libcudacxx/include/cuda/__container/uninitialized_async_buffer.h index 6f00ef61b76..c9bcb8ab1e0 100644 --- a/libcudacxx/include/cuda/__container/uninitialized_async_buffer.h +++ b/libcudacxx/include/cuda/__container/uninitialized_async_buffer.h @@ -205,7 +205,7 @@ class __uninitialized_async_buffer : __mr_(::cuda::std::move(__mr)) , __stream_(__stream) , __count_(__count) - , __buf_(__count_ == 0 ? nullptr : __mr_.allocate(__stream_, __get_allocation_size(__count_))) + , __buf_(__count_ == 0 ? nullptr : __mr_.allocate(__stream_, __get_allocation_size(__count_), alignof(_Tp))) {} _CCCL_HIDE_FROM_ABI __uninitialized_async_buffer(const __uninitialized_async_buffer&) = delete; @@ -247,7 +247,7 @@ class __uninitialized_async_buffer if (__buf_) { - __mr_.deallocate(__stream_, __buf_, __get_allocation_size(__count_)); + __mr_.deallocate(__stream_, __buf_, __get_allocation_size(__count_), alignof(_Tp)); } __mr_ = ::cuda::std::move(__other.__mr_); __stream_ = ::cuda::std::exchange(__other.__stream_, ::cuda::stream_ref{::cudaStream_t{}}); @@ -267,7 +267,7 @@ class __uninitialized_async_buffer { if (__buf_) { - __mr_.deallocate(__stream, __buf_, __get_allocation_size(__count_)); + __mr_.deallocate(__stream, __buf_, __get_allocation_size(__count_), alignof(_Tp)); __buf_ = nullptr; __count_ = 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/shared_resource.cu b/libcudacxx/test/libcudacxx/cuda/memory_resource/shared_resource.cu index bce25c4d69c..049052b3f95 100644 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/shared_resource.cu +++ b/libcudacxx/test/libcudacxx/cuda/memory_resource/shared_resource.cu @@ -133,7 +133,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource SECTION("basic sanity test about shared resource handling") { Counts expected{}; - this->align(alignof(cuda::std::max_align_t)); + this->align(alignof(int)); { this->bytes(42 * sizeof(int)); cuda::stream stream{cuda::device_ref{0}}; From d1dcaa5e9329b90c22861233505735cbd3e404b5 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Fri, 19 Dec 2025 12:07:55 -0800 Subject: [PATCH 35/56] Use the sccache-dist build cluster for RAPIDS CI jobs (#7014) * use the sccache-dist build cluster for RAPIDS CI jobs [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids] * run devcontainer-utils lifecycle scripts [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids] * define GH_TOKEN [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids] * cpu32 -> cpu16 [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids] * remove preprocessor cache key prefix [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids] * increase nofile ulimit [skip-matrix] [skip-vdc] [skip-docs] [skip-matx] [skip-pytorch] [test-rapids] --- .devcontainer/cccl-entrypoint.sh | 12 ++++ .github/workflows/build-rapids.yml | 27 +++++---- ci/rapids/cuda13.0-conda/devcontainer.json | 68 +++++++++++++--------- ci/rapids/rapids-entrypoint.sh | 8 +++ 4 files changed, 78 insertions(+), 37 deletions(-) diff --git a/.devcontainer/cccl-entrypoint.sh b/.devcontainer/cccl-entrypoint.sh index 
9216860be9a..36506e8ab3a 100755 --- a/.devcontainer/cccl-entrypoint.sh +++ b/.devcontainer/cccl-entrypoint.sh @@ -4,9 +4,21 @@ set -e; +if ! test -n "${DISABLE_SCCACHE:+x}" && test -n "${DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST:+x}" && ! test -n "${SCCACHE_DIST_URL:+x}"; then + export SCCACHE_DIST_URL="https://$(dpkg --print-architecture).$(uname -s | tr '[:upper:]' '[:lower:]').sccache.rapids.nvidia.com"; + echo "export SCCACHE_DIST_URL=$SCCACHE_DIST_URL" >> ~/.bashrc; +fi + +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + echo "::group::Initializing devcontainer..." +fi + devcontainer-utils-post-create-command; devcontainer-utils-init-git; devcontainer-utils-post-attach-command; +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + echo "::endgroup::" +fi if ! dpkg -s ca-certificates > /dev/null 2>&1; then if [[ -n "${GITHUB_ACTIONS:-}" ]]; then diff --git a/.github/workflows/build-rapids.yml b/.github/workflows/build-rapids.yml index e298f284927..40b5d0a3535 100644 --- a/.github/workflows/build-rapids.yml +++ b/.github/workflows/build-rapids.yml @@ -58,7 +58,7 @@ jobs: name: "${{ matrix.libs }}" if: needs.check-event.outputs.ok == 'true' needs: check-event - runs-on: ${{ fromJSON(github.repository != 'NVIDIA/cccl' && '"ubuntu-latest"' || '"linux-amd64-cpu32"') }} + runs-on: ${{ fromJSON(github.repository != 'NVIDIA/cccl' && '"ubuntu-latest"' || '"linux-amd64-cpu16"') }} strategy: fail-fast: false matrix: @@ -88,6 +88,8 @@ jobs: CCCL_TAG: ${{ inputs.override_cccl_tag }} CCCL_VERSION: ${{ inputs.override_cccl_version }} CI: true + CONDA_ENV_CREATE_QUIET: true + GH_TOKEN: ${{ github.token }} RAPIDS_LIBS: ${{ matrix.libs }} # Uncomment any of these to customize the git repo and branch for a RAPIDS lib: # RAPIDS_cmake_GIT_REPO: '{"upstream": "rapidsai", "tag": "main"}' @@ -102,11 +104,21 @@ jobs: # RAPIDS_raft_GIT_REPO: '{"upstream": "rapidsai", "tag": "main"}' # RAPIDS_rmm_GIT_REPO: '{"upstream": "rapidsai", "tag": "main"}' # RAPIDS_ucxx_GIT_REPO: '{"upstream": "rapidsai", "tag": "main"}' + # Build cluster auth + SCCACHE_DIST_TOKEN: "${{ secrets.SCCACHE_DIST_TOKEN }}" + SCCACHE_DIST_AUTH_TOKEN_VAR: "SCCACHE_DIST_TOKEN" + # Retry intermittent failures + SCCACHE_DIST_MAX_RETRIES: "inf" + # Never fallback to building locally, fail instead + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE: "false" + SCCACHE_IDLE_TIMEOUT: 0 run: | cat <<"EOF" > "$RUNNER_TEMP/ci.sh" #! 
/usr/bin/env bash set -eo pipefail + ulimit -n $(ulimit -Hn) + declare -a failures _print_err_exit_msg() { @@ -122,7 +134,7 @@ jobs: cat <<____EOF RAPIDS_LIBS='${RAPIDS_LIBS}'$(for lib in cmake ${RAPIDS_LIBS}; do var=RAPIDS_${lib//-/_}_GIT_REPO; if test -v "$var" && test -n "${!var}"; then echo -n " $var='${!var}'"; fi; done) \\ .devcontainer/launch.sh -d -c ${{matrix.cuda}} -H rapids-conda -- ./ci/rapids/rapids-entrypoint.sh \\ - /bin/bash -li -c 'uninstall-all -j -qqq && clean-all -j && build-all -j -v || exec /bin/bash -li' + /bin/bash -li -c 'uninstall-all -j -qqq && clean-all -j && build-all -j0 -v || exec /bin/bash -li' ____EOF echo "" echo "For additional information, see:" @@ -165,14 +177,9 @@ jobs: --env "AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN:-}" \ --env "AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-}" \ --env "CONDA_ENV_CREATE_QUIET=true" \ - --env "CCCL_TAG=${CCCL_TAG}" \ - --env "CCCL_VERSION=${CCCL_VERSION}" \ - --env "DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=true" \ - --env "INFER_NUM_DEVICE_ARCHITECTURES=true" \ - --env "MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=100" \ - --env "SCCACHE_BUCKET=${SCCACHE_BUCKET:-}" \ - --env "SCCACHE_REGION=${SCCACHE_REGION:-}" \ - --env "SCCACHE_IDLE_TIMEOUT=0" \ + --env "CCCL_TAG=$CCCL_TAG" \ + --env "CCCL_VERSION=$CCCL_VERSION" \ + --env "GH_TOKEN=$GH_TOKEN" \ --env "GITHUB_ACTIONS=$GITHUB_ACTIONS" \ --env "GITHUB_SHA=$GITHUB_SHA" \ --env "GITHUB_REF_NAME=$GITHUB_REF_NAME" \ diff --git a/ci/rapids/cuda13.0-conda/devcontainer.json b/ci/rapids/cuda13.0-conda/devcontainer.json index aa1073cca39..53ab6891005 100644 --- a/ci/rapids/cuda13.0-conda/devcontainer.json +++ b/ci/rapids/cuda13.0-conda/devcontainer.json @@ -1,54 +1,61 @@ { - "image": "rapidsai/devcontainers:25.12-cpp-mambaforge-ubuntu24.04", + "image": "rapidsai/devcontainers:26.02-cpp-mambaforge", "runArgs": [ "--init", "--rm", "--name", - "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-rapids-25.12-cuda13.0-conda" + "${localEnv:USER:anon}-${localWorkspaceFolderBasename}-rapids-26.02-cuda13.0-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, - "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.12": {} - }, - "overrideFeatureInstallOrder": [ - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" - ], "containerEnv": { + "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", "CI": "${localEnv:CI}", - "CUDAARCHS": "75-real", "CUDA_VERSION": "13.0", + "CUDAARCHS": "75-real", "DEFAULT_CONDA_ENV": "rapids", - "PYTHONSAFEPATH": "1", - "PYTHONUNBUFFERED": "1", - "PYTHONDONTWRITEBYTECODE": "1", - "PYTHON_PACKAGE_MANAGER": "conda", - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", + "DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST": "true", "HISTFILE": "/home/coder/.cache/._bash_history", + "INFER_NUM_DEVICE_ARCHITECTURES": "1", "LIBCUDF_KERNEL_CACHE_PATH": "/home/coder/cudf/cpp/build/latest/jitify_cache", - "RAPIDS_LIBS": "${localEnv:RAPIDS_LIBS}", + "MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL": "20", + "PYTHON_PACKAGE_MANAGER": "conda", + "PYTHONDONTWRITEBYTECODE": "1", + "PYTHONSAFEPATH": "1", + "PYTHONUNBUFFERED": "1", "RAPIDS_cmake_GIT_REPO": "${localEnv:RAPIDS_cmake_GIT_REPO}", - "RAPIDS_rmm_GIT_REPO": "${localEnv:RAPIDS_rmm_GIT_REPO}", - "RAPIDS_ucxx_GIT_REPO": "${localEnv:RAPIDS_ucxx_GIT_REPO}", - "RAPIDS_kvikio_GIT_REPO": "${localEnv:RAPIDS_kvikio_GIT_REPO}", "RAPIDS_cudf_GIT_REPO": 
"${localEnv:RAPIDS_cudf_GIT_REPO}", - "RAPIDS_raft_GIT_REPO": "${localEnv:RAPIDS_raft_GIT_REPO}", - "RAPIDS_cuvs_GIT_REPO": "${localEnv:RAPIDS_cuvs_GIT_REPO}", - "RAPIDS_cumlprims_mg_GIT_REPO": "${localEnv:RAPIDS_cumlprims_mg_GIT_REPO}", - "RAPIDS_cuml_GIT_REPO": "${localEnv:RAPIDS_cuml_GIT_REPO}", "RAPIDS_cugraph_GIT_REPO": "${localEnv:RAPIDS_cugraph_GIT_REPO}", - "RAPIDS_cugraph_gnn_GIT_REPO": "${localEnv:RAPIDS_cugraph_gnn_GIT_REPO}" + "RAPIDS_cugraph_gnn_GIT_REPO": "${localEnv:RAPIDS_cugraph_gnn_GIT_REPO}", + "RAPIDS_cuml_GIT_REPO": "${localEnv:RAPIDS_cuml_GIT_REPO}", + "RAPIDS_cumlprims_mg_GIT_REPO": "${localEnv:RAPIDS_cumlprims_mg_GIT_REPO}", + "RAPIDS_cuvs_GIT_REPO": "${localEnv:RAPIDS_cuvs_GIT_REPO}", + "RAPIDS_kvikio_GIT_REPO": "${localEnv:RAPIDS_kvikio_GIT_REPO}", + "RAPIDS_LIBS": "${localEnv:RAPIDS_LIBS}", + "RAPIDS_raft_GIT_REPO": "${localEnv:RAPIDS_raft_GIT_REPO}", + "RAPIDS_rmm_GIT_REPO": "${localEnv:RAPIDS_rmm_GIT_REPO}", + "RAPIDS_ucxx_GIT_REPO": "${localEnv:RAPIDS_ucxx_GIT_REPO}", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "SCCACHE_DIST_AUTH_TOKEN_VAR": "${localEnv:SCCACHE_DIST_AUTH_TOKEN_VAR}", + "SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE": "${localEnv:SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE:true}", + "SCCACHE_DIST_MAX_RETRIES": "${localEnv:SCCACHE_DIST_MAX_RETRIES:4}", + "SCCACHE_DIST_REQUEST_TIMEOUT": "${localEnv:SCCACHE_DIST_REQUEST_TIMEOUT:7140}", + "SCCACHE_DIST_TOKEN": "${localEnv:SCCACHE_DIST_TOKEN}", + "SCCACHE_IDLE_TIMEOUT": "${localEnv:SCCACHE_IDLE_TIMEOUT:0}", + "SCCACHE_REGION": "us-east-2", + "SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE": "true", + "SCCACHE_SERVER_LOG": "${localEnv:SCCACHE_SERVER_LOG:sccache=debug}" }, "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config} ${localWorkspaceFolder}/ci/rapids/.{conda,log/devcontainer-utils} ${localWorkspaceFolder}/ci/rapids/.repos/{rmm,kvikio,ucxx,cudf,raft,cuvs,cumlprims_mg,cuml,cugraph,cugraph-gnn}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config,local/state} ${localWorkspaceFolder}/ci/rapids/.{conda,log/devcontainer-utils} ${localWorkspaceFolder}/ci/rapids/.repos/{rmm,kvikio,ucxx,cudf,raft,cuvs,cumlprims_mg,cuml,cugraph,cugraph-gnn}" ], "postCreateCommand": [ "/bin/bash", "-c", - "if [ ${CI:-false} = 'false' ]; then . /home/coder/cccl/ci/rapids/post-create-command.sh; fi" + "if [ ${CI:-false} = 'false' ]; then . 
/home/coder/cccl/ci/rapids/post-create-command.sh; fi; if test -z \"${DISABLE_SCCACHE:+x}\"; then echo \"export SCCACHE_DIST_URL='https://$(dpkg --print-architecture).$(uname -s | tr '[:upper:]' '[:lower:]').sccache.rapids.nvidia.com'\" >> /home/coder/.bashrc; fi" ], "postAttachCommand": [ "/bin/bash", @@ -61,6 +68,7 @@ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.local/state,target=/home/coder/.local/state,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/ci/rapids/.repos/rmm,target=/home/coder/rmm,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/ci/rapids/.repos/kvikio,target=/home/coder/kvikio,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/ci/rapids/.repos/ucxx,target=/home/coder/ucxx,type=bind,consistency=consistent", @@ -74,6 +82,12 @@ "source=${localWorkspaceFolder}/ci/rapids/.conda,target=/home/coder/.conda,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/ci/rapids/.log/devcontainer-utils,target=/var/log/devcontainer-utils,type=bind,consistency=consistent" ], + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.2": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], "customizations": { "vscode": { "extensions": [ diff --git a/ci/rapids/rapids-entrypoint.sh b/ci/rapids/rapids-entrypoint.sh index 821aed77d63..b8b43112d93 100755 --- a/ci/rapids/rapids-entrypoint.sh +++ b/ci/rapids/rapids-entrypoint.sh @@ -4,9 +4,17 @@ set -e; +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + echo "::group::Cloning RAPIDS..." +fi + ci/rapids/post-create-command.sh; rapids-post-start-command -f; +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + echo "::endgroup::" +fi + if test $# -gt 0; then exec "$@"; else From 52834f8fcd5e29f2f68caae159623ba08811039e Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 19 Dec 2025 16:59:26 -0800 Subject: [PATCH 36/56] first version --- .../include/cuda/__mdspan/dlpack_to_mdspan.h | 156 ++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h new file mode 100644 index 00000000000..ae9be140f26 --- /dev/null +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -0,0 +1,156 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___MDSPAN_DLPACK_TO_MDSPAN_H +#define _CUDA___MDSPAN_DLPACK_TO_MDSPAN_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() + +# include +# include +# include +# include +# include +# include + +# include + +# include +// +# include + +_CCCL_BEGIN_NAMESPACE_CUDA + +static_assert(DLPACK_MAJOR_VERSION == 1, "DLPACK_MAJOR_VERSION must be 1"); + +template +[[nodiscard]] _CCCL_HOST_API inline bool __validate_dlpack_data_type(const ::DLDataType& __dtype) noexcept +{ + const auto __expected = ::cuda::__data_type_to_dlpack<_ElementType>(); + return __dtype.code == __expected.code && __dtype.bits == __expected.bits && __dtype.lanes == __expected.lanes; +} + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::std::mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +__to_mdspan(const ::DLTensor& __tensor) +{ + using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __mdspan_type = ::cuda::std::mdspan<_ElementType, __extents_type, _LayoutPolicy>; + using __mapping_type = typename _LayoutPolicy::template mapping<__extents_type>; + using __element_type = typename __mdspan_type::element_type; + if (__tensor.ndim != int{_Rank}) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor rank does not match expected rank"}); + } + if (!::cuda::__validate_dlpack_data_type<__element_type>(__tensor.dtype)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data type does not match expected type"}); + } + auto __base_data = static_cast(__tensor.data) + __tensor.byte_offset; + auto __data = reinterpret_cast<__element_type*>(__base_data); + if constexpr (_Rank == 0) + { + return __mdspan_type{__data, __mapping_type{}}; + } + else if constexpr (::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>) + { + using ::cuda::std::int64_t; + using ::cuda::std::size_t; + ::cuda::std::array __extents_arr{}; + ::cuda::std::array __strides_arr{}; + for (size_t __i = 0; __i < _Rank; ++__i) + { + __extents_arr[__i] = __tensor.shape[__i]; + // strides == nullptr means row-major (C-contiguous) layout + if (__tensor.strides != nullptr) + { + __strides_arr[__i] = __tensor.strides[__i]; + } + else + { + __strides_arr[__i] = 1; + for (size_t __j = __i + 1; __j < _Rank; ++__j) + { + __strides_arr[__i] *= __tensor.shape[__j]; + } + } + } + __extents_type __extents{__extents_arr}; + __mapping_type __mapping{__extents, __strides_arr}; + return __mdspan_type{__data, __mapping}; + } + else + { + static_assert(::cuda::std::__always_false_v<_LayoutPolicy>, "Unsupported layout policy"); + } +} + +/*********************************************************************************************************************** + * Public API + **********************************************************************************************************************/ + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::host_mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +to_host_mdspan(const ::DLTensor& __tensor) +{ + if (__tensor.device.device_type != ::kDLCPU) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCPU for host_mdspan"}); + } + using __extents_type = 
::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __mdspan_type = ::cuda::host_mdspan<_ElementType, __extents_type, _LayoutPolicy>; + return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; +} + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::device_mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +to_device_mdspan(const ::DLTensor& __tensor) +{ + if (__tensor.device.device_type != ::kDLCUDA) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCUDA for device_mdspan"}); + } + using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __mdspan_type = ::cuda::device_mdspan<_ElementType, __extents_type, _LayoutPolicy>; + return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; +} + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::managed_mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +to_managed_mdspan(const ::DLTensor& __tensor) +{ + if (__tensor.device.device_type != ::kDLCUDAManaged) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCUDAManaged for managed_mdspan"}); + } + using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __mdspan_type = ::cuda::managed_mdspan<_ElementType, __extents_type, _LayoutPolicy>; + return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; +} + +_CCCL_END_NAMESPACE_CUDA + +# include + +#endif // !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() +#endif // _CUDA___MDSPAN_DLPACK_TO_MDSPAN_H From dec5dcaeb739682317894c90c1bb66b4383138f3 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 22 Dec 2025 16:27:59 -0800 Subject: [PATCH 37/56] complete the implementation --- .../include/cuda/__mdspan/dlpack_to_mdspan.h | 174 ++++++++++++++---- 1 file changed, 138 insertions(+), 36 deletions(-) diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h index ae9be140f26..44c1ea89fa2 100644 --- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -23,10 +23,13 @@ #if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() # include -# include +// # include // __data_type_to_dlpack +# include # include # include +# include # include +# include # include # include @@ -46,60 +49,159 @@ template return __dtype.code == __expected.code && __dtype.bits == __expected.bits && __dtype.lanes == __expected.lanes; } -template [[nodiscard]] -_CCCL_HOST_API ::cuda::std::mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> -__to_mdspan(const ::DLTensor& __tensor) +_CCCL_HOST_API inline ::cuda::std::int64_t __layout_right_stride( + const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t __pos, ::cuda::std::size_t __rank) noexcept { - using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; - using __mdspan_type = ::cuda::std::mdspan<_ElementType, __extents_type, _LayoutPolicy>; - using __mapping_type = typename _LayoutPolicy::template mapping<__extents_type>; - using __element_type = typename __mdspan_type::element_type; - if (__tensor.ndim != int{_Rank}) + ::cuda::std::int64_t __stride = 1; + for (auto __i = __pos + 1; __i < __rank; ++__i) { - _CCCL_THROW(::std::invalid_argument{"DLTensor rank does not match expected rank"}); + __stride *= __shapes[__i]; // TODO: check for overflow } - if 
(!::cuda::__validate_dlpack_data_type<__element_type>(__tensor.dtype)) + return __stride; +} + +[[nodiscard]] +_CCCL_HOST_API inline ::cuda::std::int64_t +__layout_left_stride(const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t __pos) noexcept +{ + ::cuda::std::int64_t __stride = 1; + for (::cuda::std::size_t __i = 0; __i < __pos; ++__i) { - _CCCL_THROW(::std::invalid_argument{"DLTensor data type does not match expected type"}); + __stride *= __shapes[__i]; // TODO: check for overflow } - auto __base_data = static_cast(__tensor.data) + __tensor.byte_offset; - auto __data = reinterpret_cast<__element_type*>(__base_data); - if constexpr (_Rank == 0) + return __stride; +} + +template +_CCCL_HOST_API void __validate_dlpack_strides(const ::DLTensor& __tensor, [[maybe_unused]] ::cuda::std::size_t __rank) +{ + constexpr bool __is_layout_right = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_right>; + constexpr bool __is_layout_left = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_left>; + constexpr bool __is_layout_stride = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>; + const auto __strides_ptr = __tensor.strides; + if (__strides_ptr == nullptr) { - return __mdspan_type{__data, __mapping_type{}}; +# if DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) + _CCCL_THROW(::std::invalid_argument{"strides=nullptr is not supported for DLPack v1.2 and later"}); +# else + // strides == nullptr means row-major (C-contiguous) layout + if (__is_layout_left && __rank > 1) + { + _CCCL_THROW(::std::invalid_argument{"strides must be non-null for layout_left"}); + } + else + { + return; + } +# endif // DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) } - else if constexpr (::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>) + for (::cuda::std::size_t __pos = 0; __pos < __rank; ++__pos) { - using ::cuda::std::int64_t; - using ::cuda::std::size_t; - ::cuda::std::array __extents_arr{}; - ::cuda::std::array __strides_arr{}; - for (size_t __i = 0; __i < _Rank; ++__i) + if constexpr (__is_layout_right) { - __extents_arr[__i] = __tensor.shape[__i]; - // strides == nullptr means row-major (C-contiguous) layout - if (__tensor.strides != nullptr) + if (__strides_ptr[__pos] != ::cuda::__layout_right_stride(__tensor.shape, __pos, __rank)) { - __strides_arr[__i] = __tensor.strides[__i]; + _CCCL_THROW(::std::invalid_argument{"DLTensor strides are not compatible with layout_right"}); } - else + } + else if constexpr (__is_layout_left) + { + if (__strides_ptr[__pos] != ::cuda::__layout_left_stride(__tensor.shape, __pos)) { - __strides_arr[__i] = 1; - for (size_t __j = __i + 1; __j < _Rank; ++__j) - { - __strides_arr[__i] *= __tensor.shape[__j]; - } + _CCCL_THROW(::std::invalid_argument{"DLTensor strides are not compatible with layout_left"}); + } + } + else if constexpr (__is_layout_stride) + { + if (__strides_ptr[__pos] <= 0) + { + _CCCL_THROW(::std::invalid_argument{"mdspan strides must be positive"}); } } - __extents_type __extents{__extents_arr}; - __mapping_type __mapping{__extents, __strides_arr}; - return __mdspan_type{__data, __mapping}; } - else +} + +template +[[nodiscard]] +_CCCL_HOST_API ::cuda::std::mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +__to_mdspan(const ::DLTensor& __tensor) +{ + using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __mdspan_type = ::cuda::std::mdspan<_ElementType, __extents_type, 
_LayoutPolicy>; + using __mapping_type = typename _LayoutPolicy::template mapping<__extents_type>; + using __element_type = typename __mdspan_type::element_type; + constexpr bool __is_layout_right = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_right>; + constexpr bool __is_layout_left = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_left>; + constexpr bool __is_layout_stride = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>; + // TODO: add support for layout_right_padded and layout_left_padded + if constexpr (!__is_layout_right && !__is_layout_left && !__is_layout_stride) { static_assert(::cuda::std::__always_false_v<_LayoutPolicy>, "Unsupported layout policy"); } + else + { + if (__tensor.ndim != int{_Rank}) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor rank does not match expected rank"}); + } + if (!::cuda::__validate_dlpack_data_type<__element_type>(__tensor.dtype)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data type does not match expected type"}); + } + if (__tensor.data == nullptr) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data must be non-null"}); + } + auto __base_data = static_cast(__tensor.data) + __tensor.byte_offset; + auto __data = reinterpret_cast<__element_type*>(__base_data); + const auto __datatype_size = __tensor.dtype.bits * __tensor.dtype.lanes / 8; + // this is not the exact solution because data type size != data type alignment. + // However, it always works for the supported data types. + if (__datatype_size > 0 && !::cuda::is_aligned(__data, __datatype_size)) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor data must be aligned to the data type"}); + } + if constexpr (_Rank == 0) + { + return __mdspan_type{__data, __mapping_type{}}; + } + else // Rank > 0 + { + if (__tensor.shape == nullptr) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor shape must be non-null"}); + } + using ::cuda::std::int64_t; + using ::cuda::std::size_t; + ::cuda::std::array __extents_array{}; + for (size_t __i = 0; __i < _Rank; ++__i) + { + if (__tensor.shape[__i] < 0) + { + _CCCL_THROW(::std::invalid_argument{"DLTensor shape must be positive"}); + } + __extents_array[__i] = __tensor.shape[__i]; + } + ::cuda::__validate_dlpack_strides<_LayoutPolicy>(__tensor, _Rank); + if constexpr (__is_layout_stride) + { + ::cuda::std::array __strides_array{}; + for (size_t __i = 0; __i < _Rank; ++__i) + { + const bool __has_strides = __tensor.strides != nullptr; + __strides_array[__i] = + __has_strides ? 
__tensor.strides[__i] : ::cuda::__layout_right_stride(__tensor.shape, __i, _Rank); + } + return __mdspan_type{__data, __mapping_type{__extents_array, __strides_array}}; + } + else + { + __extents_type __extents{__extents_array}; + return __mdspan_type{__data, __extents}; + } + } + } } /*********************************************************************************************************************** From b38a6a791236b66a10879cd10927d8b7dcf2df35 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 22 Dec 2025 17:38:19 -0800 Subject: [PATCH 38/56] add unit test --- .../include/cuda/__mdspan/dlpack_to_mdspan.h | 2 +- libcudacxx/include/cuda/mdspan | 1 + .../dlpack_to_mdspan.pass.cpp | 679 ++++++++++++++++++ 3 files changed, 681 insertions(+), 1 deletion(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h index 44c1ea89fa2..98fdc8ed9b0 100644 --- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -23,7 +23,7 @@ #if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() # include -// # include // __data_type_to_dlpack +# include # include # include # include diff --git a/libcudacxx/include/cuda/mdspan b/libcudacxx/include/cuda/mdspan index 3129198d02a..f8e36e75e43 100644 --- a/libcudacxx/include/cuda/mdspan +++ b/libcudacxx/include/cuda/mdspan @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp new file mode 100644 index 00000000000..b56caa50e1f --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp @@ -0,0 +1,679 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: nvrtc + +#include + +#include +#include +#include +#include +#include + +#include + +#include "test_macros.h" + +template +using dlpack_array = cuda::std::array; + +//============================================================================== +// Test: Rank-0 mdspan conversion +//============================================================================== + +bool test_rank0() +{ + float data = 42.0f; + DLTensor tensor{}; + tensor.data = &data; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 0; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 0); + assert(host_mdspan.size() == 1); + assert(host_mdspan.data_handle() == &data); + assert(host_mdspan() == 42.0f); + return true; +} + +//============================================================================== +// Test: Empty tensor (zero in one dimension) +//============================================================================== + +bool test_empty_tensor() +{ + int dummy = 0; // Non-null but won't be accessed + dlpack_array<2> shape = {0, 5}; + dlpack_array<2> strides = {5, 1}; // row-major + DLTensor tensor{}; + tensor.data = &dummy; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.extent(0) == 0); + assert(host_mdspan.extent(1) == 5); + assert(host_mdspan.size() == 0); + assert(host_mdspan.empty()); + return true; +} + +//============================================================================== +// Test: Rank-1 mdspan with layout_right (row-major) +//============================================================================== + +bool test_rank1() +{ + cuda::std::array data = {1, 2, 3, 4, 5}; + dlpack_array<1> shape = {5}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = ::DLDataType{::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan_right = cuda::to_host_mdspan(tensor); + auto host_mdspan_left = cuda::to_host_mdspan(tensor); + auto host_mdspan_stride = cuda::to_host_mdspan(tensor); + + assert(host_mdspan_right.rank() == 1); + assert(host_mdspan_right.extent(0) == 5); + assert(host_mdspan_right.stride(0) == 1); + for (int i = 0; i < 5; ++i) + { + assert(host_mdspan_right(i) == data[i]); + } + assert(host_mdspan_left.rank() == 1); + assert(host_mdspan_left.extent(0) == 5); + assert(host_mdspan_left.stride(0) == 1); + for (int i = 0; i < 5; ++i) + { + assert(host_mdspan_left(i) == data[i]); + } + assert(host_mdspan_stride.rank() == 1); + assert(host_mdspan_stride.extent(0) == 5); + assert(host_mdspan_stride.stride(0) == 1); + for (int i = 0; i < 5; ++i) + { + assert(host_mdspan_stride(i) == data[i]); + } + return true; +} + +//============================================================================== +// Test: Rank-2 mdspan with layout_right (row-major) +//============================================================================== + +bool test_rank2_layout_right() +{ + // 2x3 matrix in row-major order + cuda::std::array data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // 
row-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 2); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.stride(0) == 3); // row stride + assert(host_mdspan.stride(1) == 1); // column stride + + // Check values: row-major layout + assert(host_mdspan(0, 0) == 1.0f); + assert(host_mdspan(0, 1) == 2.0f); + assert(host_mdspan(0, 2) == 3.0f); + assert(host_mdspan(1, 0) == 4.0f); + assert(host_mdspan(1, 1) == 5.0f); + assert(host_mdspan(1, 2) == 6.0f); + return true; +} + +//============================================================================== +// Test: Rank-2 mdspan with layout_left (column-major) +//============================================================================== + +bool test_rank2_layout_left() +{ + // 2x3 matrix in column-major order + cuda::std::array data = {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {1, 2}; // column-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 2); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.stride(0) == 1); // row stride + assert(host_mdspan.stride(1) == 2); // column stride + + // Check values: column-major layout + assert(host_mdspan(0, 0) == 1.0f); + assert(host_mdspan(0, 1) == 2.0f); + assert(host_mdspan(0, 2) == 3.0f); + assert(host_mdspan(1, 0) == 4.0f); + assert(host_mdspan(1, 1) == 5.0f); + assert(host_mdspan(1, 2) == 6.0f); + return true; +} + +//============================================================================== +// Test: Rank-2 mdspan with layout_stride (arbitrary strides) +//============================================================================== + +bool test_rank2_layout_stride() +{ + // 2x3 matrix with custom strides (e.g., padded) + cuda::std::array data = {1, 2, 3, 0, 4, 5, 6, 0}; // Each row padded to 4 elements + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {4, 1}; // Row stride = 4 (padded), col stride = 1 + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 2); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.stride(0) == 4); + assert(host_mdspan.stride(1) == 1); + + assert(host_mdspan(0, 0) == 1); + assert(host_mdspan(0, 1) == 2); + assert(host_mdspan(0, 2) == 3); + assert(host_mdspan(1, 0) == 4); + assert(host_mdspan(1, 1) == 5); + assert(host_mdspan(1, 2) == 6); + return true; +} + +//============================================================================== +// Test: layout_stride with default (layout_right) strides when strides is nullptr +// Note: This tests the fallback behavior for DLPack < 1.2 +//============================================================================== +#if !(DLPACK_MAJOR_VERSION == 1 && 
DLPACK_MINOR_VERSION >= 2) +bool test_layout_stride_null_strides() +{ + cuda::std::array data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + dlpack_array<2> shape = {2, 3}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = nullptr; // null strides + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + // Should use row-major strides by default + assert(host_mdspan.stride(0) == 3); + assert(host_mdspan.stride(1) == 1); + return true; +} +#endif + +//============================================================================== +// Test: byte_offset support +//============================================================================== + +bool test_byte_offset() +{ + cuda::std::array data = {0, 0, 1, 2, 3, 4, 5, 6}; + // Skip first 2 ints (8 bytes) + dlpack_array<1> shape = {6}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + tensor.byte_offset = sizeof(int) * 2; + + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.extent(0) == 6); + assert(host_mdspan(0) == 1); + assert(host_mdspan(5) == 6); + return true; +} + +//============================================================================== +// Exception tests +//============================================================================== + +void test_exception_wrong_rank() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // row-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + // Try to convert rank-2 tensor to rank-1 mdspan + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_dtype() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; // dtype is int + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + // Try to convert int tensor to float mdspan + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_data() +{ + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = nullptr; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_shape() +{ + cuda::std::array data{}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = nullptr; 
// null shape + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_negative_shape() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {-3}; // negative shape + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_host() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{::kDLCUDA, 0}; // CUDA device, not CPU + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_device() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; // CPU device, not CUDA + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_device_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_managed() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; // CPU device, not CUDA managed + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_managed_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_stride_mismatch_layout_right() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {1, 2}; // Column-major, not row-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_stride_mismatch_layout_left() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // Row-major, not column-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const 
std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_zero_stride_layout_stride() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {0, 1}; // Zero stride is invalid + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_strides_dlpack_v12() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = nullptr; // null strides not allowed in DLPack v1.2+ + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_misaligned_data() +{ + // Create a buffer that allows us to get a misaligned pointer + alignas(16) cuda::std::array buffer{}; + // Get a pointer that's 1 byte into the buffer (misaligned for int) + auto misaligned_ptr = reinterpret_cast(buffer.data() + 1); + dlpack_array<1> shape = {3}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = misaligned_ptr; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +bool test_exceptions() +{ + test_exception_wrong_rank(); + test_exception_wrong_dtype(); + test_exception_null_data(); + test_exception_null_shape(); + test_exception_negative_shape(); + test_exception_wrong_device_type_host(); + test_exception_wrong_device_type_device(); + test_exception_wrong_device_type_managed(); + test_exception_stride_mismatch_layout_right(); + test_exception_stride_mismatch_layout_left(); + test_exception_zero_stride_layout_stride(); +#if DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) + test_exception_null_strides_dlpack_v12(); +#endif + test_exception_misaligned_data(); + return true; +} + +//============================================================================== +// Test: Return type checking +//============================================================================== + +bool test_return_types() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + + // Check return type of to_host_mdspan + auto host_ms = cuda::to_host_mdspan(tensor); + static_assert( + cuda::std::is_same_v, cuda::std::layout_stride>>); + assert(host_ms.extent(0) == 4); + + auto host_ms_right = cuda::to_host_mdspan(tensor); + static_assert( + cuda::std::is_same_v, cuda::std::layout_right>>); + assert(host_ms_right.extent(0) == 4); + + return true; +} + +int main(int, char**) +{ + 
NV_IF_TARGET( + NV_IS_HOST, + (assert(test_rank0()); // + assert(test_rank1()); + assert(test_rank2_layout_right()); + assert(test_rank2_layout_left()); + assert(test_rank2_layout_stride()); + assert(test_element_types()); + assert(test_byte_offset()); + assert(test_empty_tensor()); + assert(test_return_types()); + assert(test_exceptions());)) +#if !(DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2)) + NV_IF_TARGET(NV_IS_HOST, (assert(test_layout_stride_null_strides());)) +#endif + return 0; +} From 5be18933ee94de7f676b16e4afe65b9964bc598b Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 22 Dec 2025 17:41:10 -0800 Subject: [PATCH 39/56] fix unit test --- .../views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp index b56caa50e1f..e5174c669b1 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp @@ -227,7 +227,9 @@ bool test_rank2_layout_stride() // Test: layout_stride with default (layout_right) strides when strides is nullptr // Note: This tests the fallback behavior for DLPack < 1.2 //============================================================================== + #if !(DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) + bool test_layout_stride_null_strides() { cuda::std::array data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; @@ -247,7 +249,8 @@ bool test_layout_stride_null_strides() assert(host_mdspan.stride(1) == 1); return true; } -#endif + +#endif // !(DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) //============================================================================== // Test: byte_offset support @@ -667,7 +670,6 @@ int main(int, char**) assert(test_rank2_layout_right()); assert(test_rank2_layout_left()); assert(test_rank2_layout_stride()); - assert(test_element_types()); assert(test_byte_offset()); assert(test_empty_tensor()); assert(test_return_types()); From f0909df43768696c8fe5997181510ea0594f16b2 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 22 Dec 2025 17:44:35 -0800 Subject: [PATCH 40/56] formatting --- .../views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp index e5174c669b1..a05801e5ab0 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp @@ -8,8 +8,6 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: nvrtc -#include - #include #include #include @@ -19,6 +17,7 @@ #include #include "test_macros.h" +#include template using dlpack_array = cuda::std::array; From d149dff6e9b3fbacd6770d7d00141999ef346091 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 22 Dec 2025 17:46:50 -0800 Subject: [PATCH 41/56] minor fixes --- .../views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp | 3 +-- 1 file changed, 1 insertion(+), 
2 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp index a05801e5ab0..c797fec4b71 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp @@ -656,7 +656,6 @@ bool test_return_types() cuda::std::is_same_v, cuda::std::layout_right>>); assert(host_ms_right.extent(0) == 4); - return true; } @@ -665,12 +664,12 @@ int main(int, char**) NV_IF_TARGET( NV_IS_HOST, (assert(test_rank0()); // + assert(test_empty_tensor()); assert(test_rank1()); assert(test_rank2_layout_right()); assert(test_rank2_layout_left()); assert(test_rank2_layout_stride()); assert(test_byte_offset()); - assert(test_empty_tensor()); assert(test_return_types()); assert(test_exceptions());)) #if !(DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2)) From e96ebea13842a54276fe72438923a4ca2822e5ea Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 23 Dec 2025 10:36:25 -0800 Subject: [PATCH 42/56] fix compiler warnings --- libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h | 11 +++++++---- libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h index 98fdc8ed9b0..2f6dd61619a 100644 --- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -76,10 +76,11 @@ __layout_left_stride(const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t _ template _CCCL_HOST_API void __validate_dlpack_strides(const ::DLTensor& __tensor, [[maybe_unused]] ::cuda::std::size_t __rank) { - constexpr bool __is_layout_right = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_right>; - constexpr bool __is_layout_left = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_left>; - constexpr bool __is_layout_stride = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>; - const auto __strides_ptr = __tensor.strides; + [[maybe_unused]] constexpr bool __is_layout_right = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_right>; + [[maybe_unused]] constexpr bool __is_layout_left = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_left>; + [[maybe_unused]] constexpr bool __is_layout_stride = + ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>; + const auto __strides_ptr = __tensor.strides; if (__strides_ptr == nullptr) { # if DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) @@ -138,6 +139,8 @@ __to_mdspan(const ::DLTensor& __tensor) if constexpr (!__is_layout_right && !__is_layout_left && !__is_layout_stride) { static_assert(::cuda::std::__always_false_v<_LayoutPolicy>, "Unsupported layout policy"); + _CCCL_UNREACHABLE(); + return __mdspan_type{}; } else { diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h index f8ef04bf1aa..56ed55f19eb 100644 --- a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -150,9 +150,9 @@ template else { static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported type"); + _CCCL_UNREACHABLE(); + return 
::DLDataType{}; } - _CCCL_UNREACHABLE(); - return ::DLDataType{}; } template <::cuda::std::size_t _Rank> From 136ab598282aa7afd5ec367ff66808010cf78dac Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 5 Jan 2026 12:02:59 -0800 Subject: [PATCH 43/56] refactor vector type traits by removing conditional compilation for vector types below version 13.0 --- .../include/cuda/__type_traits/vector_type.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/libcudacxx/include/cuda/__type_traits/vector_type.h b/libcudacxx/include/cuda/__type_traits/vector_type.h index 18139f75325..100931020f2 100644 --- a/libcudacxx/include/cuda/__type_traits/vector_type.h +++ b/libcudacxx/include/cuda/__type_traits/vector_type.h @@ -408,15 +408,12 @@ template <> inline constexpr bool __is_vector_type_v<::long2> = true; template <> inline constexpr bool __is_vector_type_v<::long3> = true; -# if _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::long4_16a> = true; template <> inline constexpr bool __is_vector_type_v<::long4_32a> = true; -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv template <> inline constexpr bool __is_vector_type_v<::long4> = true; -# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ template <> inline constexpr bool __is_vector_type_v<::ulong1> = true; @@ -424,15 +421,12 @@ template <> inline constexpr bool __is_vector_type_v<::ulong2> = true; template <> inline constexpr bool __is_vector_type_v<::ulong3> = true; -# if _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::ulong4_16a> = true; template <> inline constexpr bool __is_vector_type_v<::ulong4_32a> = true; -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv template <> inline constexpr bool __is_vector_type_v<::ulong4> = true; -# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ template <> inline constexpr bool __is_vector_type_v<::longlong1> = true; @@ -440,15 +434,12 @@ template <> inline constexpr bool __is_vector_type_v<::longlong2> = true; template <> inline constexpr bool __is_vector_type_v<::longlong3> = true; -# if _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::longlong4_16a> = true; template <> inline constexpr bool __is_vector_type_v<::longlong4_32a> = true; -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv template <> inline constexpr bool __is_vector_type_v<::longlong4> = true; -# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ template <> inline constexpr bool __is_vector_type_v<::ulonglong1> = true; @@ -456,15 +447,12 @@ template <> inline constexpr bool __is_vector_type_v<::ulonglong2> = true; template <> inline constexpr bool __is_vector_type_v<::ulonglong3> = true; -# if _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::ulonglong4_16a> = true; template <> inline constexpr bool __is_vector_type_v<::ulonglong4_32a> = true; -# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv template <> inline constexpr bool __is_vector_type_v<::ulonglong4> = true; -# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ template <> inline constexpr bool __is_vector_type_v<::float1> = true; @@ -481,15 +469,12 @@ template <> inline constexpr bool __is_vector_type_v<::double2> = true; template <> inline constexpr bool __is_vector_type_v<::double3> = true; -# if _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::double4_16a> = true; template <> inline constexpr bool __is_vector_type_v<::double4_32a> = true; -# 
else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv template <> inline constexpr bool __is_vector_type_v<::double4> = true; -# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ template <> inline constexpr bool __is_vector_type_v<::dim3> = true; From 501f48c0d6ddbb261b5ee1b017ccc30343768c75 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 5 Jan 2026 12:14:00 -0800 Subject: [PATCH 44/56] reenable vector types for CTK 13 --- libcudacxx/include/cuda/__type_traits/vector_type.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libcudacxx/include/cuda/__type_traits/vector_type.h b/libcudacxx/include/cuda/__type_traits/vector_type.h index 100931020f2..4e88c9efa65 100644 --- a/libcudacxx/include/cuda/__type_traits/vector_type.h +++ b/libcudacxx/include/cuda/__type_traits/vector_type.h @@ -408,10 +408,12 @@ template <> inline constexpr bool __is_vector_type_v<::long2> = true; template <> inline constexpr bool __is_vector_type_v<::long3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::long4_16a> = true; template <> inline constexpr bool __is_vector_type_v<::long4_32a> = true; +# endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::long4> = true; @@ -421,10 +423,12 @@ template <> inline constexpr bool __is_vector_type_v<::ulong2> = true; template <> inline constexpr bool __is_vector_type_v<::ulong3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::ulong4_16a> = true; template <> inline constexpr bool __is_vector_type_v<::ulong4_32a> = true; +# endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::ulong4> = true; @@ -434,10 +438,12 @@ template <> inline constexpr bool __is_vector_type_v<::longlong2> = true; template <> inline constexpr bool __is_vector_type_v<::longlong3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::longlong4_16a> = true; template <> inline constexpr bool __is_vector_type_v<::longlong4_32a> = true; +# endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::longlong4> = true; @@ -447,10 +453,12 @@ template <> inline constexpr bool __is_vector_type_v<::ulonglong2> = true; template <> inline constexpr bool __is_vector_type_v<::ulonglong3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::ulonglong4_16a> = true; template <> inline constexpr bool __is_vector_type_v<::ulonglong4_32a> = true; +# endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::ulonglong4> = true; @@ -469,10 +477,12 @@ template <> inline constexpr bool __is_vector_type_v<::double2> = true; template <> inline constexpr bool __is_vector_type_v<::double3> = true; +# if _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::double4_16a> = true; template <> inline constexpr bool __is_vector_type_v<::double4_32a> = true; +# endif // ^^^ _CCCL_CTK_AT_LEAST(13, 0) template <> inline constexpr bool __is_vector_type_v<::double4> = true; From 604257dfa75db953368e1dcdefd8c7e10a2acdd1 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 5 Jan 2026 14:57:40 -0800 Subject: [PATCH 45/56] fix msvc warning --- libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h index 
f8ef04bf1aa..ecac4633178 100644 --- a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -142,6 +142,7 @@ template else { static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported vector type"); + return ::DLDataType{}; } } # endif // _CCCL_HAS_CTK() @@ -150,9 +151,8 @@ template else { static_assert(::cuda::std::__always_false_v<_ElementType>, "Unsupported type"); + return ::DLDataType{}; } - _CCCL_UNREACHABLE(); - return ::DLDataType{}; } template <::cuda::std::size_t _Rank> From 14cf251c7bf243277b6a58ea528d87d82a174945 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 5 Jan 2026 16:13:46 -0800 Subject: [PATCH 46/56] documentation and copyright --- docs/libcudacxx/extended_api/mdspan.rst | 6 + .../extended_api/mdspan/dlpack_to_mdspan.rst | 130 ++++++++++++++++++ .../include/cuda/__mdspan/dlpack_to_mdspan.h | 2 +- .../dlpack_to_mdspan.pass.cpp | 2 +- 4 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst diff --git a/docs/libcudacxx/extended_api/mdspan.rst b/docs/libcudacxx/extended_api/mdspan.rst index ca0582fa0d5..f0b29f3d1d0 100644 --- a/docs/libcudacxx/extended_api/mdspan.rst +++ b/docs/libcudacxx/extended_api/mdspan.rst @@ -11,6 +11,7 @@ Mdspan mdspan/restrict_accessor mdspan/shared_memory_accessor mdspan/mdspan_to_dlpack + mdspan/dlpack_to_mdspan .. list-table:: :widths: 25 45 30 30 @@ -40,3 +41,8 @@ Mdspan - Convert a ``mdspan`` to a ``DLTensor`` - CCCL 3.2.0 - CUDA 13.2 + + * - :ref:`dlpack to mdspan ` + - Convert a ``DLTensor`` to a ``mdspan`` + - CCCL 3.2.0 + - CUDA 13.2 diff --git a/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst new file mode 100644 index 00000000000..67e038c1018 --- /dev/null +++ b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst @@ -0,0 +1,130 @@ +.. _libcudacxx-extended-api-mdspan-dlpack-to-mdspan: + +DLPack to ``mdspan`` +==================== + +This functionality provides a conversion from `DLPack `__ ``DLTensor`` to ``cuda::host_mdspan``, ``cuda::device_mdspan``, and ``cuda::managed_mdspan``. + +Defined in the ```` header. + +Conversion functions +-------------------- + +.. code:: cuda + + namespace cuda { + + template + [[nodiscard]] cuda::host_mdspan, LayoutPolicy> + to_host_mdspan(const DLTensor& tensor); + + template + [[nodiscard]] cuda::device_mdspan, LayoutPolicy> + to_device_mdspan(const DLTensor& tensor); + + template + [[nodiscard]] cuda::managed_mdspan, LayoutPolicy> + to_managed_mdspan(const DLTensor& tensor); + + } // namespace cuda + +Template parameters +------------------- + +- ``ElementType``: The element type of the resulting ``mdspan``. Must match the ``DLTensor::dtype``. +- ``Rank``: The number of dimensions. Must match ``DLTensor::ndim``. +- ``LayoutPolicy``: The layout policy for the resulting ``mdspan``. Defaults to ``cuda::std::layout_stride``. Supported layouts are: + + - ``cuda::std::layout_right`` (C-contiguous, row-major) + - ``cuda::std::layout_left`` (Fortran-contiguous, column-major) + - ``cuda::std::layout_stride`` (general strided layout) + +Semantics +--------- + +The conversion produces a non-owning ``mdspan`` view of the ``DLTensor`` data: + +- The ``mdspan`` data pointer is computed as ``static_cast(tensor.data) + tensor.byte_offset``. +- For ``rank > 0``, ``mdspan.extent(i)`` is ``tensor.shape[i]``. 
+- For ``layout_stride``, ``mdspan.stride(i)`` is ``tensor.strides[i]`` (or computed as row-major if ``strides`` is ``nullptr`` for DLPack < v1.2). +- The device type is validated: + + - ``kDLCPU`` for ``to_host_mdspan`` + - ``kDLCUDA`` for ``to_device_mdspan`` + - ``kDLCUDAManaged`` for ``to_managed_mdspan`` + +Constraints +----------- + +- ``LayoutPolicy`` must be one of ``cuda::std::layout_right``, ``cuda::std::layout_left``, or ``cuda::std::layout_stride``. +- For ``layout_right`` and ``layout_left``, the ``DLTensor`` strides must be compatible with the layout. + +Runtime errors +-------------- + +The conversion throws ``std::invalid_argument`` in the following cases: + +- ``DLTensor::ndim`` does not match the specified ``Rank``. +- ``DLTensor::dtype`` does not match ``ElementType``. +- ``DLTensor::data`` is ``nullptr``. +- ``DLTensor::shape`` is ``nullptr`` (for rank > 0). +- Any ``DLTensor::shape[i]`` is negative. +- ``DLTensor::strides`` is ``nullptr`` for DLPack v1.2 or later. +- ``DLTensor::strides`` is ``nullptr`` for ``layout_left`` with rank > 1 (DLPack < v1.2). +- ``DLTensor::strides[i]`` is not positive for ``layout_stride``. +- ``DLTensor::strides`` are not compatible with the requested ``layout_right`` or ``layout_left``. +- ``DLTensor::device.device_type`` does not match the target mdspan type. +- Data pointer is not properly aligned for the element type. + +Availability notes +------------------ + +- This API is available only when DLPack header is present, namely ```` is found in the include path. +- Requires DLPack major version 1. + +References +---------- + +- `DLPack C API `__ documentation. + +Example +------- + +.. code:: cuda + + #include + #include + #include + #include + + int main() { + int data[6] = {0, 1, 2, 3, 4, 5}; + + // Create a DLTensor manually for demonstration + int64_t shape[2] = {2, 3}; + int64_t strides[2] = {3, 1}; // row-major strides + + DLTensor tensor{}; + tensor.data = data; + tensor.device = {kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = {kDLInt, 32, 1}; + tensor.shape = shape; + tensor.strides = strides; + tensor.byte_offset = 0; + + // Convert to host_mdspan + auto md = cuda::to_host_mdspan(tensor); + + assert(md.rank() == 2); + assert(md.extent(0) == 2 && md.extent(1) == 3); + assert(md.stride(0) == 3 && md.stride(1) == 1); + assert(md.data_handle() == data); + assert(md(0, 0) == 0 && md(1, 2) == 5); + } + +See also +-------- + +- :ref:`libcudacxx-extended-api-mdspan-mdspan-to-dlpack` for the reverse conversion. + diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h index 2f6dd61619a..2ade5d4ff8e 100644 --- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -3,7 +3,7 @@ // Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
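A minimal sketch of the ``byte_offset`` behavior described in the Semantics section above, assuming a 32-bit ``int``, CPU data, and that the conversion and DLPack headers are already included; the names are illustrative only:

.. code:: cuda

   // The view starts two ints into the buffer via byte_offset.
   int buffer[8] = {0, 0, 1, 2, 3, 4, 5, 6};
   int64_t off_shape[1]   = {6};
   int64_t off_strides[1] = {1};

   DLTensor offset_tensor{};
   offset_tensor.data        = buffer;
   offset_tensor.device      = {kDLCPU, 0};
   offset_tensor.ndim        = 1;
   offset_tensor.dtype       = DLDataType{kDLInt, 32, 1};
   offset_tensor.shape       = off_shape;
   offset_tensor.strides     = off_strides;
   offset_tensor.byte_offset = 2 * sizeof(int); // skip the first two ints

   auto off_md = cuda::to_host_mdspan<int, 1>(offset_tensor);
   assert(off_md(0) == 1 && off_md(5) == 6);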
// //===----------------------------------------------------------------------===// diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp index c797fec4b71..c393b99fcab 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp @@ -3,7 +3,7 @@ // Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// // UNSUPPORTED: nvrtc From eb2635a03245bcbfd805ed09de4d1d3f5dddede2 Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 5 Jan 2026 16:28:52 -0800 Subject: [PATCH 47/56] fix index_operator.pass --- .../views/mdspan/shared_mem_mdspan/index_operator.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/shared_mem_mdspan/index_operator.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/shared_mem_mdspan/index_operator.pass.cpp index e4e522e6555..9b1666241a5 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/shared_mem_mdspan/index_operator.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/shared_mem_mdspan/index_operator.pass.cpp @@ -163,7 +163,7 @@ __device__ void test_layout() #if _CCCL_HAS_MULTIARG_OPERATOR_BRACKETS() test_iteration(construct_mapping(Layout(), cuda::std::extents())); - __shared__ int data[1]; + __shared__ int data[16]; // Check operator constraint for number of arguments static_assert(check_operator_constraints( cuda::shared_memory_mdspan(data, construct_mapping(Layout(), cuda::std::extents(1))), 0)); From ea7e4e4255615bc53e32e08d0a4d7bded06b940e Mon Sep 17 00:00:00 2001 From: fbusato Date: Mon, 5 Jan 2026 16:30:09 -0800 Subject: [PATCH 48/56] fix formatting --- docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst index 67e038c1018..1863c6c580b 100644 --- a/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst +++ b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst @@ -127,4 +127,3 @@ See also -------- - :ref:`libcudacxx-extended-api-mdspan-mdspan-to-dlpack` for the reverse conversion. 
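A minimal sketch of how the fixed layouts interact with stride validation, reusing the 2x3 row-major tensor (``shape = {2, 3}``, ``strides = {3, 1}``) from the example above; names are illustrative and the necessary headers are assumed to be included:

.. code:: cuda

   // layout_right succeeds: the strides match the row-major rule.
   auto md_right = cuda::to_host_mdspan<int, 2, cuda::std::layout_right>(tensor);
   assert(md_right.stride(0) == 3 && md_right.stride(1) == 1);

   // The same strides are not column-major ({1, 2} would be), so requesting
   // layout_left throws std::invalid_argument.
   try
   {
     auto md_left = cuda::to_host_mdspan<int, 2, cuda::std::layout_left>(tensor);
     (void) md_left; // not reached
   }
   catch (const std::invalid_argument&)
   {
     // expected: strides incompatible with layout_left
   }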
- From b6a52cd5df47c9492dd4fffa3b197c1308d9eb07 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 7 Jan 2026 09:33:57 -0800 Subject: [PATCH 49/56] use internal type --- .../extended_api/mdspan/mdspan_to_dlpack.rst | 27 ++++++++++--------- .../include/cuda/__mdspan/mdspan_to_dlpack.h | 24 ++++++++--------- .../mdspan_to_dlpack.wrapper.pass.cpp | 4 +-- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst index 75d089a6d13..bb04aae5002 100644 --- a/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst +++ b/docs/libcudacxx/extended_api/mdspan/mdspan_to_dlpack.rst @@ -15,16 +15,16 @@ Conversion functions namespace cuda { template - [[nodiscard]] dlpack_tensor + [[nodiscard]] __dlpack_tensor to_dlpack(const cuda::host_mdspan& mdspan); template - [[nodiscard]] dlpack_tensor + [[nodiscard]] __dlpack_tensor to_dlpack(const cuda::device_mdspan& mdspan, cuda::device_ref device = cuda::device_ref{0}); template - [[nodiscard]] dlpack_tensor + [[nodiscard]] __dlpack_tensor to_dlpack(const cuda::managed_mdspan& mdspan); } // namespace cuda @@ -32,19 +32,21 @@ Conversion functions Types ----- +``__dlpack_tensor`` is an internal class that stores a ``DLTensor`` and owns the backing storage for its ``shape`` and ``strides`` pointers. The class does not use any heap allocation. + .. code:: cuda namespace cuda { template - class dlpack_tensor { + class __dlpack_tensor { public: - dlpack_tensor(); - dlpack_tensor(const dlpack_tensor&) noexcept; - dlpack_tensor(dlpack_tensor&&) noexcept; - dlpack_tensor& operator=(const dlpack_tensor&) noexcept; - dlpack_tensor& operator=(dlpack_tensor&&) noexcept; - ~dlpack_tensor() noexcept = default; + __dlpack_tensor(); + __dlpack_tensor(const __dlpack_tensor&) noexcept; + __dlpack_tensor(__dlpack_tensor&&) noexcept; + __dlpack_tensor& operator=(const __dlpack_tensor&) noexcept; + __dlpack_tensor& operator=(__dlpack_tensor&&) noexcept; + ~__dlpack_tensor() noexcept = default; DLTensor& get() noexcept; const DLTensor& get() const noexcept; @@ -52,11 +54,11 @@ Types } // namespace cuda -``cuda::dlpack_tensor`` stores a ``DLTensor`` and owns the backing storage for its ``shape`` and ``strides`` pointers. The class does not use any heap allocation. +``cuda::__dlpack_tensor`` stores a ``DLTensor`` and owns the backing storage for its ``shape`` and ``strides`` pointers. The class does not use any heap allocation. .. note:: **Lifetime** - The ``DLTensor`` associated with ``cuda::dlpack_tensor`` must not outlive the wrapper. If the wrapper is destroyed, the returned ``DLTensor::shape`` and ``DLTensor::strides`` pointers will dangle. + The ``DLTensor`` associated with ``cuda::__dlpack_tensor`` must not outlive the wrapper. If the wrapper is destroyed, the returned ``DLTensor::shape`` and ``DLTensor::strides`` pointers will dangle. .. note:: **Const-correctness** @@ -124,6 +126,7 @@ Example auto dl = cuda::to_dlpack(md); const auto& dltensor = dl.get(); + // auto dltensor = dl.get(); is incorrect; it returns a reference to a temporary object that will be destroyed at the end of the statement. // `dl` owns the shape/stride storage; `dltensor.data` is a non-owning pointer to `data`. 
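   // Illustrative lifetime pitfall (a sketch following the Lifetime note above):
   // calling to_dlpack on a temporary wrapper and keeping only the DLTensor
   // dangles immediately, because the shape/strides storage lives in the wrapper:
   //   DLTensor bad = cuda::to_dlpack(md).get(); // wrapper destroyed at the end of
   //                                             // this statement; bad.shape and
   //                                             // bad.strides now dangle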
assert(dltensor.device.device_type == kDLCPU); diff --git a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h index ecac4633178..b5bdba1c80a 100644 --- a/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h +++ b/libcudacxx/include/cuda/__mdspan/mdspan_to_dlpack.h @@ -156,7 +156,7 @@ template } template <::cuda::std::size_t _Rank> -class dlpack_tensor +class __dlpack_tensor { ::cuda::std::array<::cuda::std::int64_t, _Rank> __shape{}; ::cuda::std::array<::cuda::std::int64_t, _Rank> __strides{}; @@ -169,12 +169,12 @@ class dlpack_tensor } public: - _CCCL_HOST_API explicit dlpack_tensor() noexcept + _CCCL_HOST_API explicit __dlpack_tensor() noexcept { __update_tensor(); } - _CCCL_HOST_API dlpack_tensor(const dlpack_tensor& __other) noexcept + _CCCL_HOST_API __dlpack_tensor(const __dlpack_tensor& __other) noexcept : __shape{__other.__shape} , __strides{__other.__strides} , __tensor{__other.__tensor} @@ -182,7 +182,7 @@ class dlpack_tensor __update_tensor(); } - _CCCL_HOST_API dlpack_tensor(dlpack_tensor&& __other) noexcept + _CCCL_HOST_API __dlpack_tensor(__dlpack_tensor&& __other) noexcept : __shape{::cuda::std::move(__other.__shape)} , __strides{::cuda::std::move(__other.__strides)} , __tensor{__other.__tensor} @@ -191,7 +191,7 @@ class dlpack_tensor __update_tensor(); } - _CCCL_HOST_API dlpack_tensor& operator=(const dlpack_tensor& __other) noexcept + _CCCL_HOST_API __dlpack_tensor& operator=(const __dlpack_tensor& __other) noexcept { if (this == &__other) { @@ -204,7 +204,7 @@ class dlpack_tensor return *this; } - _CCCL_HOST_API dlpack_tensor& operator=(dlpack_tensor&& __other) noexcept + _CCCL_HOST_API __dlpack_tensor& operator=(__dlpack_tensor&& __other) noexcept { if (this == &__other) { @@ -218,7 +218,7 @@ class dlpack_tensor return *this; } - _CCCL_HIDE_FROM_ABI ~dlpack_tensor() noexcept = default; + _CCCL_HIDE_FROM_ABI ~__dlpack_tensor() noexcept = default; [[nodiscard]] _CCCL_HOST_API ::DLTensor& get() noexcept { @@ -232,14 +232,14 @@ class dlpack_tensor }; template -[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()> __to_dlpack(const ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, ::DLDeviceType __device_type, int __device_id) { static_assert(::cuda::std::is_pointer_v, "data_handle_type must be a pointer"); using __element_type = ::cuda::std::remove_cv_t<_ElementType>; - dlpack_tensor<_Extents::rank()> __wrapper{}; + __dlpack_tensor<_Extents::rank()> __wrapper{}; auto& __tensor = __wrapper.get(); __tensor.data = __mdspan.size() > 0 ? 
const_cast<__element_type*>(__mdspan.data_handle()) : nullptr; __tensor.device = ::DLDevice{__device_type, __device_id}; @@ -271,7 +271,7 @@ __to_dlpack(const ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor **********************************************************************************************************************/ template -[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()> to_dlpack(const ::cuda::host_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) { using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; @@ -279,7 +279,7 @@ to_dlpack(const ::cuda::host_mdspan<_ElementType, _Extents, _Layout, _Accessor>& } template -[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()> to_dlpack(const ::cuda::device_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan, ::cuda::device_ref __device = ::cuda::device_ref{0}) { @@ -288,7 +288,7 @@ to_dlpack(const ::cuda::device_mdspan<_ElementType, _Extents, _Layout, _Accessor } template -[[nodiscard]] _CCCL_HOST_API dlpack_tensor<_Extents::rank()> +[[nodiscard]] _CCCL_HOST_API __dlpack_tensor<_Extents::rank()> to_dlpack(const ::cuda::managed_mdspan<_ElementType, _Extents, _Layout, _Accessor>& __mdspan) { using __mdspan_type = ::cuda::std::mdspan<_ElementType, _Extents, _Layout, _Accessor>; diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp index 58c6f99d9c8..e54cf6c93b6 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/mdspan_to_dlpack/mdspan_to_dlpack.wrapper.pass.cpp @@ -27,7 +27,7 @@ void check_datatype(const DLDataType& dt, uint8_t code, uint8_t bits, uint16_t l bool test_mdspan_to_dlpack_wrapper_default_ctor() { - cuda::dlpack_tensor<3> dlpack_wrapper{}; + cuda::__dlpack_tensor<3> dlpack_wrapper{}; DLDataType default_dtype = {}; DLDevice default_device = {}; auto& tensor = dlpack_wrapper.get(); @@ -185,7 +185,7 @@ bool test_dlpack_wrapper_move_assignment() bool test_dlpack_wrapper_get() { - using wrapper_t = cuda::dlpack_tensor<2>; + using wrapper_t = cuda::__dlpack_tensor<2>; static_assert(cuda::std::is_same_v().get()), ::DLTensor&>); static_assert(cuda::std::is_same_v().get()), const ::DLTensor&>); From 9bbf73b0cb3644529eb834161a79330844b48f81 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 7 Jan 2026 09:47:42 -0800 Subject: [PATCH 50/56] address comments --- .../extended_api/mdspan/dlpack_to_mdspan.rst | 6 +++--- .../include/cuda/__mdspan/dlpack_to_mdspan.h | 19 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst index 1863c6c580b..61497f6621d 100644 --- a/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst +++ b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst @@ -15,15 +15,15 @@ Conversion functions namespace cuda { template - [[nodiscard]] cuda::host_mdspan, LayoutPolicy> + [[nodiscard]] cuda::host_mdspan, LayoutPolicy> to_host_mdspan(const DLTensor& tensor); template - [[nodiscard]] cuda::device_mdspan, LayoutPolicy> + [[nodiscard]] cuda::device_mdspan, 
LayoutPolicy> to_device_mdspan(const DLTensor& tensor); template - [[nodiscard]] cuda::managed_mdspan, LayoutPolicy> + [[nodiscard]] cuda::managed_mdspan, LayoutPolicy> to_managed_mdspan(const DLTensor& tensor); } // namespace cuda diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h index 2ade5d4ff8e..91fe65ab6c3 100644 --- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -28,6 +28,7 @@ # include # include # include +# include # include # include # include @@ -125,10 +126,10 @@ _CCCL_HOST_API void __validate_dlpack_strides(const ::DLTensor& __tensor, [[mayb template [[nodiscard]] -_CCCL_HOST_API ::cuda::std::mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +_CCCL_HOST_API ::cuda::std::mdspan<_ElementType, ::cuda::std::dims<_Rank, ::cuda::std::int64_t>, _LayoutPolicy> __to_mdspan(const ::DLTensor& __tensor) { - using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __extents_type = ::cuda::std::dims<_Rank, ::cuda::std::int64_t>; using __mdspan_type = ::cuda::std::mdspan<_ElementType, __extents_type, _LayoutPolicy>; using __mapping_type = typename _LayoutPolicy::template mapping<__extents_type>; using __element_type = typename __mdspan_type::element_type; @@ -144,7 +145,7 @@ __to_mdspan(const ::DLTensor& __tensor) } else { - if (__tensor.ndim != int{_Rank}) + if (cuda::std::cmp_not_equal(__tensor.ndim, _Rank)) { _CCCL_THROW(::std::invalid_argument{"DLTensor rank does not match expected rank"}); } @@ -213,42 +214,42 @@ __to_mdspan(const ::DLTensor& __tensor) template [[nodiscard]] -_CCCL_HOST_API ::cuda::host_mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +_CCCL_HOST_API ::cuda::host_mdspan<_ElementType, ::cuda::std::dims<_Rank, ::cuda::std::int64_t>, _LayoutPolicy> to_host_mdspan(const ::DLTensor& __tensor) { if (__tensor.device.device_type != ::kDLCPU) { _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCPU for host_mdspan"}); } - using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __extents_type = ::cuda::std::dims<_Rank, ::cuda::std::int64_t>; using __mdspan_type = ::cuda::host_mdspan<_ElementType, __extents_type, _LayoutPolicy>; return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; } template [[nodiscard]] -_CCCL_HOST_API ::cuda::device_mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +_CCCL_HOST_API ::cuda::device_mdspan<_ElementType, ::cuda::std::dims<_Rank, ::cuda::std::int64_t>, _LayoutPolicy> to_device_mdspan(const ::DLTensor& __tensor) { if (__tensor.device.device_type != ::kDLCUDA) { _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCUDA for device_mdspan"}); } - using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __extents_type = ::cuda::std::dims<_Rank, ::cuda::std::int64_t>; using __mdspan_type = ::cuda::device_mdspan<_ElementType, __extents_type, _LayoutPolicy>; return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; } template [[nodiscard]] -_CCCL_HOST_API ::cuda::managed_mdspan<_ElementType, ::cuda::std::dextents<::cuda::std::int64_t, _Rank>, _LayoutPolicy> +_CCCL_HOST_API ::cuda::managed_mdspan<_ElementType, ::cuda::std::dims<_Rank, ::cuda::std::int64_t>, _LayoutPolicy> to_managed_mdspan(const ::DLTensor& __tensor) { if 
(__tensor.device.device_type != ::kDLCUDAManaged) { _CCCL_THROW(::std::invalid_argument{"DLTensor device type must be kDLCUDAManaged for managed_mdspan"}); } - using __extents_type = ::cuda::std::dextents<::cuda::std::int64_t, _Rank>; + using __extents_type = ::cuda::std::dims<_Rank, ::cuda::std::int64_t>; using __mdspan_type = ::cuda::managed_mdspan<_ElementType, __extents_type, _LayoutPolicy>; return __mdspan_type{::cuda::__to_mdspan<_ElementType, _Rank, _LayoutPolicy>(__tensor)}; } From ba38968e0d41934c89b307489272babeb040fe6b Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 14 Jan 2026 10:29:16 -0800 Subject: [PATCH 51/56] use _CCCL_HAS_DLPACK --- libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h index 91fe65ab6c3..b2dc3640c6b 100644 --- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -20,8 +20,9 @@ # pragma system_header #endif // no system header -#if !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() +#if _CCCL_HAS_DLPACK() +# include # include # include # include @@ -41,8 +42,6 @@ _CCCL_BEGIN_NAMESPACE_CUDA -static_assert(DLPACK_MAJOR_VERSION == 1, "DLPACK_MAJOR_VERSION must be 1"); - template [[nodiscard]] _CCCL_HOST_API inline bool __validate_dlpack_data_type(const ::DLDataType& __dtype) noexcept { @@ -258,5 +257,5 @@ _CCCL_END_NAMESPACE_CUDA # include -#endif // !_CCCL_COMPILER(NVRTC) && _CCCL_HAS_INCLUDE() +#endif // __CCCL_HAS_DLPACK() #endif // _CUDA___MDSPAN_DLPACK_TO_MDSPAN_H From 4774c70c4df2d590a5a755a037c2989f9e6451f8 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 22 Jan 2026 11:27:36 -0800 Subject: [PATCH 52/56] align with mdspan_to_dlpack --- .../extended_api/mdspan/dlpack_to_mdspan.rst | 11 +- libcudacxx/include/cuda/__internal/dlpack.h | 2 +- .../include/cuda/__mdspan/dlpack_to_mdspan.h | 37 +- .../dlpack_to_mdspan.exceptions.pass.cpp | 363 +++++++++ .../dlpack_to_mdspan.pass.cpp | 745 ++++++++---------- 5 files changed, 706 insertions(+), 452 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.exceptions.pass.cpp diff --git a/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst index 61497f6621d..3395e311400 100644 --- a/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst +++ b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst @@ -53,6 +53,15 @@ The conversion produces a non-owning ``mdspan`` view of the ``DLTensor`` data: - ``kDLCUDA`` for ``to_device_mdspan`` - ``kDLCUDAManaged`` for ``to_managed_mdspan`` +Supported element types: + +- ``bool``. +- Signed and unsigned integers. +- IEEE-754 Floating-point and extended precision floating-point, including ``__half``, ``__nv_bfloat16``, ``__float128``, FP8, FP6, FP4 when available. +- Complex: ``cuda::std::complex<__half>``, ``cuda::std::complex``, and ``cuda::std::complex``. +- `CUDA built-in vector types `__, such as ``int2``, ``float4``, etc. +- Vector types for extended floating-point, such as ``__half2``, ``__nv_fp8x4_e4m3``, etc. 
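A minimal sketch for one of the extended floating-point element types listed above, assuming FP16 support is available, that ``__half`` maps to ``DLDataType{kDLFloat, 16, 1}`` per the DLPack convention, and that the necessary headers are included; names are illustrative:

.. code:: cuda

   __half half_data[4] = {};
   int64_t half_shape[1]   = {4};
   int64_t half_strides[1] = {1};

   DLTensor half_tensor{};
   half_tensor.data    = half_data;
   half_tensor.device  = {kDLCPU, 0};
   half_tensor.ndim    = 1;
   half_tensor.dtype   = DLDataType{kDLFloat, 16, 1};
   half_tensor.shape   = half_shape;
   half_tensor.strides = half_strides;

   auto half_md = cuda::to_host_mdspan<__half, 1>(half_tensor);
   assert(half_md.extent(0) == 4);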
+ Constraints ----------- @@ -108,7 +117,7 @@ Example tensor.data = data; tensor.device = {kDLCPU, 0}; tensor.ndim = 2; - tensor.dtype = {kDLInt, 32, 1}; + tensor.dtype = DLDataType{kDLInt, 32, 1}; tensor.shape = shape; tensor.strides = strides; tensor.byte_offset = 0; diff --git a/libcudacxx/include/cuda/__internal/dlpack.h b/libcudacxx/include/cuda/__internal/dlpack.h index 61fb5dfcd2a..5af47c34b08 100644 --- a/libcudacxx/include/cuda/__internal/dlpack.h +++ b/libcudacxx/include/cuda/__internal/dlpack.h @@ -26,7 +26,7 @@ # include # define _CCCL_DLPACK_AT_LEAST(_MAJOR, _MINOR) \ - (DLPACK_MAJOR_VERSION > (_MAJOR) || (DLPACK_MAJOR_VERSION == (_MAJOR) && DLPACK_VERSION_MINOR >= (_MINOR))) + (DLPACK_MAJOR_VERSION > (_MAJOR) || (DLPACK_MAJOR_VERSION == (_MAJOR) && DLPACK_MINOR_VERSION >= (_MINOR))) # define _CCCL_DLPACK_BELOW(_MAJOR, _MINOR) (!_CCCL_DLPACK_AT_LEAST(_MAJOR, _MINOR)) # if DLPACK_MAJOR_VERSION != 1 diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h index b2dc3640c6b..4c052b901bb 100644 --- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h +++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h @@ -22,6 +22,7 @@ #if _CCCL_HAS_DLPACK() +# include # include # include # include @@ -50,12 +51,16 @@ template } [[nodiscard]] -_CCCL_HOST_API inline ::cuda::std::int64_t __layout_right_stride( - const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t __pos, ::cuda::std::size_t __rank) noexcept +_CCCL_HOST_API inline ::cuda::std::int64_t +__get_layout_right_stride(const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t __pos, ::cuda::std::size_t __rank) { ::cuda::std::int64_t __stride = 1; for (auto __i = __pos + 1; __i < __rank; ++__i) { + if (__stride * __shapes[__i] < 0 || ::cuda::mul_hi(__stride, __shapes[__i]) != 0) // TODO: replace with mul_overflow + { + _CCCL_THROW(::std::invalid_argument{"shape overflow"}); + } __stride *= __shapes[__i]; // TODO: check for overflow } return __stride; @@ -63,12 +68,16 @@ _CCCL_HOST_API inline ::cuda::std::int64_t __layout_right_stride( [[nodiscard]] _CCCL_HOST_API inline ::cuda::std::int64_t -__layout_left_stride(const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t __pos) noexcept +__get_layout_left_stride(const ::cuda::std::int64_t* __shapes, ::cuda::std::size_t __pos) { ::cuda::std::int64_t __stride = 1; for (::cuda::std::size_t __i = 0; __i < __pos; ++__i) { - __stride *= __shapes[__i]; // TODO: check for overflow + if (__stride * __shapes[__i] < 0 || ::cuda::mul_hi(__stride, __shapes[__i]) != 0) // TODO: replace with mul_overflow + { + _CCCL_THROW(::std::invalid_argument{"shape overflow"}); + } + __stride *= __shapes[__i]; } return __stride; } @@ -83,7 +92,7 @@ _CCCL_HOST_API void __validate_dlpack_strides(const ::DLTensor& __tensor, [[mayb const auto __strides_ptr = __tensor.strides; if (__strides_ptr == nullptr) { -# if DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) +# if _CCCL_DLPACK_AT_LEAST(1, 2) _CCCL_THROW(::std::invalid_argument{"strides=nullptr is not supported for DLPack v1.2 and later"}); # else // strides == nullptr means row-major (C-contiguous) layout @@ -95,20 +104,20 @@ _CCCL_HOST_API void __validate_dlpack_strides(const ::DLTensor& __tensor, [[mayb { return; } -# endif // DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) +# endif // _CCCL_DLPACK_AT_LEAST(1, 2) } for (::cuda::std::size_t __pos = 0; __pos < __rank; ++__pos) { if constexpr (__is_layout_right) { - if 
(__strides_ptr[__pos] != ::cuda::__layout_right_stride(__tensor.shape, __pos, __rank)) + if (__strides_ptr[__pos] != ::cuda::__get_layout_right_stride(__tensor.shape, __pos, __rank)) { _CCCL_THROW(::std::invalid_argument{"DLTensor strides are not compatible with layout_right"}); } } else if constexpr (__is_layout_left) { - if (__strides_ptr[__pos] != ::cuda::__layout_left_stride(__tensor.shape, __pos)) + if (__strides_ptr[__pos] != ::cuda::__get_layout_left_stride(__tensor.shape, __pos)) { _CCCL_THROW(::std::invalid_argument{"DLTensor strides are not compatible with layout_left"}); } @@ -117,7 +126,7 @@ _CCCL_HOST_API void __validate_dlpack_strides(const ::DLTensor& __tensor, [[mayb { if (__strides_ptr[__pos] <= 0) { - _CCCL_THROW(::std::invalid_argument{"mdspan strides must be positive"}); + _CCCL_THROW(::std::invalid_argument{"layout_stride requires strictly positive strides"}); } } } @@ -135,11 +144,10 @@ __to_mdspan(const ::DLTensor& __tensor) constexpr bool __is_layout_right = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_right>; constexpr bool __is_layout_left = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_left>; constexpr bool __is_layout_stride = ::cuda::std::is_same_v<_LayoutPolicy, ::cuda::std::layout_stride>; - // TODO: add support for layout_right_padded and layout_left_padded + // TODO: add support for layout_stride_relaxed, layout_right_padded, layout_left_padded if constexpr (!__is_layout_right && !__is_layout_left && !__is_layout_stride) { static_assert(::cuda::std::__always_false_v<_LayoutPolicy>, "Unsupported layout policy"); - _CCCL_UNREACHABLE(); return __mdspan_type{}; } else @@ -182,7 +190,7 @@ __to_mdspan(const ::DLTensor& __tensor) { if (__tensor.shape[__i] < 0) { - _CCCL_THROW(::std::invalid_argument{"DLTensor shape must be positive"}); + _CCCL_THROW(::std::invalid_argument{"DLTensor shapes must be positive"}); } __extents_array[__i] = __tensor.shape[__i]; } @@ -194,14 +202,13 @@ __to_mdspan(const ::DLTensor& __tensor) { const bool __has_strides = __tensor.strides != nullptr; __strides_array[__i] = - __has_strides ? __tensor.strides[__i] : ::cuda::__layout_right_stride(__tensor.shape, __i, _Rank); + __has_strides ? __tensor.strides[__i] : ::cuda::__get_layout_right_stride(__tensor.shape, __i, _Rank); } return __mdspan_type{__data, __mapping_type{__extents_array, __strides_array}}; } else { - __extents_type __extents{__extents_array}; - return __mdspan_type{__data, __extents}; + return __mdspan_type{__data, __extents_type{__extents_array}}; } } } diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.exceptions.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.exceptions.pass.cpp new file mode 100644 index 00000000000..8fc6e55839c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.exceptions.pass.cpp @@ -0,0 +1,363 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
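For reference, the stride rules validated here follow the usual contiguous-layout formulas: for ``layout_right`` the stride of dimension ``i`` is the product of the extents after ``i``, and for ``layout_left`` it is the product of the extents before ``i``. For ``shape = {2, 3, 4}`` this gives ``{12, 4, 1}`` (row-major) and ``{1, 2, 6}`` (column-major), matching the rank-3 tests added later in this patch.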
+// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: nvrtc + +#include +#include +#include +#include + +#include + +#include "test_macros.h" +#include + +template +using dlpack_array = cuda::std::array; + +//---------------------------------------------------------------------------------------------------------------------- +// Exception tests + +void test_exception_wrong_rank() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // row-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + // Try to convert rank-2 tensor to rank-1 mdspan + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_dtype() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; // dtype is int + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + // Try to convert int tensor to float mdspan + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_data() +{ + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = nullptr; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_shape() +{ + cuda::std::array data{}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = nullptr; // null shape + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_negative_shape() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {-3}; // negative shape + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_host() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{::kDLCUDA, 0}; // CUDA device, not CPU + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch 
(const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_device() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; // CPU device, not CUDA + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + unused(cuda::to_device_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_wrong_device_type_managed() +{ + cuda::std::array data{}; + dlpack_array<1> shape = {4}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; // CPU device, not CUDA managed + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + unused(cuda::to_managed_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_stride_mismatch_layout_right() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {1, 2}; // Column-major, not row-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_stride_mismatch_layout_left() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {3, 1}; // Row-major, not column-major + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_zero_stride_layout_stride() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + dlpack_array<2> strides = {0, 1}; // Zero stride is invalid + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_null_strides_dlpack_v12() +{ + cuda::std::array data{}; + dlpack_array<2> shape = {2, 3}; + DLTensor tensor{}; + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = nullptr; // null strides not allowed in DLPack v1.2+ + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +void test_exception_misaligned_data() +{ + // 
Create a buffer that allows us to get a misaligned pointer + alignas(16) cuda::std::array buffer{}; + // Get a pointer that's 1 byte into the buffer (misaligned for int) + auto misaligned_ptr = reinterpret_cast(buffer.data() + 1); + dlpack_array<1> shape = {3}; + dlpack_array<1> strides = {1}; + DLTensor tensor{}; + tensor.data = misaligned_ptr; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + bool caught = false; + try + { + unused(cuda::to_host_mdspan(tensor)); + } + catch (const std::invalid_argument&) + { + caught = true; + } + assert(caught); +} + +bool test_exceptions() +{ + test_exception_wrong_rank(); + test_exception_wrong_dtype(); + test_exception_null_data(); + test_exception_null_shape(); + test_exception_negative_shape(); + test_exception_wrong_device_type_host(); + test_exception_wrong_device_type_device(); + test_exception_wrong_device_type_managed(); + test_exception_stride_mismatch_layout_right(); + test_exception_stride_mismatch_layout_left(); + test_exception_zero_stride_layout_stride(); +#if DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) + test_exception_null_strides_dlpack_v12(); +#endif + test_exception_misaligned_data(); + return true; +} + +int main(int, char**) +{ + NV_IF_TARGET(NV_IS_HOST, (assert(test_exceptions());)) + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp index c393b99fcab..c2d4b7fdd94 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/containers/views/mdspan/dlpack_to_mdspan/dlpack_to_mdspan.pass.cpp @@ -22,19 +22,17 @@ template using dlpack_array = cuda::std::array; -//============================================================================== -// Test: Rank-0 mdspan conversion -//============================================================================== +//---------------------------------------------------------------------------------------------------------------------- +// Rank-0 mdspan conversion bool test_rank0() { float data = 42.0f; DLTensor tensor{}; - tensor.data = &data; - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 0; - tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; - + tensor.data = &data; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 0; + tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; auto host_mdspan = cuda::to_host_mdspan(tensor); assert(host_mdspan.rank() == 0); @@ -44,35 +42,120 @@ bool test_rank0() return true; } -//============================================================================== -// Test: Empty tensor (zero in one dimension) -//============================================================================== +//---------------------------------------------------------------------------------------------------------------------- +// Empty tensor (zero in one dimension) -bool test_empty_tensor() +bool test_empty_tensor_layout_right_first_dim_zero() { int dummy = 0; // Non-null but won't be accessed dlpack_array<2> shape = {0, 5}; dlpack_array<2> strides = {5, 1}; // row-major DLTensor tensor{}; - tensor.data = &dummy; - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 2; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 
32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); + tensor.data = &dummy; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + auto host_mdspan = cuda::to_host_mdspan(tensor); + assert(host_mdspan.extent(0) == 0); + assert(host_mdspan.extent(1) == 5); + assert(host_mdspan.size() == 0); + assert(host_mdspan.empty()); + return true; +} + +bool test_empty_tensor_layout_right_second_dim_zero() +{ + int dummy = 0; // Non-null but won't be accessed + dlpack_array<2> shape = {2, 0}; + dlpack_array<2> strides = {0, 1}; // row-major: stride[0] = 0 * 1 = 0, stride[1] = 1 + DLTensor tensor{}; + tensor.data = &dummy; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); auto host_mdspan = cuda::to_host_mdspan(tensor); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 0); + assert(host_mdspan.size() == 0); + assert(host_mdspan.empty()); + return true; +} + +bool test_empty_tensor_layout_left_first_dim_zero() +{ + int dummy = 0; // Non-null but won't be accessed + dlpack_array<2> shape = {0, 5}; + dlpack_array<2> strides = {1, 0}; // column-major: stride[0] = 1, stride[1] = 0 * 1 = 0 + DLTensor tensor{}; + tensor.data = &dummy; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.extent(0) == 0); + assert(host_mdspan.extent(1) == 5); + assert(host_mdspan.size() == 0); + assert(host_mdspan.empty()); + return true; +} + +bool test_empty_tensor_layout_stride_explicit_strides() +{ + int dummy = 0; // Non-null but won't be accessed + dlpack_array<2> shape = {0, 5}; + dlpack_array<2> strides = {5, 1}; // explicit strides + DLTensor tensor{}; + tensor.data = &dummy; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + auto host_mdspan = cuda::to_host_mdspan(tensor); + assert(host_mdspan.extent(0) == 0); assert(host_mdspan.extent(1) == 5); + assert(host_mdspan.stride(0) == 5); + assert(host_mdspan.stride(1) == 1); assert(host_mdspan.size() == 0); assert(host_mdspan.empty()); return true; } -//============================================================================== -// Test: Rank-1 mdspan with layout_right (row-major) -//============================================================================== +bool test_empty_tensor_layout_stride_null_strides() +{ + int dummy = 0; // Non-null but won't be accessed + dlpack_array<2> shape = {0, 5}; + DLTensor tensor{}; + tensor.data = &dummy; + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = nullptr; // null strides (only valid for DLPack < 1.2) + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.extent(0) == 0); + assert(host_mdspan.extent(1) == 5); + // Should use row-major strides by default + assert(host_mdspan.stride(0) == 5); + assert(host_mdspan.stride(1) == 1); + assert(host_mdspan.size() == 0); + assert(host_mdspan.empty()); + return true; +} + 
+//---------------------------------------------------------------------------------------------------------------------- +// Rank-1 mdspan with layout_right (row-major) bool test_rank1() { @@ -80,13 +163,12 @@ bool test_rank1() dlpack_array<1> shape = {5}; dlpack_array<1> strides = {1}; DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 1; - tensor.dtype = ::DLDataType{::kDLInt, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = ::DLDataType{::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); auto host_mdspan_right = cuda::to_host_mdspan(tensor); auto host_mdspan_left = cuda::to_host_mdspan(tensor); auto host_mdspan_stride = cuda::to_host_mdspan(tensor); @@ -115,9 +197,8 @@ bool test_rank1() return true; } -//============================================================================== -// Test: Rank-2 mdspan with layout_right (row-major) -//============================================================================== +//---------------------------------------------------------------------------------------------------------------------- +// Rank-2 mdspan with layout_right (row-major) bool test_rank2_layout_right() { @@ -126,13 +207,12 @@ bool test_rank2_layout_right() dlpack_array<2> shape = {2, 3}; dlpack_array<2> strides = {3, 1}; // row-major DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 2; - tensor.dtype = cuda::__data_type_to_dlpack(); - tensor.shape = shape.data(); - tensor.strides = strides.data(); - + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); auto host_mdspan = cuda::to_host_mdspan(tensor); assert(host_mdspan.rank() == 2); @@ -140,7 +220,6 @@ bool test_rank2_layout_right() assert(host_mdspan.extent(1) == 3); assert(host_mdspan.stride(0) == 3); // row stride assert(host_mdspan.stride(1) == 1); // column stride - // Check values: row-major layout assert(host_mdspan(0, 0) == 1.0f); assert(host_mdspan(0, 1) == 2.0f); @@ -151,9 +230,8 @@ bool test_rank2_layout_right() return true; } -//============================================================================== -// Test: Rank-2 mdspan with layout_left (column-major) -//============================================================================== +//---------------------------------------------------------------------------------------------------------------------- +// Rank-2 mdspan with layout_left (column-major) bool test_rank2_layout_left() { @@ -162,13 +240,12 @@ bool test_rank2_layout_left() dlpack_array<2> shape = {2, 3}; dlpack_array<2> strides = {1, 2}; // column-major DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 2; - tensor.dtype = cuda::__data_type_to_dlpack(); - tensor.shape = shape.data(); - tensor.strides = strides.data(); - + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); auto host_mdspan = cuda::to_host_mdspan(tensor); assert(host_mdspan.rank() == 2); @@ -176,7 +253,6 @@ bool test_rank2_layout_left() assert(host_mdspan.extent(1) == 3); assert(host_mdspan.stride(0) == 1); // row stride 
assert(host_mdspan.stride(1) == 2); // column stride - // Check values: column-major layout assert(host_mdspan(0, 0) == 1.0f); assert(host_mdspan(0, 1) == 2.0f); @@ -187,9 +263,8 @@ bool test_rank2_layout_left() return true; } -//============================================================================== -// Test: Rank-2 mdspan with layout_stride (arbitrary strides) -//============================================================================== +//---------------------------------------------------------------------------------------------------------------------- +// Rank-2 mdspan with layout_stride (arbitrary strides) bool test_rank2_layout_stride() { @@ -198,13 +273,12 @@ bool test_rank2_layout_stride() dlpack_array<2> shape = {2, 3}; dlpack_array<2> strides = {4, 1}; // Row stride = 4 (padded), col stride = 1 DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 2; - tensor.dtype = cuda::__data_type_to_dlpack(); - tensor.shape = shape.data(); - tensor.strides = strides.data(); - + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); auto host_mdspan = cuda::to_host_mdspan(tensor); assert(host_mdspan.rank() == 2); @@ -212,7 +286,6 @@ bool test_rank2_layout_stride() assert(host_mdspan.extent(1) == 3); assert(host_mdspan.stride(0) == 4); assert(host_mdspan.stride(1) == 1); - assert(host_mdspan(0, 0) == 1); assert(host_mdspan(0, 1) == 2); assert(host_mdspan(0, 2) == 3); @@ -222,414 +295,203 @@ bool test_rank2_layout_stride() return true; } -//============================================================================== -// Test: layout_stride with default (layout_right) strides when strides is nullptr -// Note: This tests the fallback behavior for DLPack < 1.2 -//============================================================================== - -#if !(DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) +//---------------------------------------------------------------------------------------------------------------------- +// Rank-3 mdspan with layout_right (row-major) -bool test_layout_stride_null_strides() +bool test_rank3_layout_right() { - cuda::std::array data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - dlpack_array<2> shape = {2, 3}; + // 2x3x4 tensor in row-major order + cuda::std::array data = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f}; + dlpack_array<3> shape = {2, 3, 4}; + dlpack_array<3> strides = {12, 4, 1}; // row-major: stride[i] = product of shape[i+1:] DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 2; - tensor.dtype = cuda::__data_type_to_dlpack(); - tensor.shape = shape.data(); - tensor.strides = nullptr; // null strides - - auto host_mdspan = cuda::to_host_mdspan(tensor); - - // Should use row-major strides by default - assert(host_mdspan.stride(0) == 3); - assert(host_mdspan.stride(1) == 1); + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 3; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 3); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.extent(2) == 4); + 
assert(host_mdspan.stride(0) == 12); + assert(host_mdspan.stride(1) == 4); + assert(host_mdspan.stride(2) == 1); + // Check values + assert(host_mdspan(0, 0, 0) == 1.0f); + assert(host_mdspan(0, 0, 3) == 4.0f); + assert(host_mdspan(0, 1, 0) == 5.0f); + assert(host_mdspan(0, 2, 3) == 12.0f); + assert(host_mdspan(1, 0, 0) == 13.0f); + assert(host_mdspan(1, 2, 3) == 24.0f); return true; } -#endif // !(DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2) - -//============================================================================== -// Test: byte_offset support -//============================================================================== +//---------------------------------------------------------------------------------------------------------------------- +// Rank-3 mdspan with layout_left (column-major) -bool test_byte_offset() +bool test_rank3_layout_left() { - cuda::std::array data = {0, 0, 1, 2, 3, 4, 5, 6}; - // Skip first 2 ints (8 bytes) - dlpack_array<1> shape = {6}; - dlpack_array<1> strides = {1}; + // 2x3x4 tensor in column-major order + // In column-major, elements are stored with the first index varying fastest + cuda::std::array data = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f}; + dlpack_array<3> shape = {2, 3, 4}; + dlpack_array<3> strides = {1, 2, 6}; // column-major: stride[i] = product of shape[:i] DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 1; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - tensor.byte_offset = sizeof(int) * 2; - - auto host_mdspan = cuda::to_host_mdspan(tensor); - - assert(host_mdspan.extent(0) == 6); - assert(host_mdspan(0) == 1); - assert(host_mdspan(5) == 6); + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 3; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 3); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.extent(2) == 4); + assert(host_mdspan.stride(0) == 1); + assert(host_mdspan.stride(1) == 2); + assert(host_mdspan.stride(2) == 6); + // Check values: element at (i,j,k) is at index i*1 + j*2 + k*6 + 1 (1-indexed value) + assert(host_mdspan(0, 0, 0) == 1.0f); + assert(host_mdspan(1, 0, 0) == 2.0f); + assert(host_mdspan(0, 1, 0) == 3.0f); + assert(host_mdspan(1, 1, 0) == 4.0f); + assert(host_mdspan(0, 0, 1) == 7.0f); + assert(host_mdspan(1, 2, 3) == 24.0f); return true; } -//============================================================================== -// Exception tests -//============================================================================== - -void test_exception_wrong_rank() -{ - cuda::std::array data{}; - dlpack_array<2> shape = {2, 3}; - dlpack_array<2> strides = {3, 1}; // row-major - DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 2; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - - bool caught = false; - try - { - // Try to convert rank-2 tensor to rank-1 mdspan - unused(cuda::to_host_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); -} - -void 
test_exception_wrong_dtype() -{ - cuda::std::array data{}; - dlpack_array<1> shape = {4}; - dlpack_array<1> strides = {1}; - DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 1; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; // dtype is int - tensor.shape = shape.data(); - tensor.strides = strides.data(); - - bool caught = false; - try - { - // Try to convert int tensor to float mdspan - unused(cuda::to_host_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); -} - -void test_exception_null_data() -{ - dlpack_array<1> shape = {4}; - dlpack_array<1> strides = {1}; - DLTensor tensor{}; - tensor.data = nullptr; - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 1; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - - bool caught = false; - try - { - unused(cuda::to_host_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); -} - -void test_exception_null_shape() -{ - cuda::std::array data{}; - DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 1; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; - tensor.shape = nullptr; // null shape - - bool caught = false; - try - { - unused(cuda::to_host_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); -} - -void test_exception_negative_shape() -{ - cuda::std::array data{}; - dlpack_array<1> shape = {-3}; // negative shape - dlpack_array<1> strides = {1}; - DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 1; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - - bool caught = false; - try - { - unused(cuda::to_host_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); -} +//---------------------------------------------------------------------------------------------------------------------- +// Rank-3 mdspan with layout_stride -void test_exception_wrong_device_type_host() +bool test_rank3_layout_stride() { - cuda::std::array data{}; - dlpack_array<1> shape = {4}; - dlpack_array<1> strides = {1}; - DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{::kDLCUDA, 0}; // CUDA device, not CPU - tensor.ndim = 1; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - - bool caught = false; - try + // 2x3x4 tensor with custom strides (padded) + cuda::std::array data{}; // Extra space for padding + // Fill with sequential values at the expected positions + for (int i = 0; i < 2; ++i) { - unused(cuda::to_host_mdspan(tensor)); + for (int j = 0; j < 3; ++j) + { + for (int k = 0; k < 4; ++k) + { + data[i * 16 + j * 5 + k] = i * 12 + j * 4 + k + 1; + } + } } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); -} - -void test_exception_wrong_device_type_device() -{ - cuda::std::array data{}; - dlpack_array<1> shape = {4}; - dlpack_array<1> strides = {1}; + dlpack_array<3> shape = {2, 3, 4}; + dlpack_array<3> strides = {16, 5, 1}; // Custom strides with padding DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; // CPU device, not CUDA - tensor.ndim = 1; - tensor.dtype = 
DLDataType{DLDataTypeCode::kDLInt, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - - bool caught = false; - try - { - unused(cuda::to_device_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 3; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + auto host_mdspan = cuda::to_host_mdspan(tensor); + + assert(host_mdspan.rank() == 3); + assert(host_mdspan.extent(0) == 2); + assert(host_mdspan.extent(1) == 3); + assert(host_mdspan.extent(2) == 4); + assert(host_mdspan.stride(0) == 16); + assert(host_mdspan.stride(1) == 5); + assert(host_mdspan.stride(2) == 1); + // Check values + assert(host_mdspan(0, 0, 0) == 1); + assert(host_mdspan(0, 0, 3) == 4); + assert(host_mdspan(0, 1, 0) == 5); + assert(host_mdspan(0, 2, 3) == 12); + assert(host_mdspan(1, 0, 0) == 13); + assert(host_mdspan(1, 2, 3) == 24); + return true; } -void test_exception_wrong_device_type_managed() -{ - cuda::std::array data{}; - dlpack_array<1> shape = {4}; - dlpack_array<1> strides = {1}; - DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; // CPU device, not CUDA managed - tensor.ndim = 1; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); +//---------------------------------------------------------------------------------------------------------------------- +// const element types - bool caught = false; - try - { - unused(cuda::to_managed_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); -} - -void test_exception_stride_mismatch_layout_right() +bool test_const_element_type_rank1() { - cuda::std::array data{}; - dlpack_array<2> shape = {2, 3}; - dlpack_array<2> strides = {1, 2}; // Column-major, not row-major + const cuda::std::array data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + dlpack_array<1> shape = {5}; + dlpack_array<1> strides = {1}; DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 2; - tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - - bool caught = false; - try - { - unused(cuda::to_host_mdspan(tensor)); - } - catch (const std::invalid_argument&) + tensor.data = const_cast(data.data()); // DLPack uses void*, need const_cast + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = strides.data(); + auto host_mdspan = cuda::to_host_mdspan(tensor); + + static_assert(cuda::std::is_same_v); + assert(host_mdspan.rank() == 1); + assert(host_mdspan.extent(0) == 5); + for (int i = 0; i < 5; ++i) { - caught = true; + assert(host_mdspan(i) == data[i]); } - assert(caught); + return true; } -void test_exception_stride_mismatch_layout_left() -{ - cuda::std::array data{}; - dlpack_array<2> shape = {2, 3}; - dlpack_array<2> strides = {3, 1}; // Row-major, not column-major - DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 2; - tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - - bool caught = false; - try - { - unused(cuda::to_host_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - 
caught = true; - } - assert(caught); -} +//---------------------------------------------------------------------------------------------------------------------- +// layout_stride with default (layout_right) strides when strides is nullptr +// Note: This tests the fallback behavior for DLPack < 1.2 -void test_exception_zero_stride_layout_stride() +bool test_layout_stride_null_strides() { - cuda::std::array data{}; - dlpack_array<2> shape = {2, 3}; - dlpack_array<2> strides = {0, 1}; // Zero stride is invalid + cuda::std::array data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + dlpack_array<2> shape = {2, 3}; DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 2; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - - bool caught = false; - try - { - unused(cuda::to_host_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 2; + tensor.dtype = cuda::__data_type_to_dlpack(); + tensor.shape = shape.data(); + tensor.strides = nullptr; // null strides + auto host_mdspan = cuda::to_host_mdspan(tensor); + // Should use row-major strides by default + assert(host_mdspan.stride(0) == 3); + assert(host_mdspan.stride(1) == 1); + return true; } -void test_exception_null_strides_dlpack_v12() -{ - cuda::std::array data{}; - dlpack_array<2> shape = {2, 3}; - DLTensor tensor{}; - tensor.data = data.data(); - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 2; - tensor.dtype = DLDataType{DLDataTypeCode::kDLFloat, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = nullptr; // null strides not allowed in DLPack v1.2+ - - bool caught = false; - try - { - unused(cuda::to_host_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); -} +//---------------------------------------------------------------------------------------------------------------------- +// byte_offset support -void test_exception_misaligned_data() +bool test_byte_offset() { - // Create a buffer that allows us to get a misaligned pointer - alignas(16) cuda::std::array buffer{}; - // Get a pointer that's 1 byte into the buffer (misaligned for int) - auto misaligned_ptr = reinterpret_cast(buffer.data() + 1); - dlpack_array<1> shape = {3}; + cuda::std::array data = {0, 0, 1, 2, 3, 4, 5, 6}; + // Skip first 2 ints (8 bytes) + dlpack_array<1> shape = {6}; dlpack_array<1> strides = {1}; DLTensor tensor{}; - tensor.data = misaligned_ptr; - tensor.device = DLDevice{kDLCPU, 0}; - tensor.ndim = 1; - tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; - tensor.shape = shape.data(); - tensor.strides = strides.data(); - - bool caught = false; - try - { - unused(cuda::to_host_mdspan(tensor)); - } - catch (const std::invalid_argument&) - { - caught = true; - } - assert(caught); -} + tensor.data = data.data(); + tensor.device = DLDevice{kDLCPU, 0}; + tensor.ndim = 1; + tensor.dtype = DLDataType{DLDataTypeCode::kDLInt, 32, 1}; + tensor.shape = shape.data(); + tensor.strides = strides.data(); + tensor.byte_offset = sizeof(int) * 2; + auto host_mdspan = cuda::to_host_mdspan(tensor); -bool test_exceptions() -{ - test_exception_wrong_rank(); - test_exception_wrong_dtype(); - test_exception_null_data(); - test_exception_null_shape(); - test_exception_negative_shape(); - test_exception_wrong_device_type_host(); - test_exception_wrong_device_type_device(); 
-  test_exception_wrong_device_type_managed();
-  test_exception_stride_mismatch_layout_right();
-  test_exception_stride_mismatch_layout_left();
-  test_exception_zero_stride_layout_stride();
-#if DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2)
-  test_exception_null_strides_dlpack_v12();
-#endif
-  test_exception_misaligned_data();
+  assert(host_mdspan.extent(0) == 6);
+  assert(host_mdspan(0) == 1);
+  assert(host_mdspan(5) == 6);
   return true;
 }
 
-//==============================================================================
-// Test: Return type checking
-//==============================================================================
+//----------------------------------------------------------------------------------------------------------------------
+// Return type checking
 
 bool test_return_types()
 {
@@ -643,9 +505,9 @@ bool test_return_types()
   tensor.dtype   = cuda::__data_type_to_dlpack();
   tensor.shape   = shape.data();
   tensor.strides = strides.data();
-
   // Check return type of to_host_mdspan
   auto host_ms = cuda::to_host_mdspan(tensor);
+
   static_assert(
     cuda::std::is_same_v, cuda::std::layout_stride>>);
@@ -664,16 +526,29 @@ int main(int, char**)
   NV_IF_TARGET(
     NV_IS_HOST,
     (assert(test_rank0()); //
-     assert(test_empty_tensor());
+     // Empty tensor tests
+     assert(test_empty_tensor_layout_right_first_dim_zero());
+     assert(test_empty_tensor_layout_right_second_dim_zero());
+     assert(test_empty_tensor_layout_left_first_dim_zero());
+     assert(test_empty_tensor_layout_stride_explicit_strides());
+     // Rank-1 and Rank-2 tests
     assert(test_rank1());
     assert(test_rank2_layout_right());
    assert(test_rank2_layout_left());
     assert(test_rank2_layout_stride());
+     // Rank-3 tests
+     assert(test_rank3_layout_right());
+     assert(test_rank3_layout_left());
+     assert(test_rank3_layout_stride());
+     // Const element type tests
+     assert(test_const_element_type_rank1());
+     // Other tests
     assert(test_byte_offset());
-     assert(test_return_types());
-     assert(test_exceptions());))
+     assert(test_return_types());))
 
 #if !(DLPACK_MAJOR_VERSION > 1 || (DLPACK_MAJOR_VERSION == 1 && DLPACK_MINOR_VERSION >= 2))
-  NV_IF_TARGET(NV_IS_HOST, (assert(test_layout_stride_null_strides());))
+  NV_IF_TARGET(NV_IS_HOST,
+               (assert(test_layout_stride_null_strides()); //
+                assert(test_empty_tensor_layout_stride_null_strides());))
 #endif
   return 0;
 }

From 8c283afbc79340e671e0e41116091bf80b2321b8 Mon Sep 17 00:00:00 2001
From: fbusato
Date: Fri, 23 Jan 2026 10:58:31 -0800
Subject: [PATCH 53/56] align Availability notes

---
 docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst
index 3395e311400..daa4d7df7fc 100644
--- a/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst
+++ b/docs/libcudacxx/extended_api/mdspan/dlpack_to_mdspan.rst
@@ -89,7 +89,7 @@ Availability notes
 ------------------
 
 - This API is available only when DLPack header is present, namely ``<dlpack/dlpack.h>`` is found in the include path.
-- Requires DLPack major version 1.
+- This API can be disabled by defining ``CCCL_DISABLE_DLPACK`` before including any library headers. In this case, ``<dlpack/dlpack.h>`` will not be included.
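A minimal host-side sketch of the conversion described in these notes; the ``<cuda/mdspan>`` include, the ``<float, 2>`` template-argument spelling, and the assertions are assumptions for illustration rather than part of this patch series:

.. code:: cpp

   #include <cuda/mdspan> // assumed public header exposing cuda::to_host_mdspan
   #include <dlpack/dlpack.h>

   #include <array>
   #include <cassert>
   #include <cstdint>

   int main()
   {
     // A 2x3 row-major host buffer described by a DLPack tensor, as in the tests above
     std::array<float, 6> data           = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
     std::array<std::int64_t, 2> shape   = {2, 3};
     std::array<std::int64_t, 2> strides = {3, 1}; // row-major

     DLTensor tensor{};
     tensor.data    = data.data();
     tensor.device  = DLDevice{kDLCPU, 0};
     tensor.ndim    = 2;
     tensor.dtype   = DLDataType{kDLFloat, 32, 1};
     tensor.shape   = shape.data();
     tensor.strides = strides.data();

     // Element type and rank are assumed to be the first two template parameters
     auto view = cuda::to_host_mdspan<float, 2>(tensor);
     assert(view.extent(0) == 2 && view.extent(1) == 3);
     assert(view(1, 2) == 6.0f); // element (1, 2) of the row-major view
     return 0;
   }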
 
 References
 ----------

From 3c4137e449021d2c46b8e3d8d057ed95e9b631db Mon Sep 17 00:00:00 2001
From: Federico Busato <50413820+fbusato@users.noreply.github.com>
Date: Fri, 23 Jan 2026 12:21:49 -0800
Subject: [PATCH 54/56] Update libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h

Co-authored-by: David Bayer <48736217+davebayer@users.noreply.github.com>
---
 libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
index 4c052b901bb..239b4727ab8 100644
--- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
+++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
@@ -57,7 +57,7 @@ __get_layout_right_stride(const ::cuda::std::int64_t* __shapes, ::cuda::std::siz
   ::cuda::std::int64_t __stride = 1;
   for (auto __i = __pos + 1; __i < __rank; ++__i)
   {
-    if (__stride * __shapes[__i] < 0 || ::cuda::mul_hi(__stride, __shapes[__i]) != 0) // TODO: replace with mul_overflow
+    if (const auto __hi = ::cuda::mul_hi(__stride, __shapes[__i]); __hi != 0 && __hi != -1) // TODO: replace with mul_overflow
     {
       _CCCL_THROW(::std::invalid_argument{"shape overflow"});
     }

From 1ab523a90c4942f582bb2c3001fdbb96190c8aa6 Mon Sep 17 00:00:00 2001
From: fbusato
Date: Fri, 23 Jan 2026 12:24:43 -0800
Subject: [PATCH 55/56] fix mul_overflow

---
 libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
index 239b4727ab8..652ef808bc2 100644
--- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
+++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
@@ -57,7 +57,8 @@ __get_layout_right_stride(const ::cuda::std::int64_t* __shapes, ::cuda::std::siz
   ::cuda::std::int64_t __stride = 1;
   for (auto __i = __pos + 1; __i < __rank; ++__i)
   {
-    if (const auto __hi = ::cuda::mul_hi(__stride, __shapes[__i]); __hi != 0 && __hi != -1) // TODO: replace with mul_overflow
+    // TODO: replace with mul_overflow
+    if (const auto __hi = ::cuda::mul_hi(__stride, __shapes[__i]); __hi != 0 && __hi != -1)
     {
       _CCCL_THROW(::std::invalid_argument{"shape overflow"});
     }
@@ -73,7 +74,8 @@ __get_layout_left_stride(const ::cuda::std::int64_t* __shapes, ::cuda::std::size
   ::cuda::std::int64_t __stride = 1;
   for (::cuda::std::size_t __i = 0; __i < __pos; ++__i)
   {
-    if (__stride * __shapes[__i] < 0 || ::cuda::mul_hi(__stride, __shapes[__i]) != 0) // TODO: replace with mul_overflow
+    // TODO: replace with mul_overflow
+    if (const auto __hi = ::cuda::mul_hi(__stride, __shapes[__i]); __hi != 0 && __hi != -1)
     {
       _CCCL_THROW(::std::invalid_argument{"shape overflow"});
     }

From d2e15177edd447420d3d75d9d0c26af2029af905 Mon Sep 17 00:00:00 2001
From: fbusato
Date: Mon, 26 Jan 2026 11:17:44 -0800
Subject: [PATCH 56/56] address comments

---
 .../include/cuda/__mdspan/dlpack_to_mdspan.h | 14 +++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
index 652ef808bc2..565bb9894c0 100644
--- a/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
+++ b/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h
@@ -35,7 +35,9 @@
 # include
 # include
-# include
+# if !_CCCL_COMPILER(NVRTC)
+# include
+# endif // !_CCCL_COMPILER(NVRTC)
 # include
 //
@@ -185,10 +187,8 @@ __to_mdspan(const ::DLTensor& __tensor)
 {
     _CCCL_THROW(::std::invalid_argument{"DLTensor shape must be non-null"});
   }
-  using ::cuda::std::int64_t;
-  using ::cuda::std::size_t;
-  ::cuda::std::array __extents_array{};
-  for (size_t __i = 0; __i < _Rank; ++__i)
+  ::cuda::std::array<::cuda::std::int64_t, _Rank> __extents_array{};
+  for (::cuda::std::size_t __i = 0; __i < _Rank; ++__i)
   {
     if (__tensor.shape[__i] < 0)
     {
@@ -199,8 +199,8 @@ __to_mdspan(const ::DLTensor& __tensor)
   ::cuda::__validate_dlpack_strides<_LayoutPolicy>(__tensor, _Rank);
   if constexpr (__is_layout_stride)
   {
-    ::cuda::std::array __strides_array{};
-    for (size_t __i = 0; __i < _Rank; ++__i)
+    ::cuda::std::array<::cuda::std::int64_t, _Rank> __strides_array{};
+    for (::cuda::std::size_t __i = 0; __i < _Rank; ++__i)
     {
       const bool __has_strides = __tensor.strides != nullptr;
       __strides_array[__i] =
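Patches 54 and 55 above tighten the stride-product overflow screen: for signed 64-bit operands, the 128-bit product fits in ``int64_t`` only if its high half is the sign extension of the low half, so a high half other than 0 or -1 is a guaranteed overflow (the converse does not hold, which is why the TODO still points at ``mul_overflow``). A standalone sketch of the idea, with the ``<cuda/cmath>`` header location assumed:

   #include <cuda/cmath> // assumed header for cuda::mul_hi in this sketch

   #include <cassert>
   #include <cstdint>

   // Returns true only when stride * extent certainly does not fit in int64_t.
   bool stride_product_overflows(std::int64_t stride, std::int64_t extent)
   {
     const auto hi = cuda::mul_hi(stride, extent);
     return hi != 0 && hi != -1;
   }

   int main()
   {
     assert(!stride_product_overflows(12, 4));       // 48 fits in int64_t, high half is 0
     assert(stride_product_overflows(INT64_MAX, 3)); // high half is 1, certain overflow
     return 0;
   }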