2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -55,9 +55,11 @@ include_directories(BEFORE
if(DEVICE_BACKEND STREQUAL "AMD")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp")
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp")
endif()

add_subdirectory(driver)
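Note: the two new configure_file() calls stamp a backend-specific in_memory_operation.hpp into the build tree, mirroring the existing handling of config.hpp and float_type.hpp. Which variant is generated is selected by the DEVICE_BACKEND cache variable at configure time, e.g. a configure step along the lines of cmake -DDEVICE_BACKEND=AMD (illustrative only; the exact invocation depends on the surrounding build setup).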
10 changes: 10 additions & 0 deletions composable_kernel/include/gridwise_operation_wrapper.hpp
@@ -0,0 +1,10 @@
#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER

template <typename GridwiseOp, typename... Xs>
__global__ void run_gridwise_operation(GridwiseOp, Xs... xs)
{
GridwiseOp{}.Run(xs...);
}

#endif
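Usage note: any default-constructible functor exposing a __device__ Run(...) method can be passed through this wrapper and launched like an ordinary kernel. A minimal sketch, assuming a hypothetical functor and launch parameters that are not part of this PR:

#include "gridwise_operation_wrapper.hpp"

// hypothetical gridwise operation: one thread scales one element
struct ScaleBy2
{
    __device__ void Run(const float* p_src, float* p_dst) const
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        p_dst[i] = 2.0f * p_src[i];
    }
};

// host side: one thread per element (n assumed to be a multiple of 256 here)
void launch_scale_by_2(const float* p_src_dev, float* p_dst_dev, int n)
{
    run_gridwise_operation<<<n / 256, 256>>>(ScaleBy2{}, p_src_dev, p_dst_dev);
}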
130 changes: 130 additions & 0 deletions composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp
@@ -0,0 +1,130 @@
#ifndef CK_GRIDWISE_COL2IM_EB_NCHW_HPP
#define CK_GRIDWISE_COL2IM_EB_NCHW_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"

namespace ck {

// B = merge(N, Ho, Wo)
template <index_t GridSize,
index_t BlockSize,
typename Float,
typename ColGlobalDesc,
typename ImgGlobalDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads,
index_t EPerBlock,
index_t BPerBlock,
typename BlockCopySubLengths_E_B,
typename BlockCopyClusterLengths_E_B,
typename BlockCopyThreadClusterArrangeOrder,
typename BlockCopySrcAccessOrder,
typename BlockCopyDstAccessOrder,
index_t BlockCopyDataPerAccess_B>
struct GridwiseCol2Im_eb_nchw
{
__device__ void Run(const Float* const __restrict__ p_col_global,
Float* const __restrict__ p_img_global) const
{
constexpr auto col_e_b_global_desc = ColGlobalDesc{};
constexpr auto img_n_c_hi_wi_global_desc = ImgGlobalDesc{};

constexpr index_t N = img_n_c_hi_wi_global_desc.GetLengths()[0];
constexpr index_t C = img_n_c_hi_wi_global_desc.GetLengths()[1];
constexpr index_t Hi = img_n_c_hi_wi_global_desc.GetLengths()[2];
constexpr index_t Wi = img_n_c_hi_wi_global_desc.GetLengths()[3];

constexpr index_t Ho = OutputSizes{}[0];
constexpr index_t Wo = OutputSizes{}[1];

constexpr index_t Y = FilterSizes{}[0];
constexpr index_t X = FilterSizes{}[1];

constexpr index_t ConvStrideH = ConvStrides{}[0];
constexpr index_t ConvStrideW = ConvStrides{}[1];

constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];

constexpr index_t E = C * Y * X;
constexpr index_t B = N * Ho * Wo;

// sanity-check for vectorized memory load
static_assert((Wo == 1 || (ConvStrideW == 1 || BlockCopyDataPerAccess_B == 1)) &&
(X == 1 || ConvDilationW % BlockCopyDataPerAccess_B == 0),
"wrong! aligment requirement for vectorized global load of input tensor will "
"be violated");

// divide block work by [E, B]
static_assert(E % EPerBlock == 0 && B % BPerBlock == 0,
"wrong! cannot divide work evenly among blocks");

constexpr index_t EBlockWork = E / EPerBlock;
constexpr index_t BBlockWork = B / BPerBlock;

constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<EBlockWork, BBlockWork>{});

const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());

const index_t e_block_data_on_global = block_work_id[0] * EPerBlock;
const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;

// construct img_eb_global_desc
constexpr auto img_n_c_hip_wip_global_desc = transform_tensor_descriptor(
img_n_c_hi_wi_global_desc,
make_tuple(
PassThrough<N>{}, PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));

constexpr auto img_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
img_n_c_hip_wip_global_desc,
make_tuple(PassThrough<N>{},
PassThrough<C>{},
Embed<Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
Embed<Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

constexpr auto img_e_b_global_desc = transform_tensor_descriptor(
img_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));

// blockwise atomic accumulation
auto blockwise_copy = BlockwiseGenericTensorSliceCopy_v4<BlockSize,
decltype(col_e_b_global_desc),
decltype(img_e_b_global_desc),
Sequence<EPerBlock, BPerBlock>,
BlockCopySubLengths_E_B,
BlockCopyClusterLengths_E_B,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_B,
BlockCopyDataPerAccess_B,
AddressSpace::vgpr,
AddressSpace::vgpr,
AddressSpace::global,
InMemoryDataOperation::atomic_add>(
{e_block_data_on_global, b_block_data_on_global},
{e_block_data_on_global, b_block_data_on_global});

// blockwise copy
blockwise_copy.Run(p_col_global, p_img_global);
}
};

} // namespace ck
#endif
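For orientation, a small sizing sketch of how E, B and the one-dimensional block grid relate for this col2im kernel: each workgroup owns an EPerBlock x BPerBlock tile of the [E, B] col matrix and scatters it into the padded image via atomic adds. The numbers below are illustrative, not taken from this PR:

// illustrative sizes: 1x8x28x28 image, 3x3 filter, stride 1, dilation 1, pad 1
constexpr int N = 1, C = 8, Hi = 28, Wi = 28;
constexpr int Y = 3, X = 3;
constexpr int Ho = 28, Wo = 28;

constexpr int E = C * Y * X;   // 72   rows of the col matrix (merge of C, Y, X)
constexpr int B = N * Ho * Wo; // 784  columns of the col matrix (merge of N, Ho, Wo)

// example tiling; must divide E and B evenly, as the static_assert above requires
constexpr int EPerBlock = 8, BPerBlock = 16;
constexpr int GridSize = (E / EPerBlock) * (B / BPerBlock); // 9 * 49 = 441 workgroups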
@@ -0,0 +1,157 @@
#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1R1_NCHW_KCYX_NKHW_HPP
#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1R1_NCHW_KCYX_NKHW_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm.hpp"

namespace ck {

template <index_t GridSize,
index_t BlockSize,
typename Float,
typename AccFloat,
typename InGlobalDesc,
typename WeiGlobalDesc,
typename OutGlobalDesc,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads,
index_t EPerBlock,
index_t BPerBlock,
index_t KPerBlock,
index_t GemmMPerThreadSubC,
index_t GemmNPerThreadSubC,
index_t GemmMLevel0Cluster,
index_t GemmNLevel0Cluster,
index_t GemmMLevel1Cluster,
index_t GemmNLevel1Cluster,
index_t GemmKPerThreadLoop,
index_t GemmThreadGemmDataPerReadM,
index_t GemmThreadGemmDataPerReadN,
typename WeiBlockCopySubLengths_K_E,
typename WeiBlockCopyClusterLengths_K_E,
index_t WeiBlockCopyDataPerAccess_E,
typename OutBlockCopySubLengths_K_B,
typename OutBlockCopyClusterLengths_K_B,
index_t OutBlockCopyDataPerAccess_B,
index_t InThreadCopyDataPerAccess_B>
struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
{
__device__ void Run(Float* __restrict__ p_in_global,
const Float* __restrict__ p_wei_global,
const Float* __restrict__ p_out_global) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};

constexpr auto True = integral_constant<bool, true>{};

constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{};
constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{};

constexpr index_t N = in_n_c_hi_wi_global_desc.GetLengths()[0];
constexpr index_t C = in_n_c_hi_wi_global_desc.GetLengths()[1];
constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2];
constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3];

constexpr index_t K = out_n_k_ho_wo_global_desc.GetLengths()[1];
constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2];
constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLengths()[3];

constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2];
constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3];

constexpr index_t ConvStrideH = ConvStrides{}[0];
constexpr index_t ConvStrideW = ConvStrides{}[1];

constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];

constexpr index_t E = C * Y * X;
constexpr index_t B = N * Ho * Wo;

// sanity-check for vectorized memory load
static_assert((Wo == 1 || (ConvStrideW == 1 || InThreadCopyDataPerAccess_B == 1)) &&
(X == 1 || ConvDilationW % InThreadCopyDataPerAccess_B == 0),
"wrong! aligment requirement for vectorized global load of input tensor will "
"be violated");

// output tensor
constexpr auto out_n_k_howo_global_desc =
unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3);

constexpr auto out_k_b_global_desc =
transform_tensor_descriptor(out_n_k_howo_global_desc,
make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));

// weight tensor
constexpr auto wei_k_e_global_desc =
unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);

// input tensor
constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(
PassThrough<N>{}, PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));

constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(PassThrough<N>{},
PassThrough<C>{},
Embed<Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
Embed<Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

constexpr auto in_e_b_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));

// GEMM: atomic add
constexpr auto gridwise_gemm =
GridwiseGemmTransposedANormalBNormalC_v1r1<GridSize,
BlockSize,
Float,
AccFloat,
decltype(wei_k_e_global_desc),
decltype(out_k_b_global_desc),
decltype(in_e_b_global_desc),
InMemoryDataOperation::atomic_add,
EPerBlock,
BPerBlock,
KPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmThreadGemmDataPerReadM,
GemmThreadGemmDataPerReadN,
WeiBlockCopySubLengths_K_E,
WeiBlockCopyClusterLengths_K_E,
WeiBlockCopyDataPerAccess_E,
OutBlockCopySubLengths_K_B,
OutBlockCopyClusterLengths_K_B,
OutBlockCopyDataPerAccess_B,
InThreadCopyDataPerAccess_B>{};

gridwise_gemm.Run(p_wei_global, p_out_global, p_in_global);
}
};

} // namespace ck
#endif
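In GEMM terms this kernel computes in[E, B] += wei[K, E]^T * out[K, B], with E = C*Y*X and B = N*Ho*Wo, accumulating into the input tensor with atomic adds (hence InMemoryDataOperation::atomic_add). A plain reference loop nest for the same contraction, as a sketch only: row-major buffers are assumed here, whereas the actual kernel reads and writes through the transformed descriptors built in Run() and tiles the loops by KPerBlock, EPerBlock and BPerBlock:

// reference formulation of backward data as implicit GEMM (illustrative only)
void backward_data_gemm_reference(
    const float* wei_k_e, const float* out_k_b, float* in_e_b, int K, int E, int B)
{
    for(int e = 0; e < E; ++e)
        for(int b = 0; b < B; ++b)
        {
            float acc = 0.f;
            for(int k = 0; k < K; ++k)
                acc += wei_k_e[k * E + e] * out_k_b[k * B + b];

            in_e_b[e * B + b] += acc; // device kernel: atomic add into global memory
        }
}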