From 7673fe8307eadca597cc289045bf6f49724f2834 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 12 Nov 2019 10:41:01 -0600 Subject: [PATCH 01/23] refactor conv driver --- driver/CMakeLists.txt | 8 +- driver/src/driver.cpp | 538 ------------------------------------------ driver/src/driver.cu | 1 - 3 files changed, 4 insertions(+), 543 deletions(-) delete mode 100644 driver/src/driver.cpp delete mode 120000 driver/src/driver.cu diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 92fa1ec7..56745e0e 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -15,10 +15,10 @@ install(TARGETS host LIBRARY DESTINATION lib) if(DEVICE_BACKEND STREQUAL "AMD") - set(DRIVER_SOURCE src/driver.cpp) + set(DRIVER_SOURCE src/conv_driver.cpp) elseif(DEVICE_BACKEND STREQUAL "NVIDIA") - set(DRIVER_SOURCE src/driver.cu) + set(DRIVER_SOURCE src/conv_driver.cu) endif() -add_executable(driver ${DRIVER_SOURCE}) -target_link_libraries(driver PRIVATE host) +add_executable(conv ${DRIVER_SOURCE}) +target_link_libraries(conv PRIVATE host) diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp deleted file mode 100644 index 720d5920..00000000 --- a/driver/src/driver.cpp +++ /dev/null @@ -1,538 +0,0 @@ -#include -#include -#include -#include -#include -#include "config.hpp" -#include "ConstantTensorDescriptor_deprecated.hpp" -#include "print_array.hpp" -#include "print_sequence.hpp" -#include "device.hpp" -#include "conv_common.hpp" -#include "host_conv.hpp" -//#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp" -//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" -//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp" -//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" -//#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" -//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" -#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp" -#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" -//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp" -//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp" -#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp" -#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" - -struct GeneratorTensor_1 -{ - template - double operator()(Is... is) - { - return 1; - } -}; - -struct GeneratorTensor_2 -{ - int min_value = 0; - int max_value = 1; - - template - double operator()(Is...) - { - return (std::rand() % (max_value - min_value)) + min_value; - } -}; - -struct GeneratorTensor_3 -{ - template - double operator()(Is... is) - { - std::array dims = {{static_cast(is)...}}; - - auto f_acc = [](auto a, auto b) { return 10 * a + b; }; - - return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc); - } -}; - -struct GeneratorTensor_Checkboard -{ - template - double operator()(Ts... Xs) const - { - std::array dims = {{Xs...}}; - return std::accumulate(dims.begin(), - dims.end(), - true, - [](bool init, index_t x) -> int { return init != (x % 2); }) - ? 
1 - : -1; - } -}; - -int main(int argc, char* argv[]) -{ - using namespace ck; - -#if 0 - constexpr index_t N = 128; - constexpr index_t C = 128; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 7; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 3>; - using RightPads = Sequence<0, 3>; -#elif 0 - // 3x3, 34x34 - constexpr index_t N = 64; - constexpr index_t C = 256; - constexpr index_t HI = 34; - constexpr index_t WI = 34; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 8x8 image - // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% - constexpr index_t N = 64; - constexpr index_t C = 1536; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 256; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 8x8 image - // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51% - constexpr index_t N = 128; - constexpr index_t C = 2048; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 7x7 image - // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64% - constexpr index_t N = 128; - constexpr index_t C = 832; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 8x8 image - // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65% - constexpr index_t N = 128; - constexpr index_t C = 1280; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 14x14 image - // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50% - constexpr index_t N = 128; - constexpr index_t C = 512; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 8x8 image - // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61% - constexpr index_t N = 64; - constexpr index_t C = 1536; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 28x28 image - // cudnn@V100 86%, ck@V100 
84%, ck@P100 80%, ck@VII 69% - constexpr index_t N = 128; - constexpr index_t C = 256; - constexpr index_t HI = 28; - constexpr index_t WI = 28; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 7x7 image - // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62% - constexpr index_t N = 128; - constexpr index_t C = 832; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 256; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 17x17 input - // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76% - constexpr index_t N = 128; - constexpr index_t C = 768; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 14x14 image - // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64% - constexpr index_t N = 128; - constexpr index_t C = 528; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 14x14 image - // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75% - constexpr index_t N = 128; - constexpr index_t C = 528; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 256; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 7x7 image - // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52% - constexpr index_t N = 128; - constexpr index_t C = 832; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output - // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% - constexpr index_t N = 128; - constexpr index_t C = 288; - constexpr index_t HI = 35; - constexpr index_t WI = 35; - constexpr index_t K = 384; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - using ConvStrides = Sequence<2, 2>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 5x5 filter, 2x2 pad, 7x7 input - constexpr index_t N = 128; - constexpr index_t C = 48; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 128; - constexpr index_t Y = 5; - constexpr index_t X = 5; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<2, 2>; - using RightPads = Sequence<2, 2>; -#elif 0 - // 7x1 filter, 3x0 pad, 17x17 input - constexpr index_t N = 
128; - constexpr index_t C = 128; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 128; - constexpr index_t Y = 7; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<3, 0>; - using RightPads = Sequence<3, 0>; -#elif 1 - // 1x7 filter, 0x3 pad, 17x17 input - constexpr index_t N = 128; - constexpr index_t C = 128; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 7; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 3>; - using RightPads = Sequence<0, 3>; -#endif - - auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence{}); - auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence{}); - auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor( - in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); - - ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); - ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); - ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - print_sequence("LeftPads", LeftPads{}); - print_sequence("RightPads", RightPads{}); - print_sequence("ConvStrides", ConvStrides{}); - print_sequence("ConvDilations", ConvDilations{}); - - using in_data_t = float; - using out_data_t = float; - Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); - Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); - Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); - Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); - - std::size_t num_thread = std::thread::hardware_concurrency(); - - if(argc != 3) - { - printf("arg1: do_verification, arg2: nrepeat\n"); - exit(1); - } - - bool do_verification = atoi(argv[1]); - index_t nrepeat = atoi(argv[2]); - - if(do_verification) - { -#if 0 - in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); -#elif 0 - in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_3{}, num_thread); -#elif 0 - in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); -#elif 1 - in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); -#elif 0 - in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); - - auto gen_wei = [](auto... is) { - return GeneratorTensor_2{1, 5}(is...) 
* GeneratorTensor_Checkboard{}(is...); - }; - wei_kcyx.GenerateTensorValue(gen_wei, num_thread); -#endif - } - -#if 0 - device_convolution_direct_v2_nchw_kcyx_nkhw - (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 - device_convolution_implicit_gemm_v1_chwn_cyxk_khwn( - in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 - device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - LeftPads{}, - RightPads{}, - nrepeat); -#elif 0 - device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw( - in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 - device_convolution_implicit_gemm_v2_chwn_cyxk_khwn( - in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 - device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( - (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 - device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - nrepeat); -#elif 1 - device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}, - nrepeat); -#elif 0 - device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - nrepeat); -#elif 0 - device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - nrepeat); -#elif 0 - device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - nrepeat); -#elif 0 - device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}, - nrepeat); -#endif - - if(do_verification) - { -#if 1 - if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 && - ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1) - { - host_winograd_3x3_convolution( - in_nchw, wei_kcyx, out_nkhw_host, LeftPads{}, RightPads{}); - } - else -#endif - { - host_direct_convolution(in_nchw, - wei_kcyx, - out_nkhw_host, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}); - } - check_error(out_nkhw_host, out_nkhw_device); - -#if 0 - LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; - LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl; - LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl; - LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl; -#endif - } -} diff --git a/driver/src/driver.cu b/driver/src/driver.cu deleted file mode 120000 index 1ca4fea9..00000000 --- a/driver/src/driver.cu +++ /dev/null @@ -1 +0,0 @@ -driver.cpp \ No newline at end of file From a34977df3bb0dab6e3c08c5e56ab8b572238f209 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 12 Nov 2019 10:41:49 -0600 Subject: [PATCH 02/23] refactor conv driver 
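The tensor generators previously defined inline in driver.cpp now live in
driver/include/tensor_generator.hpp. For reference, a minimal sketch of how
the driver consumes them (the Tensor<T>::GenerateTensorValue call and the
GeneratorTensor_2{-5, 5} aggregate initialization follow the usage in
conv_driver.cpp below; the concrete tensor lengths are illustrative):

    #include <thread>
    #include "tensor.hpp"
    #include "tensor_generator.hpp"

    // Fill a 4-D NCHW tensor with uniform random integers in [-5, 5);
    // GeneratorTensor_2 computes (std::rand() % (max - min)) + min, so the
    // upper bound is exclusive.
    Tensor<float> in_nchw(TensorDescriptor({128, 128, 17, 17}));
    in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5},
                                std::thread::hardware_concurrency());

    // GeneratorTensor_3 folds the coordinate tuple with 10*a + b, i.e. it
    // encodes (n, c, h, w) as a decimal digit string, which makes
    // layout/indexing bugs easy to spot in dumped output.
    in_nchw.GenerateTensorValue(GeneratorTensor_3{}, 1);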
--- driver/include/tensor_generator.hpp | 55 ++++ driver/src/conv_driver.cpp | 491 ++++++++++++++++++++++++++++ driver/src/conv_driver.cu | 1 + 3 files changed, 547 insertions(+) create mode 100644 driver/include/tensor_generator.hpp create mode 100644 driver/src/conv_driver.cpp create mode 120000 driver/src/conv_driver.cu diff --git a/driver/include/tensor_generator.hpp b/driver/include/tensor_generator.hpp new file mode 100644 index 00000000..7699608d --- /dev/null +++ b/driver/include/tensor_generator.hpp @@ -0,0 +1,55 @@ +#ifndef TENSOR_GENERATOR_HPP +#define TENSOR_GENERATOR_HPP + +#include "config.hpp" + +struct GeneratorTensor_1 +{ + template + double operator()(Is... is) + { + return 1; + } +}; + +struct GeneratorTensor_2 +{ + int min_value = 0; + int max_value = 1; + + template + double operator()(Is...) + { + return (std::rand() % (max_value - min_value)) + min_value; + } +}; + +struct GeneratorTensor_3 +{ + template + double operator()(Is... is) + { + std::array dims = {{static_cast(is)...}}; + + auto f_acc = [](auto a, auto b) { return 10 * a + b; }; + + return std::accumulate(dims.begin(), dims.end(), ck::index_t(0), f_acc); + } +}; + +struct GeneratorTensor_Checkboard +{ + template + double operator()(Ts... Xs) const + { + std::array dims = {{Xs...}}; + return std::accumulate(dims.begin(), + dims.end(), + true, + [](bool init, ck::index_t x) -> int { return init != (x % 2); }) + ? 1 + : -1; + } +}; + +#endif diff --git a/driver/src/conv_driver.cpp b/driver/src/conv_driver.cpp new file mode 100644 index 00000000..80606939 --- /dev/null +++ b/driver/src/conv_driver.cpp @@ -0,0 +1,491 @@ +#include +#include +#include +#include +#include +#include "config.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "print_array.hpp" +#include "print_sequence.hpp" +#include "device.hpp" +#include "tensor_generator.hpp" +#include "conv_common.hpp" +#include "host_conv.hpp" +//#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp" +//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" +//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp" +//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" +//#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" +//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" +#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp" +#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" +//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp" +//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp" +#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp" +#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" + + +int main(int argc, char* argv[]) +{ + using namespace ck; + +#if 0 + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 7; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 3>; + using RightPads = Sequence<0, 3>; +#elif 0 + // 3x3, 34x34 + constexpr index_t N = 64; + constexpr index_t C = 256; + constexpr index_t HI = 34; + constexpr index_t WI = 34; + constexpr index_t K = 128; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using 
RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% + constexpr index_t N = 64; + constexpr index_t C = 1536; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51% + constexpr index_t N = 128; + constexpr index_t C = 2048; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65% + constexpr index_t N = 128; + constexpr index_t C = 1280; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50% + constexpr index_t N = 128; + constexpr index_t C = 512; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61% + constexpr index_t N = 64; + constexpr index_t C = 1536; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 28x28 image + // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69% + constexpr index_t N = 128; + constexpr index_t C = 256; + constexpr index_t HI = 28; + constexpr index_t WI = 28; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; 
+ using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 17x17 input + // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76% + constexpr index_t N = 128; + constexpr index_t C = 768; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64% + constexpr index_t N = 128; + constexpr index_t C = 528; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75% + constexpr index_t N = 128; + constexpr index_t C = 528; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output + // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% + constexpr index_t N = 128; + constexpr index_t C = 288; + constexpr index_t HI = 35; + constexpr index_t WI = 35; + constexpr index_t K = 384; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<2, 2>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 5x5 filter, 2x2 pad, 7x7 input + constexpr index_t N = 128; + constexpr index_t C = 48; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 128; + constexpr index_t Y = 5; + constexpr index_t X = 5; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<2, 2>; + using RightPads = Sequence<2, 2>; +#elif 0 + // 7x1 filter, 3x0 pad, 17x17 input + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 7; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<3, 0>; + using RightPads = Sequence<3, 0>; +#elif 1 + // 1x7 filter, 0x3 pad, 17x17 input + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 7; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 3>; + using RightPads = Sequence<0, 3>; +#endif + + auto in_nchw_desc = 
make_ConstantTensorDescriptor_packed(Sequence{}); + auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence{}); + auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor( + in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); + + ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); + ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); + ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); + print_sequence("LeftPads", LeftPads{}); + print_sequence("RightPads", RightPads{}); + print_sequence("ConvStrides", ConvStrides{}); + print_sequence("ConvDilations", ConvDilations{}); + + using in_data_t = float; + using out_data_t = float; + Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); + Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); + Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); + Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(argc != 3) + { + printf("arg1: do_verification, arg2: nrepeat\n"); + exit(1); + } + + bool do_verification = atoi(argv[1]); + index_t nrepeat = atoi(argv[2]); + + if(do_verification) + { +#if 0 + in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); +#elif 0 + in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + wei_kcyx.GenerateTensorValue(GeneratorTensor_3{}, num_thread); +#elif 0 + in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread); + wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); +#elif 1 + in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); +#elif 0 + in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) 
* GeneratorTensor_Checkboard{}(is...); + }; + wei_kcyx.GenerateTensorValue(gen_wei, num_thread); +#endif + } + +#if 0 + device_convolution_direct_v2_nchw_kcyx_nkhw + (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); +#elif 0 + device_convolution_implicit_gemm_v1_chwn_cyxk_khwn( + in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); +#elif 0 + device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + LeftPads{}, + RightPads{}, + nrepeat); +#elif 0 + device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw( + in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); +#elif 0 + device_convolution_implicit_gemm_v2_chwn_cyxk_khwn( + in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); +#elif 0 + device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( + (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); +#elif 0 + device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); +#elif 1 + device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}, + nrepeat); +#elif 0 + device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); +#elif 0 + device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); +#elif 0 + device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); +#elif 0 + device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}, + nrepeat); +#endif + + if(do_verification) + { +#if 1 + if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 && + ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1) + { + host_winograd_3x3_convolution( + in_nchw, wei_kcyx, out_nkhw_host, LeftPads{}, RightPads{}); + } + else +#endif + { + host_direct_convolution(in_nchw, + wei_kcyx, + out_nkhw_host, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}); + } + check_error(out_nkhw_host, out_nkhw_device); + +#if 0 + LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; + LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl; + LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl; + LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl; +#endif + } +} diff --git a/driver/src/conv_driver.cu b/driver/src/conv_driver.cu new file mode 120000 index 00000000..6b7ee4e3 --- /dev/null +++ b/driver/src/conv_driver.cu @@ -0,0 +1 @@ +conv_driver.cpp \ No newline at end of file From 0a35a4e19aa11b8dce4f723eb17e28108519e9d0 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 13 Nov 2019 18:43:38 -0600 Subject: [PATCH 03/23] 
refactor host code --- ...e_convolution_direct_v2_nchw_kcyx_nkhw.hpp | 5 +- .../ConstantMatrixDescriptor.hpp | 2 +- .../tensor_description/tensor_coordinate.hpp | 4 +- .../tensor_coordinate_deprecated.hpp | 4 +- .../tensor_descriptor_helper.hpp | 12 ++-- driver/include/conv_common.hpp | 43 ++++++++---- driver/include/device_tensor.hpp | 28 ++++++++ driver/include/host_conv.hpp | 66 ------------------- driver/include/tensor.hpp | 45 +++++++++++-- driver/src/conv_driver.cpp | 4 +- driver/src/tensor.cpp | 6 +- 11 files changed, 118 insertions(+), 101 deletions(-) create mode 100644 driver/include/device_tensor.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp index aae74b61..bec97d28 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp @@ -93,8 +93,9 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( Sequence{}, wei_kcyx_block_desc.GetStrides()); - constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( - in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); + constexpr auto out_nkhw_thread_desc = + get_convolution_output_default_4d_tensor_descriptor_deprecated( + in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); // register Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index 0ebd9dc4..e2a5836e 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -60,7 +60,7 @@ __host__ __device__ constexpr auto template __host__ __device__ constexpr auto - make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) +make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) { using TDesc = ConstantTensorDescriptor_deprecated; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 66dda13c..4e5c5cc8 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -228,7 +228,7 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -236,7 +236,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index 69659445..da02abdd 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp 
@@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { return MergedTensorCoordinate_deprecated< ConstantMergedTensorDescriptor_deprecated>(); diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index d7ef3867..1597e4c5 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -64,10 +64,10 @@ template __host__ __device__ constexpr auto - reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, - Sequence, - Sequence, - Sequence) +reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) { return TransformedTensorDescriptor...>, @@ -78,7 +78,7 @@ __host__ __device__ constexpr auto // reorder a NativeTensorDescriptor template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) +reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); @@ -96,7 +96,7 @@ __host__ __device__ constexpr auto // reorder a TransformedTensorDescriptor template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) +reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! 
MapLower2Upper is not a valid map"); diff --git a/driver/include/conv_common.hpp b/driver/include/conv_common.hpp index f37645df..3213d7de 100644 --- a/driver/include/conv_common.hpp +++ b/driver/include/conv_common.hpp @@ -2,10 +2,16 @@ #define CONV_COMMON_HPP #include "ConstantTensorDescriptor_deprecated.hpp" +#include "tensor_descriptor.hpp" -// this is ugly, only for 4d -template -constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc) +template +constexpr auto get_convolution_output_default_4d_tensor_descriptor_deprecated( + InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads) { using namespace ck; @@ -22,18 +28,27 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDe static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1), "input & weight dimension not consistent"); - constexpr auto N = in_desc.GetLength(I0); - constexpr auto HI = in_desc.GetLength(I2); - constexpr auto WI = in_desc.GetLength(I3); + constexpr index_t N = in_desc.GetLength(I0); + constexpr index_t Hi = in_desc.GetLength(I2); + constexpr index_t Wi = in_desc.GetLength(I3); + + constexpr index_t K = wei_desc.GetLength(I0); + constexpr index_t Y = wei_desc.GetLength(I2); + constexpr index_t X = wei_desc.GetLength(I3); + + constexpr index_t HPadLow = LowerPads{}.Get(I0); + constexpr index_t WPadLow = LowerPads{}.Get(I1); - constexpr auto K = wei_desc.GetLength(I0); - constexpr auto Y = wei_desc.GetLength(I2); - constexpr auto X = wei_desc.GetLength(I3); + constexpr index_t HPadUp = UpperPads{}.Get(I0); + constexpr index_t WPadUp = UpperPads{}.Get(I1); - constexpr auto HO = HI + 1 - Y; - constexpr auto WO = WI + 1 - X; + constexpr index_t YEff = (Y - 1) * ConvDilations{}[0] + 1; + constexpr index_t XEff = (X - 1) * ConvDilations{}[1] + 1; + + constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1; + constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1; - return make_ConstantTensorDescriptor_packed(Sequence{}); + return make_ConstantTensorDescriptor_packed(Sequence{}); } template -constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor( +constexpr auto get_convolution_output_default_4d_tensor_descriptor( InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads) { using namespace ck; @@ -80,7 +95,7 @@ constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor( constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1; constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1; - return make_ConstantTensorDescriptor_packed(Sequence{}); + return make_native_tensor_descriptor_packed(Sequence{}); } template diff --git a/driver/include/device_tensor.hpp b/driver/include/device_tensor.hpp new file mode 100644 index 00000000..61a99bc9 --- /dev/null +++ b/driver/include/device_tensor.hpp @@ -0,0 +1,28 @@ +#pragma once +#include "tensor.hpp" +#include "common_header.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "tensor_descriptor.hpp" + +template +auto make_TensorDescriptor_impl(ConstTensorDesc, std::integer_sequence) +{ + std::initializer_list lengths = {ConstTensorDesc::GetLength(Is)...}; + std::initializer_list strides = {ConstTensorDesc::GetStride(Is)...}; + + return TensorDescriptor(lengths, strides); +} + +template +auto make_TensorDescriptor(ConstTensorDesc) +{ + return make_TensorDescriptor_impl( + ConstTensorDesc{}, + std::make_integer_sequence{}); +} + +template +void 
ostream_ConstantTensorDescriptor(ConstTensorDesc, std::ostream& os = std::cout) +{ + ostream_TensorDescriptor(make_TensorDescriptor(ConstTensorDesc{}), os); +} diff --git a/driver/include/host_conv.hpp b/driver/include/host_conv.hpp index 880fd5ef..ab932bb2 100644 --- a/driver/include/host_conv.hpp +++ b/driver/include/host_conv.hpp @@ -1,49 +1,5 @@ #pragma once #include "tensor.hpp" -#include "common_header.hpp" -#include "ConstantTensorDescriptor_deprecated.hpp" - -// this is ugly, only for 4d -template -void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout) -{ - using namespace ck; - - static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto desc = TConstTensorDesc{}; - - os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", " - << desc.GetLength(I2) << ", " << desc.GetLength(I3) << "}, " - << "Strides: {" << desc.GetStride(I0) << ", " << desc.GetStride(I1) << ", " - << desc.GetStride(I2) << ", " << desc.GetStride(I3) << "}" << std::endl; -} - -// this is ugly, only for 4d -template -auto make_TensorDescriptor(TConstTensorDesc) -{ - using namespace ck; - - static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto desc = TConstTensorDesc{}; - - std::initializer_list lengths = { - desc.GetLength(I0), desc.GetLength(I1), desc.GetLength(I2), desc.GetLength(I3)}; - std::initializer_list strides = { - desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)}; - - return TensorDescriptor(lengths, strides); -} template & in_nchw, make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread); make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread); } - -template -void check_error(const Tensor& ref, const Tensor& result) -{ - float error = 0; - float max_diff = -1; - float ref_value = 0, result_value = 0; - for(int i = 0; i < ref.mData.size(); ++i) - { - error += std::abs(double(ref.mData[i]) - double(result.mData[i])); - float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); - if(max_diff < diff) - { - max_diff = diff; - ref_value = ref.mData[i]; - result_value = result.mData[i]; - } - } - - std::cout << "error: " << error << std::endl; - std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl; -} diff --git a/driver/include/tensor.hpp b/driver/include/tensor.hpp index 9c42e83b..8c89c248 100644 --- a/driver/include/tensor.hpp +++ b/driver/include/tensor.hpp @@ -68,10 +68,12 @@ auto construct_f_unpack_args(F, T args) struct TensorDescriptor { TensorDescriptor() = delete; - TensorDescriptor(std::initializer_list lens); - TensorDescriptor(std::initializer_list lens, - std::initializer_list strides); - TensorDescriptor(std::vector lens, std::vector strides); + + template + TensorDescriptor(std::vector lens); + + template + TensorDescriptor(std::vector lens, std::vector strides); void CalculateStrides(); @@ -269,4 +271,39 @@ struct Tensor std::vector mData; }; +void ostream_TensorDescriptor(const TensorDescriptor& desc, std::ostream& os = std::cout) +{ + os << "dim " << desc.GetNumOfDimension() << ", "; + + os << "lengths {"; + LogRange(os, desc.GetLengths(), ", "); + os << "}, "; + + os << "strides {"; + LogRange(os, 
desc.GetStrides(), ", "); + os << "}" << std::endl; +} + +template +void check_error(const Tensor& ref, const Tensor& result) +{ + float error = 0; + float max_diff = -1; + float ref_value = 0, result_value = 0; + for(int i = 0; i < ref.mData.size(); ++i) + { + error += std::abs(double(ref.mData[i]) - double(result.mData[i])); + float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); + if(max_diff < diff) + { + max_diff = diff; + ref_value = ref.mData[i]; + result_value = result.mData[i]; + } + } + + std::cout << "error: " << error << std::endl; + std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl; +} + #endif diff --git a/driver/src/conv_driver.cpp b/driver/src/conv_driver.cpp index 80606939..9222e71a 100644 --- a/driver/src/conv_driver.cpp +++ b/driver/src/conv_driver.cpp @@ -11,6 +11,7 @@ #include "tensor_generator.hpp" #include "conv_common.hpp" #include "host_conv.hpp" +#include "device_tensor.hpp" //#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp" //#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp" //#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp" @@ -24,7 +25,6 @@ #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp" #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" - int main(int argc, char* argv[]) { using namespace ck; @@ -315,7 +315,7 @@ int main(int argc, char* argv[]) auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence{}); auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence{}); - auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor( + auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor_deprecated( in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); diff --git a/driver/src/tensor.cpp b/driver/src/tensor.cpp index 035f7b5e..24d2c772 100644 --- a/driver/src/tensor.cpp +++ b/driver/src/tensor.cpp @@ -3,12 +3,14 @@ #include "tensor.hpp" -TensorDescriptor::TensorDescriptor(std::initializer_list lens) : mLens(lens) +template +TensorDescriptor::TensorDescriptor(std::vector lens) : mLens(lens) { this->CalculateStrides(); } -TensorDescriptor::TensorDescriptor(std::vector lens, std::vector strides) +template +TensorDescriptor::TensorDescriptor(std::vector lens, std::vector strides) : mLens(lens), mStrides(strides) { } From 895e87c1d37a51631aaa354ae8c91ff1f191b04f Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 13 Nov 2019 18:43:56 -0600 Subject: [PATCH 04/23] adding col2im --- driver/src/col2im_driver.cpp | 384 +++++++++++++++++++++++++++++++++++ driver/src/col2im_driver.cu | 1 + 2 files changed, 385 insertions(+) create mode 100644 driver/src/col2im_driver.cpp create mode 120000 driver/src/col2im_driver.cu diff --git a/driver/src/col2im_driver.cpp b/driver/src/col2im_driver.cpp new file mode 100644 index 00000000..5f72db8d --- /dev/null +++ b/driver/src/col2im_driver.cpp @@ -0,0 +1,384 @@ +#include +#include +#include +#include +#include +#include "config.hpp" +#include "print_array.hpp" +#include "print_sequence.hpp" +#include "device.hpp" +#include "tensor_generator.hpp" +//#include "device_col2im.hpp" + +int main(int argc, char* argv[]) +{ + using namespace ck; + +#if 0 + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 
1; + constexpr index_t X = 7; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 3>; + using RightPads = Sequence<0, 3>; +#elif 0 + // 3x3, 34x34 + constexpr index_t N = 64; + constexpr index_t C = 256; + constexpr index_t HI = 34; + constexpr index_t WI = 34; + constexpr index_t K = 128; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% + constexpr index_t N = 64; + constexpr index_t C = 1536; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51% + constexpr index_t N = 128; + constexpr index_t C = 2048; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65% + constexpr index_t N = 128; + constexpr index_t C = 1280; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50% + constexpr index_t N = 128; + constexpr index_t C = 512; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61% + constexpr index_t N = 64; + constexpr index_t C = 1536; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 28x28 image + // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69% + constexpr index_t N = 128; + constexpr index_t C = 256; + constexpr index_t HI = 28; + constexpr index_t WI = 28; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = 
Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 17x17 input + // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76% + constexpr index_t N = 128; + constexpr index_t C = 768; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64% + constexpr index_t N = 128; + constexpr index_t C = 528; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75% + constexpr index_t N = 128; + constexpr index_t C = 528; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output + // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% + constexpr index_t N = 128; + constexpr index_t C = 288; + constexpr index_t HI = 35; + constexpr index_t WI = 35; + constexpr index_t K = 384; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<2, 2>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 5x5 filter, 2x2 pad, 7x7 input + constexpr index_t N = 128; + constexpr index_t C = 48; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 128; + constexpr index_t Y = 5; + constexpr index_t X = 5; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<2, 2>; + using RightPads = Sequence<2, 2>; +#elif 0 + // 7x1 filter, 3x0 pad, 17x17 input + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 7; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; 
+ + using LeftPads = Sequence<3, 0>; + using RightPads = Sequence<3, 0>; +#elif 1 + // 1x7 filter, 0x3 pad, 17x17 input + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 7; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 3>; + using RightPads = Sequence<0, 3>; +#endif + +#if 0 + auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence{}); + auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence{}); + auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor_deprecated( + in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); + + ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); + ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); + ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); + print_sequence("LeftPads", LeftPads{}); + print_sequence("RightPads", RightPads{}); + print_sequence("ConvStrides", ConvStrides{}); + print_sequence("ConvDilations", ConvDilations{}); + + using in_data_t = float; + using out_data_t = float; + Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); + Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); + Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); + Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); +#else + auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); + auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); + auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor( + in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); + + auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence{}); + + ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); + ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); + ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); + ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: "); + print_sequence("LeftPads", LeftPads{}); + print_sequence("RightPads", RightPads{}); + print_sequence("ConvStrides", ConvStrides{}); + print_sequence("ConvDilations", ConvDilations{}); +#endif + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(argc != 3) + { + printf("arg1: do_verification, arg2: nrepeat\n"); + exit(1); + } + + bool do_verification = atoi(argv[1]); + index_t nrepeat = atoi(argv[2]); + + if(do_verification) + { + in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + } + + device_col2im(in_nchw_desc, + in_nchw, + in_eb_desc, + in_eb, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}, + nrepeat); + + if(do_verification) + { + host_direct_convolution(in_nchw, + wei_kcyx, + out_nkhw_host, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}); + + check_error(out_nkhw_host, out_nkhw_device); + +#if 0 + LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; + LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl; + LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl; + LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl; +#endif + } +} diff --git a/driver/src/col2im_driver.cu b/driver/src/col2im_driver.cu new file mode 
120000 index 00000000..8d388393 --- /dev/null +++ b/driver/src/col2im_driver.cu @@ -0,0 +1 @@ +col2im_driver.cpp \ No newline at end of file From ad3ac5cce07719c15a36b9b630e35e629a80b0ab Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 15 Nov 2019 14:02:12 -0600 Subject: [PATCH 05/23] adding col2im --- driver/CMakeLists.txt | 10 +- ...it_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp | 40 +- driver/include/device_tensor.hpp | 4 +- driver/include/host_col2im.hpp | 71 ++++ driver/src/col2im_driver.cpp | 106 ++--- driver/src/col2im_driver.cu | 387 +++++++++++++++++- driver/src/conv_driver.cpp | 24 +- 7 files changed, 570 insertions(+), 72 deletions(-) create mode 100644 driver/include/host_col2im.hpp mode change 120000 => 100644 driver/src/col2im_driver.cu diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 56745e0e..72657a50 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -15,10 +15,14 @@ install(TARGETS host LIBRARY DESTINATION lib) if(DEVICE_BACKEND STREQUAL "AMD") - set(DRIVER_SOURCE src/conv_driver.cpp) + set(CONV_SOURCE src/conv_driver.cpp) + set(COL2IM_SOURCE src/col2im_driver.cpp) elseif(DEVICE_BACKEND STREQUAL "NVIDIA") - set(DRIVER_SOURCE src/conv_driver.cu) + set(CONV_SOURCE src/conv_driver.cu) + set(COL2IM_SOURCE src/col2im_driver.cu) endif() -add_executable(conv ${DRIVER_SOURCE}) +add_executable(conv ${CONV_SOURCE}) +add_executable(col2im ${COL2IM_SOURCE}) target_link_libraries(conv PRIVATE host) +target_link_libraries(col2im PRIVATE host) diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp index 626dd77d..f741b4ab 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp @@ -46,7 +46,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 1 +#if 0 // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data constexpr index_t BlockSize = 256; @@ -120,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; -#elif 1 +#elif 0 // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data constexpr index_t BlockSize = 256; @@ -157,6 +157,42 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 2; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; +#elif 1 + constexpr index_t BlockSize = 64; + + constexpr index_t BPerBlock = 16; + constexpr index_t KPerBlock = 32; + constexpr index_t EPerBlock = 4; + + constexpr index_t GemmNRepeat = 2; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 1; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 2, 1, 4>; + using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<4, 1, 16, 1>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using 
InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2] + using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] + + constexpr index_t InBlockCopySrcDataPerRead_B = 1; + constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; + + using WeiBlockCopySubLengths_E_K = Sequence<1, 2>; + using WeiBlockCopyClusterLengths_E_K = Sequence<4, 16>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + + constexpr index_t WeiBlockCopySrcDataPerRead_E = 1; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2; #endif constexpr index_t N1 = GemmNRepeat; diff --git a/driver/include/device_tensor.hpp b/driver/include/device_tensor.hpp index 61a99bc9..e3a1c258 100644 --- a/driver/include/device_tensor.hpp +++ b/driver/include/device_tensor.hpp @@ -7,8 +7,8 @@ template auto make_TensorDescriptor_impl(ConstTensorDesc, std::integer_sequence) { - std::initializer_list lengths = {ConstTensorDesc::GetLength(Is)...}; - std::initializer_list strides = {ConstTensorDesc::GetStride(Is)...}; + std::initializer_list lengths = {ConstTensorDesc::GetLengths()[Is]...}; + std::initializer_list strides = {ConstTensorDesc::GetStrides()[Is]...}; return TensorDescriptor(lengths, strides); } diff --git a/driver/include/host_col2im.hpp b/driver/include/host_col2im.hpp new file mode 100644 index 00000000..d902c27f --- /dev/null +++ b/driver/include/host_col2im.hpp @@ -0,0 +1,71 @@ +#pragma once +#include "tensor.hpp" + +template +void host_col2im(const Tensor& in_eb, + Tensor& in_nchw, + FilterSizes, + OutputSizes, + ConvStrides, + ConvDilations, + LeftPads, + RightPads) +{ + using namespace ck; + + int N = in_nchw.mDesc.GetLengths()[0]; + int C = in_nchw.mDesc.GetLengths()[1]; + int HI = in_nchw.mDesc.GetLengths()[2]; + int WI = in_nchw.mDesc.GetLengths()[3]; + + int Y = FilterSizes{}[0]; + int X = FilterSizes{}[1]; + + int HO = OutputSizes{}[0]; + int WO = OutputSizes{}[1]; + + auto f = [&](auto n, auto c, auto hi, auto wi) { + double v = 0; + + for(int y = 0; y < Y; ++y) + { + int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0]; + + if(h_tmp % ConvStrides{}[0] == 0) + { + int ho = h_tmp / ConvStrides{}[0]; + + for(int x = 0; x < X; ++x) + { + int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1]; + + if(w_tmp % ConvStrides{}[1] == 0) + { + int wo = w_tmp / ConvStrides{}[1]; + + int e = c * (Y * X) + y * X + x; + int b = n * (HO * WO) + ho * WO + wo; + + v += in_eb(e, b); + } + } + } + } + + in_nchw(n, c, hi, wi) = v; + }; + + auto f_par = make_ParallelTensorFunctor(f, + in_nchw.mDesc.GetLengths()[0], + in_nchw.mDesc.GetLengths()[1], + in_nchw.mDesc.GetLengths()[2], + in_nchw.mDesc.GetLengths()[3]); + + f_par(std::thread::hardware_concurrency()); +} diff --git a/driver/src/col2im_driver.cpp b/driver/src/col2im_driver.cpp index 5f72db8d..89fd47aa 100644 --- a/driver/src/col2im_driver.cpp +++ b/driver/src/col2im_driver.cpp @@ -4,30 +4,35 @@ #include #include #include "config.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "print_array.hpp" #include "print_sequence.hpp" #include "device.hpp" #include "tensor_generator.hpp" +#include "device_tensor.hpp" +#include "conv_common.hpp" +#include "host_col2im.hpp" //#include "device_col2im.hpp" int main(int argc, char* argv[]) { using namespace ck; -#if 0 - constexpr index_t N = 128; - constexpr index_t C = 128; +#if 1 + constexpr index_t N = 1; + constexpr index_t C = 
1; constexpr index_t HI = 17; constexpr index_t WI = 17; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 7; + constexpr index_t K = 1; + constexpr index_t Y = 3; + constexpr index_t X = 3; using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - using LeftPads = Sequence<0, 3>; - using RightPads = Sequence<0, 3>; + using LeftPads = Sequence<1, 1>; + using RightPads = Sequence<1, 1>; #elif 0 // 3x3, 34x34 constexpr index_t N = 64; @@ -298,43 +303,32 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 3>; #endif -#if 0 - auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence{}); - auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence{}); - auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor_deprecated( + constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor( in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); - ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); - ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); - ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - print_sequence("LeftPads", LeftPads{}); - print_sequence("RightPads", RightPads{}); - print_sequence("ConvStrides", ConvStrides{}); - print_sequence("ConvDilations", ConvDilations{}); + constexpr index_t HO = out_nkhw_desc.GetLengths()[2]; + constexpr index_t WO = out_nkhw_desc.GetLengths()[3]; - using in_data_t = float; - using out_data_t = float; - Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); - Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); - Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); - Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); -#else - auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); - auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); - auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor( - in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); + auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence{}); - auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence{}); + using FilterSizes = Sequence; + using OutputSizes = Sequence; ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); - ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); - ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: "); + print_sequence("FilterSizes", FilterSizes{}); + print_sequence("OutputSizes", OutputSizes{}); + print_sequence("LeftPads", LeftPads{}); print_sequence("LeftPads", LeftPads{}); print_sequence("RightPads", RightPads{}); print_sequence("ConvStrides", ConvStrides{}); print_sequence("ConvDilations", ConvDilations{}); -#endif + + Tensor in_eb(make_TensorDescriptor(in_eb_desc)); + Tensor in_nchw_host(make_TensorDescriptor(in_nchw_desc)); + Tensor in_nchw_device(make_TensorDescriptor(in_nchw_desc)); std::size_t num_thread = std::thread::hardware_concurrency(); @@ -349,36 +343,44 @@ int main(int argc, char* argv[]) if(do_verification) { +#if 1 + in_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread); +#else 
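// Aside: with GeneratorTensor_1 (the branch above) every col entry is 1, so a
// correct col2im yields, at each image pixel, the number of sliding-window
// positions that cover that pixel -- easy to verify by eye. A minimal host-side
// sketch of that expected count, assuming host_col2im's stride/dilation/pad
// convention with the bounds written against the output size; the function name
// and free-standing form are illustrative:
int expected_coverage(int hi, int wi, int Y, int X, int HO, int WO,
                      int stride_h, int stride_w, int dil_h, int dil_w,
                      int pad_h, int pad_w)
{
    int count = 0;
    for(int y = 0; y < Y; ++y)
    {
        int h_tmp = hi + pad_h - y * dil_h;
        if(h_tmp < 0 || h_tmp % stride_h != 0 || h_tmp / stride_h >= HO)
            continue;
        for(int x = 0; x < X; ++x)
        {
            int w_tmp = wi + pad_w - x * dil_w;
            if(w_tmp >= 0 && w_tmp % stride_w == 0 && w_tmp / stride_w < WO)
                ++count; // one (y, x, ho, wo) combination accumulates into (hi, wi)
        }
    }
    return count; // independent of n and c, so both are omitted
}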
in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); +#endif } - device_col2im(in_nchw_desc, - in_nchw, - in_eb_desc, +#if 0 + device_col2im(in_eb_desc, in_eb, + in_nchw_desc, + in_nchw_device, + FilterSizes{}, + OutputSizes{}, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}, nrepeat); +#endif if(do_verification) { - host_direct_convolution(in_nchw, - wei_kcyx, - out_nkhw_host, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}); - - check_error(out_nkhw_host, out_nkhw_device); - -#if 0 - LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; - LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl; - LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl; - LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl; + host_col2im(in_eb, + in_nchw_host, + FilterSizes{}, + OutputSizes{}, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}); + + check_error(in_nchw_host, in_nchw_device); + +#if 1 + LogRange(std::cout << "in_eb : ", in_eb.mData, ",") << std::endl; + LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl; + LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl; #endif } } diff --git a/driver/src/col2im_driver.cu b/driver/src/col2im_driver.cu deleted file mode 120000 index 8d388393..00000000 --- a/driver/src/col2im_driver.cu +++ /dev/null @@ -1 +0,0 @@ -col2im_driver.cpp \ No newline at end of file diff --git a/driver/src/col2im_driver.cu b/driver/src/col2im_driver.cu new file mode 100644 index 00000000..89fd47aa --- /dev/null +++ b/driver/src/col2im_driver.cu @@ -0,0 +1,386 @@ +#include +#include +#include +#include +#include +#include "config.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "print_array.hpp" +#include "print_sequence.hpp" +#include "device.hpp" +#include "tensor_generator.hpp" +#include "device_tensor.hpp" +#include "conv_common.hpp" +#include "host_col2im.hpp" +//#include "device_col2im.hpp" + +int main(int argc, char* argv[]) +{ + using namespace ck; + +#if 1 + constexpr index_t N = 1; + constexpr index_t C = 1; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 1; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<1, 1>; + using RightPads = Sequence<1, 1>; +#elif 0 + // 3x3, 34x34 + constexpr index_t N = 64; + constexpr index_t C = 256; + constexpr index_t HI = 34; + constexpr index_t WI = 34; + constexpr index_t K = 128; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% + constexpr index_t N = 64; + constexpr index_t C = 1536; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51% + constexpr index_t N = 128; + constexpr index_t C = 2048; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + 
constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65% + constexpr index_t N = 128; + constexpr index_t C = 1280; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50% + constexpr index_t N = 128; + constexpr index_t C = 512; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61% + constexpr index_t N = 64; + constexpr index_t C = 1536; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 28x28 image + // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69% + constexpr index_t N = 128; + constexpr index_t C = 256; + constexpr index_t HI = 28; + constexpr index_t WI = 28; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 17x17 input + // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76% + constexpr index_t N = 128; + constexpr index_t C = 768; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64% + constexpr index_t N = 128; + constexpr index_t C = 528; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr 
index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75% + constexpr index_t N = 128; + constexpr index_t C = 528; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output + // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% + constexpr index_t N = 128; + constexpr index_t C = 288; + constexpr index_t HI = 35; + constexpr index_t WI = 35; + constexpr index_t K = 384; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<2, 2>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 5x5 filter, 2x2 pad, 7x7 input + constexpr index_t N = 128; + constexpr index_t C = 48; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 128; + constexpr index_t Y = 5; + constexpr index_t X = 5; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<2, 2>; + using RightPads = Sequence<2, 2>; +#elif 0 + // 7x1 filter, 3x0 pad, 17x17 input + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 7; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<3, 0>; + using RightPads = Sequence<3, 0>; +#elif 1 + // 1x7 filter, 0x3 pad, 17x17 input + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 7; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 3>; + using RightPads = Sequence<0, 3>; +#endif + + constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor( + in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); + + constexpr index_t HO = out_nkhw_desc.GetLengths()[2]; + constexpr index_t WO = out_nkhw_desc.GetLengths()[3]; + + auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence{}); + + using FilterSizes = Sequence; + using OutputSizes = Sequence; + + ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); + ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: "); + 
print_sequence("FilterSizes", FilterSizes{}); + print_sequence("OutputSizes", OutputSizes{}); + print_sequence("LeftPads", LeftPads{}); + print_sequence("LeftPads", LeftPads{}); + print_sequence("RightPads", RightPads{}); + print_sequence("ConvStrides", ConvStrides{}); + print_sequence("ConvDilations", ConvDilations{}); + + Tensor in_eb(make_TensorDescriptor(in_eb_desc)); + Tensor in_nchw_host(make_TensorDescriptor(in_nchw_desc)); + Tensor in_nchw_device(make_TensorDescriptor(in_nchw_desc)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(argc != 3) + { + printf("arg1: do_verification, arg2: nrepeat\n"); + exit(1); + } + + bool do_verification = atoi(argv[1]); + index_t nrepeat = atoi(argv[2]); + + if(do_verification) + { +#if 1 + in_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread); +#else + in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); +#endif + } + +#if 0 + device_col2im(in_eb_desc, + in_eb, + in_nchw_desc, + in_nchw_device, + FilterSizes{}, + OutputSizes{}, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}, + nrepeat); +#endif + + if(do_verification) + { + host_col2im(in_eb, + in_nchw_host, + FilterSizes{}, + OutputSizes{}, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}); + + check_error(in_nchw_host, in_nchw_device); + +#if 1 + LogRange(std::cout << "in_eb : ", in_eb.mData, ",") << std::endl; + LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl; + LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl; +#endif + } +} diff --git a/driver/src/conv_driver.cpp b/driver/src/conv_driver.cpp index 9222e71a..d5e608c3 100644 --- a/driver/src/conv_driver.cpp +++ b/driver/src/conv_driver.cpp @@ -29,20 +29,20 @@ int main(int argc, char* argv[]) { using namespace ck; -#if 0 - constexpr index_t N = 128; - constexpr index_t C = 128; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 7; +#if 1 + constexpr index_t N = 8; + constexpr index_t C = 32; + constexpr index_t HI = 28; + constexpr index_t WI = 28; + constexpr index_t K = 32; + constexpr index_t Y = 5; + constexpr index_t X = 5; using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; + using ConvDilations = Sequence<2, 2>; - using LeftPads = Sequence<0, 3>; - using RightPads = Sequence<0, 3>; + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; #elif 0 // 3x3, 34x34 constexpr index_t N = 64; @@ -393,7 +393,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc, in_nchw, wei_kcyx_desc, From 6ed962659a1bb1dd2f460d6c998bb7de41abae57 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Nov 2019 18:06:09 -0600 Subject: [PATCH 06/23] added col2im --- .../include/gridwise_operation_wrapper.hpp | 10 ++ .../gridwise_col2im_eb_nchw.hpp | 126 ++++++++++++++++++ .../blockwise_generic_tensor_slice_copy.hpp | 10 ++ .../threadwise_generic_tensor_slice_copy.hpp | 35 +++-- driver/include/device_col2im_eb_nchw.hpp | 108 +++++++++++++++ ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 17 ++- driver/include/host_col2im.hpp | 4 +- driver/include/tensor_generator.hpp | 4 +- driver/src/col2im_driver.cpp | 77 +++++------ driver/src/col2im_driver.cu | 83 ++++++------ 
driver/src/conv_driver.cpp | 8 +- 11 files changed, 379 insertions(+), 103 deletions(-) create mode 100644 composable_kernel/include/gridwise_operation_wrapper.hpp create mode 100644 composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp create mode 100644 driver/include/device_col2im_eb_nchw.hpp diff --git a/composable_kernel/include/gridwise_operation_wrapper.hpp b/composable_kernel/include/gridwise_operation_wrapper.hpp new file mode 100644 index 00000000..fa4c4008 --- /dev/null +++ b/composable_kernel/include/gridwise_operation_wrapper.hpp @@ -0,0 +1,10 @@ +#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER +#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER + +template <typename GridwiseOp, typename... Xs> +__global__ void run_gridwise_operation(GridwiseOp& gridwise_op, Xs... xs) +{ + gridwise_op.Run(xs...); +} + +#endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp new file mode 100644 index 00000000..ce5cc79d --- /dev/null +++ b/composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp @@ -0,0 +1,126 @@ +#ifndef CK_GRIDWISE_COL2IM_EB_NCHW_HPP +#define CK_GRIDWISE_COL2IM_EB_NCHW_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" + +namespace ck { + +// B = merge(N, Ho, Wo) +template <index_t GridSize, index_t BlockSize, typename Float, typename ColGlobalDesc, typename ImgGlobalDesc, typename FilterSizes, typename OutputSizes, typename ConvStrides, typename ConvDilations, typename LeftPads, typename RightPads, index_t EPerBlock, index_t BPerBlock, typename BlockCopySubLengths_E_B, typename BlockCopyClusterLengths_E_B, typename BlockCopyThreadClusterArrangeOrder, typename BlockCopySrcAccessOrder, typename BlockCopyDstAccessOrder, index_t BlockCopyDataPerAccess_B> +struct GridwiseCol2Im_eb_nchw +{ + __device__ void Run(const Float* const __restrict__ p_col_global, + Float* const __restrict__ p_img_global) const + { + constexpr auto col_e_b_global_desc = ColGlobalDesc{}; + constexpr auto img_n_c_hi_wi_global_desc = ImgGlobalDesc{}; + + constexpr index_t N = img_n_c_hi_wi_global_desc.GetLengths()[0]; + constexpr index_t C = img_n_c_hi_wi_global_desc.GetLengths()[1]; + constexpr index_t Hi = img_n_c_hi_wi_global_desc.GetLengths()[2]; + constexpr index_t Wi = img_n_c_hi_wi_global_desc.GetLengths()[3]; + + constexpr index_t Ho = OutputSizes{}[0]; + constexpr index_t Wo = OutputSizes{}[1]; + + constexpr index_t Y = FilterSizes{}[0]; + constexpr index_t X = FilterSizes{}[1]; + + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + constexpr index_t E = C * Y * X; + constexpr index_t B = N * Ho * Wo; + + // sanity-check for vectorized memory load + static_assert((Wo == 1 || (ConvStrideW == 1 || BlockCopyDataPerAccess_B == 1)) && + (X == 1 || ConvDilationW % BlockCopyDataPerAccess_B == 0), + "wrong! alignment requirement for vectorized global load of input tensor will " + "be violated"); + + // divide block work by [E, B] + static_assert(E % EPerBlock == 0 && B % BPerBlock == 0, + "wrong! 
cannot divide work evenly among block"); + + constexpr index_t EBlockWork = E / EPerBlock; + constexpr index_t BBlockWork = B / BPerBlock; + + constexpr auto block_work_desc = + make_cluster_descriptor(Sequence<EBlockWork, BBlockWork>{}); + + const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); + + const index_t e_block_data_on_global = block_work_id[0] * EPerBlock; + const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; + + // construct img_eb_global_desc + constexpr auto img_n_c_hip_wip_global_desc = transform_tensor_descriptor( + img_n_c_hi_wi_global_desc, + make_tuple( + PassThrough<N>{}, PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + + constexpr auto img_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( + img_n_c_hip_wip_global_desc, + make_tuple(PassThrough<N>{}, + PassThrough<C>{}, + Embed<Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{}, + Embed<Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + constexpr auto img_e_b_global_desc = transform_tensor_descriptor( + img_n_c_y_ho_x_wo_global_desc, + make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // blockwise atomic accumulation + auto blockwise_copy = BlockwiseGenericTensorSliceCopy_v4<BlockSize, + decltype(col_e_b_global_desc), + decltype(img_e_b_global_desc), + Sequence<EPerBlock, BPerBlock>, + BlockCopySubLengths_E_B, + BlockCopyClusterLengths_E_B, + BlockCopyThreadClusterArrangeOrder, + BlockCopySrcAccessOrder, + BlockCopyDstAccessOrder, + 1, + 1, + BlockCopyDataPerAccess_B, + BlockCopyDataPerAccess_B>( + {e_block_data_on_global, b_block_data_on_global}, + {e_block_data_on_global, b_block_data_on_global}); + + // blockwise copy + blockwise_copy.Run(p_col_global, p_img_global); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index d31b3902..e234558f 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -134,8 +134,18 @@ struct BlockwiseGenericTensorSliceCopy_v4 } else { +#if 0 // debug mThreadwiseStore.Run( p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); +#else + constexpr auto True = integral_constant<bool, true>{}; + + mThreadwiseStore.Run(p_thread_buffer, + p_block_dst, + thread_buffer_address_space, + block_dst_address_space, + True); +#endif } } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 6a98c783..ab183ca3 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -69,11 +69,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 template <typename SrcData, typename DstData, AddressSpace SrcAddressSpace, - AddressSpace DstAddressSpace> + AddressSpace DstAddressSpace, + bool DoAtomicAdd = false> __device__ void Run(const SrcData* p_src, DstData* p_dst, integral_constant<AddressSpace, SrcAddressSpace>, - integral_constant<AddressSpace, DstAddressSpace>) const + integral_constant<AddressSpace, DstAddressSpace>, + integral_constant<bool, DoAtomicAdd> do_atomic_add = + integral_constant<bool, DoAtomicAdd>{}) const { using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType; using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType; @@ -160,21 +163,27 @@ struct 
ThreadwiseGenericTensorSliceCopy_v4r2 // has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto fwd) { + static_if{}([&](auto) { + static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING - amd_intrinsic_buffer_store( - *reinterpret_cast(&p_dst_long_vector[buffer_offset]), - fwd(p_dst), - dst_coord.GetOffset(), - 0); + amd_intrinsic_buffer_store( + *reinterpret_cast(&p_dst_long_vector[buffer_offset]), + fwd(p_dst), + dst_coord.GetOffset(), + 0); #else - *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); + *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = + *reinterpret_cast(&p_dst_long_vector[buffer_offset]); #endif + }).Else([&](auto) { + // dst can be all kinds of memory-space + *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = + *reinterpret_cast(&p_dst_long_vector[buffer_offset]); + }); }).Else([&](auto) { - // dst can be all kinds of memory-space - *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); + atomicAdd( + reinterpret_cast(&p_dst[dst_coord.GetOffset()]), + *reinterpret_cast(&p_dst_long_vector[buffer_offset])); }); } } diff --git a/driver/include/device_col2im_eb_nchw.hpp b/driver/include/device_col2im_eb_nchw.hpp new file mode 100644 index 00000000..187cb4ea --- /dev/null +++ b/driver/include/device_col2im_eb_nchw.hpp @@ -0,0 +1,108 @@ +#pragma once +#include +#include "device.hpp" +#include "tensor.hpp" +#include "gridwise_operation_wrapper.hpp" +#include "gridwise_col2im_eb_nchw.hpp" + +template +void device_col2im_eb_nchw(ColDesc, + const Tensor& col_eb, + ImgDesc, + Tensor& img_nchw, + FilterSizes, + OutputSizes, + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + std::size_t nrepeat) +{ + using namespace ck; + + constexpr auto col_eb_desc = ColDesc{}; + constexpr auto img_nchw_desc = ImgDesc{}; + + constexpr index_t N = img_nchw_desc.GetLengths()[0]; + constexpr index_t C = img_nchw_desc.GetLengths()[1]; + constexpr index_t Hi = img_nchw_desc.GetLengths()[2]; + constexpr index_t Wi = img_nchw_desc.GetLengths()[3]; + + constexpr index_t E = col_eb_desc.GetLengths()[0]; + constexpr index_t B = col_eb_desc.GetLengths()[1]; + + std::size_t data_sz = sizeof(T); + DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace()); + DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace()); + + col_eb_device_buf.ToDevice(col_eb.mData.data()); + img_nchw_device_buf.ToDevice(img_nchw.mData.data()); + +#if 1 + constexpr index_t BlockSize = 256; + + constexpr index_t EPerBlock = 128; + constexpr index_t BPerBlock = 128; + + using BlockCopySubLengths_E_B = Sequence<8, 8>; + using BlockCopyClusterLengths_E_B = Sequence<16, 16>; + using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B] + using BlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B] + using BlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B] + + constexpr index_t BlockCopyDataPerAccess_B = 1; +#endif + + constexpr index_t GridSize = + ((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + constexpr auto gridwise_col2im = GridwiseCol2Im_eb_nchw{}; + + for(index_t i = 0; i < nrepeat; ++i) + { + float time = launch_kernel(run_gridwise_operation, + dim3(GridSize), + dim3(BlockSize), + 0, + gridwise_col2im, + const_cast( + static_cast(col_eb_device_buf.GetDeviceBuffer())), + const_cast( + 
static_cast(img_nchw_device_buf.GetDeviceBuffer()))); + + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); + } + + img_nchw_device_buf.FromDevice(img_nchw.mData.data()); +} diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index ccff9e72..ba81ad59 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -2,7 +2,7 @@ #include #include "device.hpp" #include "tensor.hpp" -#include "gridwise_convolution_kernel_wrapper.hpp" +#include "gridwise_operation_wrapper.hpp" #include "convolution_common.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp" @@ -221,13 +221,20 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, for(index_t i = 0; i < nrepeat; ++i) { - float time = launch_kernel(run_gridwise_convolution_kernel, + float time = launch_kernel(run_gridwise_operation, dim3(GridSize), dim3(BlockSize), 0, - static_cast(in_nchw_device_buf.GetDeviceBuffer()), - static_cast(wei_kcyx_device_buf.GetDeviceBuffer()), - static_cast(out_nkhw_device_buf.GetDeviceBuffer())); + gridwise_conv, + const_cast( + static_cast(in_nchw_device_buf.GetDeviceBuffer())), + const_cast( + static_cast(wei_kcyx_device_buf.GetDeviceBuffer())), + const_cast( + static_cast(out_nkhw_device_buf.GetDeviceBuffer()))); printf("Elapsed time : %f ms, %f TFlop/s\n", time, diff --git a/driver/include/host_col2im.hpp b/driver/include/host_col2im.hpp index d902c27f..e23540d8 100644 --- a/driver/include/host_col2im.hpp +++ b/driver/include/host_col2im.hpp @@ -37,7 +37,7 @@ void host_col2im(const Tensor& in_eb, { int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0]; - if(h_tmp % ConvStrides{}[0] == 0) + if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0) { int ho = h_tmp / ConvStrides{}[0]; @@ -45,7 +45,7 @@ void host_col2im(const Tensor& in_eb, { int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1]; - if(w_tmp % ConvStrides{}[1] == 0) + if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0) { int wo = w_tmp / ConvStrides{}[1]; diff --git a/driver/include/tensor_generator.hpp b/driver/include/tensor_generator.hpp index 7699608d..15469ba6 100644 --- a/driver/include/tensor_generator.hpp +++ b/driver/include/tensor_generator.hpp @@ -5,10 +5,12 @@ struct GeneratorTensor_1 { + int value = 1; + template double operator()(Is... 
is) { - return 1; + return value; } }; diff --git a/driver/src/col2im_driver.cpp b/driver/src/col2im_driver.cpp index 89fd47aa..a3af08f7 100644 --- a/driver/src/col2im_driver.cpp +++ b/driver/src/col2im_driver.cpp @@ -13,26 +13,26 @@ #include "device_tensor.hpp" #include "conv_common.hpp" #include "host_col2im.hpp" -//#include "device_col2im.hpp" +#include "device_col2im_eb_nchw.hpp" int main(int argc, char* argv[]) { using namespace ck; #if 1 - constexpr index_t N = 1; - constexpr index_t C = 1; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 1; - constexpr index_t Y = 3; - constexpr index_t X = 3; + constexpr index_t N = 2; + constexpr index_t C = 8; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 128; + constexpr index_t Y = 4; + constexpr index_t X = 4; using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; using LeftPads = Sequence<1, 1>; - using RightPads = Sequence<1, 1>; + using RightPads = Sequence<2, 2>; #elif 0 // 3x3, 34x34 constexpr index_t N = 64; @@ -303,21 +303,22 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 3>; #endif - constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor( - in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); + img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); constexpr index_t HO = out_nkhw_desc.GetLengths()[2]; constexpr index_t WO = out_nkhw_desc.GetLengths()[3]; - auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto col_eb_desc = + make_native_tensor_descriptor_packed(Sequence{}); using FilterSizes = Sequence; using OutputSizes = Sequence; - ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); - ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: "); + ostream_ConstantTensorDescriptor(col_eb_desc, std::cout << "col_eb_desc: "); + ostream_ConstantTensorDescriptor(img_nchw_desc, std::cout << "img_nchw_desc: "); print_sequence("FilterSizes", FilterSizes{}); print_sequence("OutputSizes", OutputSizes{}); print_sequence("LeftPads", LeftPads{}); @@ -326,9 +327,9 @@ int main(int argc, char* argv[]) print_sequence("ConvStrides", ConvStrides{}); print_sequence("ConvDilations", ConvDilations{}); - Tensor in_eb(make_TensorDescriptor(in_eb_desc)); - Tensor in_nchw_host(make_TensorDescriptor(in_nchw_desc)); - Tensor in_nchw_device(make_TensorDescriptor(in_nchw_desc)); + Tensor col_eb(make_TensorDescriptor(col_eb_desc)); + Tensor img_nchw_host(make_TensorDescriptor(img_nchw_desc)); + Tensor img_nchw_device(make_TensorDescriptor(img_nchw_desc)); std::size_t num_thread = std::thread::hardware_concurrency(); @@ -339,35 +340,35 @@ int main(int argc, char* argv[]) } bool do_verification = atoi(argv[1]); - index_t nrepeat = atoi(argv[2]); + std::size_t nrepeat = atoi(argv[2]); if(do_verification) { #if 1 - in_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread); #else - in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); #endif } -#if 0 - device_col2im(in_eb_desc, - in_eb, - in_nchw_desc, - in_nchw_device, - 
FilterSizes{}, - OutputSizes{}, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}, - nrepeat); +#if 1 + device_col2im_eb_nchw(col_eb_desc, + col_eb, + img_nchw_desc, + img_nchw_device, + FilterSizes{}, + OutputSizes{}, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}, + nrepeat); #endif if(do_verification) { - host_col2im(in_eb, - in_nchw_host, + host_col2im(col_eb, + img_nchw_host, FilterSizes{}, OutputSizes{}, ConvStrides{}, @@ -375,12 +376,12 @@ int main(int argc, char* argv[]) LeftPads{}, RightPads{}); - check_error(in_nchw_host, in_nchw_device); + check_error(img_nchw_host, img_nchw_device); #if 1 - LogRange(std::cout << "in_eb : ", in_eb.mData, ",") << std::endl; - LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl; - LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl; + LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl; + LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl; + LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl; #endif } } diff --git a/driver/src/col2im_driver.cu b/driver/src/col2im_driver.cu index 89fd47aa..45f854a9 100644 --- a/driver/src/col2im_driver.cu +++ b/driver/src/col2im_driver.cu @@ -13,26 +13,26 @@ #include "device_tensor.hpp" #include "conv_common.hpp" #include "host_col2im.hpp" -//#include "device_col2im.hpp" +#include "device_col2im_eb_nchw.hpp" int main(int argc, char* argv[]) { using namespace ck; #if 1 - constexpr index_t N = 1; - constexpr index_t C = 1; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 1; - constexpr index_t Y = 3; - constexpr index_t X = 3; + constexpr index_t N = 2; + constexpr index_t C = 8; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 128; + constexpr index_t Y = 4; + constexpr index_t X = 4; using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; using LeftPads = Sequence<1, 1>; - using RightPads = Sequence<1, 1>; + using RightPads = Sequence<2, 2>; #elif 0 // 3x3, 34x34 constexpr index_t N = 64; @@ -303,21 +303,22 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 3>; #endif - constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor( - in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); + img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); constexpr index_t HO = out_nkhw_desc.GetLengths()[2]; constexpr index_t WO = out_nkhw_desc.GetLengths()[3]; - auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto col_eb_desc = + make_native_tensor_descriptor_packed(Sequence{}); using FilterSizes = Sequence; using OutputSizes = Sequence; - ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); - ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: "); + ostream_ConstantTensorDescriptor(col_eb_desc, std::cout << "col_eb_desc: "); + ostream_ConstantTensorDescriptor(img_nchw_desc, std::cout << "img_nchw_desc: "); print_sequence("FilterSizes", FilterSizes{}); print_sequence("OutputSizes", OutputSizes{}); print_sequence("LeftPads", LeftPads{}); @@ -326,9 +327,9 @@ int 
main(int argc, char* argv[]) print_sequence("ConvStrides", ConvStrides{}); print_sequence("ConvDilations", ConvDilations{}); - Tensor in_eb(make_TensorDescriptor(in_eb_desc)); - Tensor in_nchw_host(make_TensorDescriptor(in_nchw_desc)); - Tensor in_nchw_device(make_TensorDescriptor(in_nchw_desc)); + Tensor col_eb(make_TensorDescriptor(col_eb_desc)); + Tensor img_nchw_host(make_TensorDescriptor(img_nchw_desc)); + Tensor img_nchw_device(make_TensorDescriptor(img_nchw_desc)); std::size_t num_thread = std::thread::hardware_concurrency(); @@ -339,35 +340,37 @@ int main(int argc, char* argv[]) } bool do_verification = atoi(argv[1]); - index_t nrepeat = atoi(argv[2]); + std::size_t nrepeat = atoi(argv[2]); if(do_verification) { -#if 1 - in_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread); +#if 0 + col_eb.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + img_nchw_device.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); #else - in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + img_nchw_device.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); #endif } -#if 0 - device_col2im(in_eb_desc, - in_eb, - in_nchw_desc, - in_nchw_device, - FilterSizes{}, - OutputSizes{}, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}, - nrepeat); +#if 1 + device_col2im_eb_nchw(col_eb_desc, + col_eb, + img_nchw_desc, + img_nchw_device, + FilterSizes{}, + OutputSizes{}, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}, + nrepeat); #endif if(do_verification) { - host_col2im(in_eb, - in_nchw_host, + host_col2im(col_eb, + img_nchw_host, FilterSizes{}, OutputSizes{}, ConvStrides{}, @@ -375,12 +378,12 @@ int main(int argc, char* argv[]) LeftPads{}, RightPads{}); - check_error(in_nchw_host, in_nchw_device); + check_error(img_nchw_host, img_nchw_device); -#if 1 - LogRange(std::cout << "in_eb : ", in_eb.mData, ",") << std::endl; - LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl; - LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl; +#if 0 + LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl; + LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl; + LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl; #endif } } diff --git a/driver/src/conv_driver.cpp b/driver/src/conv_driver.cpp index d5e608c3..c868f717 100644 --- a/driver/src/conv_driver.cpp +++ b/driver/src/conv_driver.cpp @@ -29,7 +29,7 @@ int main(int argc, char* argv[]) { using namespace ck; -#if 1 +#if 0 constexpr index_t N = 8; constexpr index_t C = 32; constexpr index_t HI = 28; @@ -393,7 +393,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -403,7 +403,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -445,7 +445,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, From 95febeab4a1f276ca81460aff2f9e2280882b423 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Nov 2019 
21:59:26 -0600 Subject: [PATCH 07/23] amd build --- .../include/gridwise_operation_wrapper.hpp | 2 +- driver/src/col2im_driver.cpp | 6 +- driver/src/col2im_driver.cu | 390 +----------------- 3 files changed, 4 insertions(+), 394 deletions(-) mode change 100644 => 120000 driver/src/col2im_driver.cu diff --git a/composable_kernel/include/gridwise_operation_wrapper.hpp b/composable_kernel/include/gridwise_operation_wrapper.hpp index fa4c4008..60e96264 100644 --- a/composable_kernel/include/gridwise_operation_wrapper.hpp +++ b/composable_kernel/include/gridwise_operation_wrapper.hpp @@ -2,7 +2,7 @@ #define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER template <typename GridwiseOp, typename... Xs> -__global__ void run_gridwise_operation(GridwiseOp& gridwise_op, Xs... xs) +__global__ void run_gridwise_operation(GridwiseOp gridwise_op, Xs... xs) { gridwise_op.Run(xs...); } diff --git a/driver/src/col2im_driver.cpp b/driver/src/col2im_driver.cpp index a3af08f7..2c460d6c 100644 --- a/driver/src/col2im_driver.cpp +++ b/driver/src/col2im_driver.cpp @@ -344,14 +344,13 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 1 +#if 0 col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread); #else col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); #endif } -#if 1 device_col2im_eb_nchw(col_eb_desc, col_eb, img_nchw_desc, img_nchw_device, @@ -363,7 +362,6 @@ int main(int argc, char* argv[]) LeftPads{}, RightPads{}, nrepeat); -#endif if(do_verification) { @@ -378,7 +376,7 @@ int main(int argc, char* argv[]) check_error(img_nchw_host, img_nchw_device); -#if 1 +#if 0 LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl; LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl; LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl; diff --git a/driver/src/col2im_driver.cu b/driver/src/col2im_driver.cu deleted file mode 100644 index 45f854a9..00000000 --- a/driver/src/col2im_driver.cu +++ /dev/null @@ -1,389 +0,0 @@ -#include -#include -#include -#include -#include -#include "config.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "print_array.hpp" -#include "print_sequence.hpp" -#include "device.hpp" -#include "tensor_generator.hpp" -#include "device_tensor.hpp" -#include "conv_common.hpp" -#include "host_col2im.hpp" -#include "device_col2im_eb_nchw.hpp" - -int main(int argc, char* argv[]) -{ - using namespace ck; - -#if 1 - constexpr index_t N = 2; - constexpr index_t C = 8; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 128; - constexpr index_t Y = 4; - constexpr index_t X = 4; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<1, 1>; - using RightPads = Sequence<2, 2>; -#elif 0 - // 3x3, 34x34 - constexpr index_t N = 64; - constexpr index_t C = 256; - constexpr index_t HI = 34; - constexpr index_t WI = 34; - constexpr index_t K = 128; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using 
RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 8x8 image - // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51% - constexpr index_t N = 128; - constexpr index_t C = 2048; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 7x7 image - // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64% - constexpr index_t N = 128; - constexpr index_t C = 832; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 8x8 image - // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65% - constexpr index_t N = 128; - constexpr index_t C = 1280; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 14x14 image - // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50% - constexpr index_t N = 128; - constexpr index_t C = 512; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 8x8 image - // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61% - constexpr index_t N = 64; - constexpr index_t C = 1536; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 384; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 28x28 image - // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69% - constexpr index_t N = 128; - constexpr index_t C = 256; - constexpr index_t HI = 28; - constexpr index_t WI = 28; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 7x7 image - // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62% - constexpr index_t N = 128; - constexpr index_t C = 832; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 256; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 17x17 input - // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76% - constexpr index_t N = 128; - constexpr index_t C = 768; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 
0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 14x14 image - // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64% - constexpr index_t N = 128; - constexpr index_t C = 528; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 14x14 image - // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75% - constexpr index_t N = 128; - constexpr index_t C = 528; - constexpr index_t HI = 14; - constexpr index_t WI = 14; - constexpr index_t K = 256; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 1x1 filter, 7x7 image - // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52% - constexpr index_t N = 128; - constexpr index_t C = 832; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output - // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% - constexpr index_t N = 128; - constexpr index_t C = 288; - constexpr index_t HI = 35; - constexpr index_t WI = 35; - constexpr index_t K = 384; - constexpr index_t Y = 3; - constexpr index_t X = 3; - - using ConvStrides = Sequence<2, 2>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; -#elif 0 - // 5x5 filter, 2x2 pad, 7x7 input - constexpr index_t N = 128; - constexpr index_t C = 48; - constexpr index_t HI = 7; - constexpr index_t WI = 7; - constexpr index_t K = 128; - constexpr index_t Y = 5; - constexpr index_t X = 5; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<2, 2>; - using RightPads = Sequence<2, 2>; -#elif 0 - // 7x1 filter, 3x0 pad, 17x17 input - constexpr index_t N = 128; - constexpr index_t C = 128; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 128; - constexpr index_t Y = 7; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<3, 0>; - using RightPads = Sequence<3, 0>; -#elif 1 - // 1x7 filter, 0x3 pad, 17x17 input - constexpr index_t N = 128; - constexpr index_t C = 128; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 128; - constexpr index_t Y = 1; - constexpr index_t X = 7; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - using LeftPads = Sequence<0, 3>; - using RightPads = Sequence<0, 3>; -#endif - - constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor( - img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); - - constexpr index_t HO = out_nkhw_desc.GetLengths()[2]; - constexpr index_t WO = out_nkhw_desc.GetLengths()[3]; - - constexpr auto col_eb_desc = - 
make_native_tensor_descriptor_packed(Sequence{}); - - using FilterSizes = Sequence; - using OutputSizes = Sequence; - - ostream_ConstantTensorDescriptor(col_eb_desc, std::cout << "col_eb_desc: "); - ostream_ConstantTensorDescriptor(img_nchw_desc, std::cout << "img_nchw_desc: "); - print_sequence("FilterSizes", FilterSizes{}); - print_sequence("OutputSizes", OutputSizes{}); - print_sequence("LeftPads", LeftPads{}); - print_sequence("LeftPads", LeftPads{}); - print_sequence("RightPads", RightPads{}); - print_sequence("ConvStrides", ConvStrides{}); - print_sequence("ConvDilations", ConvDilations{}); - - Tensor col_eb(make_TensorDescriptor(col_eb_desc)); - Tensor img_nchw_host(make_TensorDescriptor(img_nchw_desc)); - Tensor img_nchw_device(make_TensorDescriptor(img_nchw_desc)); - - std::size_t num_thread = std::thread::hardware_concurrency(); - - if(argc != 3) - { - printf("arg1: do_verification, arg2: nrepeat\n"); - exit(1); - } - - bool do_verification = atoi(argv[1]); - std::size_t nrepeat = atoi(argv[2]); - - if(do_verification) - { -#if 0 - col_eb.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); - img_nchw_device.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); -#else - col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - img_nchw_device.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); -#endif - } - -#if 1 - device_col2im_eb_nchw(col_eb_desc, - col_eb, - img_nchw_desc, - img_nchw_device, - FilterSizes{}, - OutputSizes{}, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}, - nrepeat); -#endif - - if(do_verification) - { - host_col2im(col_eb, - img_nchw_host, - FilterSizes{}, - OutputSizes{}, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}); - - check_error(img_nchw_host, img_nchw_device); - -#if 0 - LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl; - LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl; - LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl; -#endif - } -} diff --git a/driver/src/col2im_driver.cu b/driver/src/col2im_driver.cu new file mode 120000 index 00000000..8d388393 --- /dev/null +++ b/driver/src/col2im_driver.cu @@ -0,0 +1 @@ +col2im_driver.cpp \ No newline at end of file From e5874b3f0485dff8f7a45db663a67130954ec8ff Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Nov 2019 20:27:54 -0600 Subject: [PATCH 08/23] refactor copy and atomic --- CMakeLists.txt | 2 + .../gridwise_col2im_eb_nchw.hpp | 6 +- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 70 +++++----- .../blockwise_generic_tensor_slice_copy.hpp | 127 +++++------------- .../threadwise_generic_tensor_slice_copy.hpp | 110 +++++++++------ .../include/utility/common_header.hpp | 1 + .../include/utility/config.amd.hpp.in | 3 +- .../include/utility/config.nvidia.hpp.in | 10 +- .../include/utility/float_type.amd.hpp.in | 53 ++++++++ .../utility/in_memory_operation.nvidia.hpp.in | 60 +++++++++ driver/src/conv_driver.cpp | 2 +- 11 files changed, 276 insertions(+), 168 deletions(-) create mode 100644 composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 20fc8028..a4b01660 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,9 +55,11 @@ include_directories(BEFORE if(DEVICE_BACKEND STREQUAL "AMD") configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp") 
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp") + configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp") elseif(DEVICE_BACKEND STREQUAL "NVIDIA") configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp") configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp") + configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp") endif() add_subdirectory(driver) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp index ce5cc79d..2fbe301e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp @@ -113,7 +113,11 @@ struct GridwiseCol2Im_eb_nchw 1, 1, BlockCopyDataPerAccess_B, - BlockCopyDataPerAccess_B>( + BlockCopyDataPerAccess_B, + AddressSpace::vgpr, + AddressSpace::vgpr, + AddressSpace::global, + InMemoryDataOperation::atomic_add>( {e_block_data_on_global, b_block_data_on_global}, {e_block_data_on_global, b_block_data_on_global}); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 95fbeb29..fbdd2e44 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -107,11 +107,16 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr auto True = integral_constant{}; - constexpr auto generic_address_space = - integral_constant{}; constexpr auto global_address_space = integral_constant{}; + constexpr auto lds_address_space = integral_constant{}; + + constexpr auto vgpr_address_space = integral_constant{}; + + constexpr auto no_inmem_op = + integral_constant{}; + static_assert(ConvDirection == ConvolutionDirection::Forward || ConvDirection == ConvolutionDirection::BackwardWeight, "wrong! 
this kernel only support convolution forward and backward-weight"); @@ -230,7 +235,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 2, 3, InBlockCopySrcDataPerRead_B, - InBlockCopyDstDataPerWrite_N2>( + InBlockCopyDstDataPerWrite_N2, + AddressSpace::global, + AddressSpace::vgpr, + AddressSpace::lds, + InMemoryDataOperation::none>( {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor @@ -266,7 +275,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, 1, WeiBlockCopySrcDataPerRead_E, - WeiBlockCopyDstDataPerWrite_K>( + WeiBlockCopyDstDataPerWrite_K, + AddressSpace::global, + AddressSpace::vgpr, + AddressSpace::lds, + InMemoryDataOperation::none>( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -334,10 +347,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.Run( - p_in_global, p_in_block_double, global_address_space, generic_address_space); - blockwise_wei_copy.Run( - p_wei_global, p_wei_block_double, global_address_space, generic_address_space); + blockwise_in_copy.Run(p_in_global, p_in_block_double); + blockwise_wei_copy.Run(p_wei_global, p_wei_block_double); } // LDS double buffer: main body @@ -368,10 +379,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); - blockwise_wei_copy.RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); + blockwise_in_copy.RunLoadThreadBuffer(p_in_global, p_in_thread_buffer); + blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -397,10 +406,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS double buffer: load last data from device mem - blockwise_in_copy.RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); - blockwise_wei_copy.RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); + blockwise_in_copy.RunLoadThreadBuffer(p_in_global, p_in_thread_buffer); + blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on 2nd-last data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); @@ -474,20 +481,23 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col / N2; - ThreadwiseGenericTensorSliceCopy_v4r2::type, - 3, - 1, - 1>({0, 0, 0, 0, 0}, - {k_thread_data_on_global / K1, - k_thread_data_on_global % K1, - 0, - b_thread_data_on_global, - 0}) - .Run(p_out_thread, p_out_global, generic_address_space, global_address_space); + ThreadwiseGenericTensorSliceCopy_v4r2< + decltype(out_k0_k1_n1_b_n2_thread_desc), + decltype(out_k0_k1_n1_b_n2_global_desc), + decltype(out_k0_k1_n1_b_n2_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 5, 1>::type, + 3, + 1, + 1, + AddressSpace::vgpr, + AddressSpace::global, + InMemoryDataOperation::none>({0, 0, 0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + 0, + b_thread_data_on_global, + 0}) 
+ .Run(p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index e234558f..66b7b6a6 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -21,7 +21,11 @@ template + index_t DstDataPerAccess, + AddressSpace SrcAddressSpace = AddressSpace::generic, + AddressSpace ThreadBufferAddressSpace = AddressSpace::generic, + AddressSpace DstAddressSpace = AddressSpace::generic, + InMemoryDataOperation DstInMemOp = InMemoryDataOperation::none> struct BlockwiseGenericTensorSliceCopy_v4 { static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension(); @@ -66,130 +70,57 @@ struct BlockwiseGenericTensorSliceCopy_v4 return ThreadBufferDesc::GetElementSpace(); } - template - __device__ void - RunLoadThreadBuffer(const BlockSrcData* p_block_src, - ThreadBufferData* p_thread_buffer, - integral_constant, - integral_constant) const + template + __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, + ThreadBufferData* p_thread_buffer) const { - constexpr auto block_src_address_space = - integral_constant{}; - constexpr auto thread_buffer_address_space = - integral_constant{}; - constexpr bool has_optimized_address_calculation = decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation(); // TODO: threadwise copy is still being tweaked if(has_optimized_address_calculation) { - mThreadwiseLoad.Run_optimized_src_address_calculation( - p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); + mThreadwiseLoad.Run_optimized_src_address_calculation(p_block_src, p_thread_buffer); } else { - mThreadwiseLoad.Run( - p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); + mThreadwiseLoad.Run(p_block_src, p_thread_buffer); } } - template - __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, - ThreadBufferData* p_thread_buffer) const - { - constexpr auto generic_address_space = - integral_constant{}; - - RunLoadThreadBuffer( - p_block_src, p_thread_buffer, generic_address_space, generic_address_space); - } - - template - __device__ void - RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, - BlockDstData* p_block_dst, - integral_constant, - integral_constant) const + template + __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, + BlockDstData* p_block_dst) const { - constexpr auto thread_buffer_address_space = - integral_constant{}; - constexpr auto block_dst_address_space = - integral_constant{}; - constexpr bool has_optimized_address_calculation = decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation(); // TODO: threadwise copy is still being tweaked if(has_optimized_address_calculation) { - mThreadwiseStore.Run_optimized_dst_address_calculation( - p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); + mThreadwiseStore.Run_optimized_dst_address_calculation(p_thread_buffer, p_block_dst); } else { -#if 0 // debug - mThreadwiseStore.Run( - p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); -#else - constexpr auto True = integral_constant{}; - - mThreadwiseStore.Run(p_thread_buffer, - p_block_dst, - thread_buffer_address_space, - block_dst_address_space, - True); -#endif + mThreadwiseStore.Run(p_thread_buffer, 
p_block_dst); } } - template - __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, - BlockDstData* p_block_dst) const + template + __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const { - constexpr auto generic_address_space = - integral_constant{}; - - RunStoreThreadBuffer( - p_thread_buffer, p_block_dst, generic_address_space, generic_address_space); - } + static_assert(ThreadBufferAddressSpace == AddressSpace::vgpr, + "wrong! This function use vgpr as its thread " + "buffer. However, you have set RunLoadThreadBuffer and RunStoreThreadBuffer " + "to use ThreadBufferAddressSpace as their thread buffer, which is not vgpr. " + "Behavior may be different"); - template - __device__ void - Run(const BlockSrcData* p_block_src, - BlockDstData* p_block_dst, - integral_constant block_src_address_space, - integral_constant block_dst_address_space) const - { BlockSrcData p_thread_buffer[GetThreadBufferSize()]; - constexpr auto generic_address_space = - integral_constant{}; - - RunLoadThreadBuffer( - p_block_src, p_thread_buffer, block_src_address_space, generic_address_space); + RunLoadThreadBuffer(p_block_src, p_thread_buffer); // if there is type conversion, it's done during store - RunStoreThreadBuffer( - p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space); - } - - template - __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const - { - constexpr auto generic_address_space = - integral_constant{}; - - Run(p_block_src, p_block_dst, generic_address_space, generic_address_space); + RunStoreThreadBuffer(p_thread_buffer, p_block_dst); } template @@ -217,7 +148,10 @@ struct BlockwiseGenericTensorSliceCopy_v4 SrcDimAccessOrder, SrcVectorAccessDim, SrcDataPerAccess, - 1>; + 1, + SrcAddressSpace, + ThreadBufferAddressSpace, + InMemoryDataOperation::none>; using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2; + DstDataPerAccess, + ThreadBufferAddressSpace, + DstAddressSpace, + DstInMemOp>; ThreadwiseLoad mThreadwiseLoad; ThreadwiseStore mThreadwiseStore; diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index ab183ca3..8d770c43 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -21,7 +21,10 @@ template + index_t DstDataPerAccess, + AddressSpace SrcAddressSpace = AddressSpace::generic, + AddressSpace DstAddressSpace = AddressSpace::generic, + InMemoryDataOperation DstInMemOp = InMemoryDataOperation::none> struct ThreadwiseGenericTensorSliceCopy_v4r2 { static constexpr index_t nDim = SliceLengths::Size(); @@ -66,17 +69,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. 
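// A note on the shape of the refactor above: BlockwiseGenericTensorSliceCopy_v4::Run()
// is now a fixed two-stage pipeline -- ThreadwiseLoad fills a thread-private (vgpr)
// buffer, ThreadwiseStore drains it, and DstInMemOp is applied only on the way out.
// A minimal sketch of that pattern, with simplified, illustrative types (Load, Store
// and BufSize are placeholders, not the real CK signatures):
//
//     template <typename Load, typename Store, typename T, index_t BufSize>
//     __device__ void staged_copy(const Load& load, const Store& store,
//                                 const T* p_src, T* p_dst)
//     {
//         T p_buffer[BufSize];         // thread-private staging buffer (vgpr)
//         load.Run(p_src, p_buffer);   // e.g. global -> vgpr
//         store.Run(p_buffer, p_dst);  // e.g. vgpr -> lds, or atomic_add to global
//     }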
- template - __device__ void Run(const SrcData* p_src, - DstData* p_dst, - integral_constant, - integral_constant, - integral_constant do_atomic_add = - integral_constant{}) const + template + __device__ void Run(const SrcData* p_src, DstData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; @@ -123,6 +117,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // has the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { +#if 0 // debug static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING *reinterpret_cast(&p_src_long_vector[buffer_offset]) = @@ -137,6 +132,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 *reinterpret_cast(&p_src_long_vector[buffer_offset]) = *reinterpret_cast(&p_src[src_coord.GetOffset()]); }); +#else + move_data( + p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset); +#endif } } @@ -163,6 +166,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { +#if 0 static_if{}([&](auto) { static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING @@ -185,20 +189,19 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 reinterpret_cast(&p_dst[dst_coord.GetOffset()]), *reinterpret_cast(&p_dst_long_vector[buffer_offset])); }); +#else + move_data( + p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset()); +#endif } } }); } - template - __device__ void Run(const SrcData* p_src, DstData* p_dst) const - { - constexpr auto generic_address_space = - integral_constant{}; - - Run(p_src, p_dst, generic_address_space, generic_address_space); - } - // Modify Length to 1, if Mask is set to false // Used for isolating linear dimension from non-linear dimensions template @@ -214,15 +217,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on dst data: No write if dst data is in paddin area. // This version is optimized for address calculation of src tensor // TODO: this function is not compiled to expected ISA - template - __device__ void - Run_optimized_src_address_calculation(const SrcData* p_src, - DstData* p_dst, - integral_constant, - integral_constant) const + template + __device__ void Run_optimized_src_address_calculation(const SrcData* p_src, + DstData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; @@ -317,6 +314,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the src vector has the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { +#if 0 // debug static_if{}([&](auto) { #if CK_USE_AMD_BUFFER_ADDRESSING *reinterpret_cast(&p_src_long_vector[buffer_offset]) = @@ -332,6 +330,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 *reinterpret_cast( &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]); }); +#else + move_data(p_src, + src_nonlinear_coord.GetOffset() + + src_linear_offset, + p_src_long_vector, + buffer_offset); +#endif } } @@ -361,8 +370,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the dst vector has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { +#if 0 // debug *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = *reinterpret_cast(&p_dst_long_vector[buffer_offset]); +#else + move_data( + p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset()); +#endif } } }); @@ -376,15 +394,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on dst data: No write if dst data is in paddin area. 
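// The move_data() calls introduced above centralize the choice of destination
// operation at compile time, replacing the per-call-site static_if blocks. A sketch
// of the dispatch it performs, written with C++17 if-constexpr instead of CK's
// static_if for brevity (parameter names are illustrative, not the exact CK API):
//
//     template <typename T, index_t DataPerAccess, AddressSpace SrcAddrSpace,
//               AddressSpace DstAddrSpace, InMemoryDataOperation DstInMemOp>
//     __device__ void move_data_sketch(const T* p_src, index_t src_off,
//                                      T* p_dst, index_t dst_off)
//     {
//         if constexpr(DstInMemOp == InMemoryDataOperation::none)
//             copy_data<T, DataPerAccess, SrcAddrSpace, DstAddrSpace>(
//                 p_src, src_off, p_dst, dst_off);
//         else // InMemoryDataOperation::atomic_add
//             atomic_add_data<T, DataPerAccess, SrcAddrSpace, DstAddrSpace>(
//                 p_src, src_off, p_dst, dst_off);
//     }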
// This version is optimized for address calculation of dst tensor
     // TODO: this function is not compiled to expected ISA
-    template
-    __device__ void
-    Run_optimized_dst_address_calculation(const SrcData* p_src,
-                                          DstData* p_dst,
-                                          integral_constant,
-                                          integral_constant) const
+    template
+    __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
+                                                          DstData* p_dst) const
     {
         using src_vector_t = typename vector_type::MemoryType;
         using dst_vector_t = typename vector_type::MemoryType;
@@ -470,8 +482,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // the src vector has the same padding situation
                     if(src_coord.IsUpperIndexMappedToValidOffset())
                     {
+#if 0
                         *reinterpret_cast(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast(&p_src[src_coord.GetOffset()]);
+#else
+                        move_data(
+                            p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
+#endif
                     }
                 }
@@ -510,6 +531,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // the dst vector has the same padding situation
                     if(dst_coord.IsUpperIndexMappedToValidOffset())
                     {
+#if 0
                         static_if{}([&](auto) {
#if CK_USE_AMD_BUFFER_ADDRESSING
                             amd_intrinsic_buffer_store(
                                 *reinterpret_cast(&p_dst_long_vector[buffer_offset]),
                                 p_dst,
                                 dst_nonlinear_coord.GetOffset(),
                                 dst_linear_offset);
#else
                             *reinterpret_cast(
                                 &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
                                 *reinterpret_cast(&p_dst_long_vector[buffer_offset]);
#endif
                         }).Else([&](auto) {
                             *reinterpret_cast(
                                 &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
                                 *reinterpret_cast(&p_dst_long_vector[buffer_offset]);
                         });
+#else
+                        move_data(p_dst_long_vector,
+                                  buffer_offset,
+                                  p_dst,
+                                  dst_nonlinear_coord.GetOffset() + dst_linear_offset);
+#endif
                     }
                 }
             });
diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp
index 00964e0f..45750bbe 100644
--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
@@ -15,6 +15,7 @@
 #include "functional2.hpp"
 #include "functional3.hpp"
 #include "functional4.hpp"
+#include "in_memory_operation.hpp"
 
 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in
index 896679c8..0fe0eb2a 100644
--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
@@ -54,7 +54,8 @@ namespace ck {
 enum AddressSpace
 {
     generic,
-    global
+    global,
+    vgpr
 };
 
 #if CK_UNSIGNED_INDEX_TYPE
diff --git a/composable_kernel/include/utility/config.nvidia.hpp.in b/composable_kernel/include/utility/config.nvidia.hpp.in
index 7c549cda..08757e0a 100644
--- a/composable_kernel/include/utility/config.nvidia.hpp.in
+++ b/composable_kernel/include/utility/config.nvidia.hpp.in
@@ -33,7 +33,15 @@ namespace ck {
 enum AddressSpace
 {
     generic,
-    global = generic
+    global,
+    lds,
+    vgpr
+};
+
+enum InMemoryDataOperation
+{
+    none,
+    atomic_add
 };
 
 #if CK_UNSIGNED_INDEX_TYPE
diff --git a/composable_kernel/include/utility/float_type.amd.hpp.in b/composable_kernel/include/utility/float_type.amd.hpp.in
index fd9c0029..f3834c47 100644
--- a/composable_kernel/include/utility/float_type.amd.hpp.in
+++ b/composable_kernel/include/utility/float_type.amd.hpp.in
@@ -307,5 +307,58 @@ struct inner_product_with_conversion
     }
 };
 
+template
+__device__ void move_data(const float* p_src,
+                          index_t src_offset,
+                          float* p_dst,
+                          index_t dst_offset,
+                          integral_constant,
+                          integral_constant src_address_space,
+                          integral_constant dst_address_space)
+{
+    // TODO: use static_if::ElseIf
+    static_if{}([&](auto) {
+        copy_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space);
+    });
+
+    static_if{}([&](auto) {
+        atomic_add_data(p_src, src_offset, p_dst, 
dst_offset, src_address_space, dst_address_space);
+    });
+}
+
+template
+__device__ void copy_data(const float* p_src,
+                          index_t src_offset,
+                          float* p_dst,
+                          index_t dst_offset,
+                          integral_constant,
+                          integral_constant)
+{
+    static_if{}(
+        [&](auto fwd) {
+#if CK_USE_AMD_BUFFER_ADDRESSING
+            amd_intrinsic_buffer_store(p_src[src_offset], fwd(p_dst), dst_offset, 0);
+#else
+            p_dst[dst_offset] = p_src[src_offset];
+#endif
+        })
+        .Else([&](auto) { p_dst[dst_offset] = p_src[src_offset]; });
+}
+
+template
+__device__ void atomic_add_data(const float* p_src,
+                                index_t src_offset,
+                                float* p_dst,
+                                index_t dst_offset,
+                                integral_constant,
+                                integral_constant)
+{
+    static_if{}(
+        [&](auto fwd) { atomicAdd(&(p_dst[dst_offset]), p_src[src_offset]); })
+        .Else([&](auto fwd) {
+            static_assert(fwd(false), "atomic_add doesn't support this memory space");
+        });
+}
+
 } // namespace ck
 #endif
diff --git a/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in b/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in
new file mode 100644
index 00000000..d67059df
--- /dev/null
+++ b/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in
@@ -0,0 +1,60 @@
+#ifndef CK_IN_MEMORY_OPERATION_NVIDIA_HPP
+#define CK_IN_MEMORY_OPERATION_NVIDIA_HPP
+
+namespace ck {
+
+template
+__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
+{
+    using vector_t = typename vector_type::MemoryType;
+
+    *reinterpret_cast(&p_dst[dst_offset]) =
+        *reinterpret_cast(&p_src[src_offset]);
+}
+
+template
+__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
+{
+    using vector_t = typename vector_type::MemoryType;
+
+    static_if{}(
+        [&](auto) {
+            atomicAdd(reinterpret_cast(&p_dst[dst_offset]),
+                      *reinterpret_cast(&p_src[src_offset]));
+        })
+        .Else([&](auto fwd) {
+            static_assert(fwd(false), "atomic_add doesn't support this memory space");
+        });
+}
+
+template
+__device__ void move_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
+{
+    static_assert(DstInMemOp == InMemoryDataOperation::none ||
+                      DstInMemOp == InMemoryDataOperation::atomic_add,
+                  "wrong! 
InMemoryDataOperation not supported!"); + + // TODO: use static_if::ElseIf + static_if{}([&](auto) { + copy_data( + p_src, src_offset, p_dst, dst_offset); + }); + + static_if{}([&](auto) { + atomic_add_data( + p_src, src_offset, p_dst, dst_offset); + }); +} + +} // namespace ck +#endif diff --git a/driver/src/conv_driver.cpp b/driver/src/conv_driver.cpp index c868f717..1e3eb518 100644 --- a/driver/src/conv_driver.cpp +++ b/driver/src/conv_driver.cpp @@ -403,7 +403,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, From ff2c373b7a9ea6d9ee36710611961ba237a4a1d0 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Nov 2019 21:25:44 -0600 Subject: [PATCH 09/23] amd build --- .../ConstantMatrixDescriptor.hpp | 2 +- .../tensor_description/tensor_coordinate.hpp | 4 +- .../tensor_coordinate_deprecated.hpp | 4 +- .../tensor_descriptor_helper.hpp | 12 +-- .../threadwise_generic_tensor_slice_copy.hpp | 30 +++---- .../include/utility/config.amd.hpp.in | 7 ++ .../include/utility/float_type.amd.hpp.in | 53 ------------- .../include/utility/functional.hpp | 6 +- .../utility/in_memory_operation.amd.hpp.in | 78 +++++++++++++++++++ .../utility/in_memory_operation.nvidia.hpp.in | 15 ++-- 10 files changed, 117 insertions(+), 94 deletions(-) create mode 100644 composable_kernel/include/utility/in_memory_operation.amd.hpp.in diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index e2a5836e..0ebd9dc4 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -60,7 +60,7 @@ __host__ __device__ constexpr auto template __host__ __device__ constexpr auto -make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) + make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) { using TDesc = ConstantTensorDescriptor_deprecated; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 4e5c5cc8..66dda13c 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -228,7 +228,7 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -236,7 +236,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index da02abdd..69659445 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - 
MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { return MergedTensorCoordinate_deprecated< ConstantMergedTensorDescriptor_deprecated>(); diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 1597e4c5..d7ef3867 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -64,10 +64,10 @@ template __host__ __device__ constexpr auto -reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, - Sequence, - Sequence, - Sequence) + reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) { return TransformedTensorDescriptor...>, @@ -78,7 +78,7 @@ reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, // reorder a NativeTensorDescriptor template __host__ __device__ constexpr auto -reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) + reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); @@ -96,7 +96,7 @@ reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLo // reorder a TransformedTensorDescriptor template __host__ __device__ constexpr auto -reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) + reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! 
MapLower2Upper is not a valid map"); diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 8d770c43..c8f9352b 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -166,28 +166,22 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { -#if 0 - static_if{}([&](auto) { - static_if{}([&](auto fwd) { +#if 0 // debug + static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING - amd_intrinsic_buffer_store( - *reinterpret_cast(&p_dst_long_vector[buffer_offset]), - fwd(p_dst), - dst_coord.GetOffset(), - 0); + amd_intrinsic_buffer_store( + *reinterpret_cast(&p_dst_long_vector[buffer_offset]), + fwd(p_dst), + dst_coord.GetOffset(), + 0); #else - *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); + *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = + *reinterpret_cast(&p_dst_long_vector[buffer_offset]); #endif - }).Else([&](auto) { - // dst can be all kinds of memory-space - *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); - }); }).Else([&](auto) { - atomicAdd( - reinterpret_cast(&p_dst[dst_coord.GetOffset()]), - *reinterpret_cast(&p_dst_long_vector[buffer_offset])); + // dst can be all kinds of memory-space + *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = + *reinterpret_cast(&p_dst_long_vector[buffer_offset]); }); #else move_data -void move_data(const float* p_src, - index_t src_offset, - float* p_dst, - dst_offset, - integral_constant, - integral_constant src_address_space, - integral_constant dst_address_space) -{ - // TODO: use static_if::ElseIf - static_if{}([&](auto) { - copy_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space); - }); - - static_if{}([&](auto) { - atomic_add_data(p_src, src_offset, p_dst, dst_offset, src_address_space, dst_address_space); - }); -} - -template -void copy_data(const float* p_src, - index_t src_offset, - float* p_dst, - dst_offset, - integral_constant, - integral_constant) -{ - static_if{}( - [&](auto fwd) { -#if CK_USE_AMD_BUFFER_ADDRESSING - amd_intrinsic_buffer_store(p_src[src_offset], fwd(p_dst), dst_offset, 0); -#else - p_dst[dst_offset] = p_src[src_offset]; -#endif - }) - .Else([&](auto) { p_dst[dst_offset] = p_src[src_offset]; }); -} - -template -void atomic_add_data(const float* p_src, - index_t src_offset, - float* p_dst, - dst_offset, - integral_constant, - integral_constant) -{ - static_if{}( - [&](auto fwd) { atomicAdd(&(p_dst[dst_offset]), p_src[src_offset]); }) - .Else([&](auto fwd) { - static_assert(fwd(false), "atomic_add doesn't support this memory space"); - }); -} - } // namespace ck #endif diff --git a/composable_kernel/include/utility/functional.hpp b/composable_kernel/include/utility/functional.hpp index c4980082..479f41a7 100644 --- a/composable_kernel/include/utility/functional.hpp +++ b/composable_kernel/include/utility/functional.hpp @@ -64,9 +64,8 @@ struct static_if } template - __host__ __device__ static constexpr auto Else(F) + __host__ __device__ static void Else(F) { - return Type{}; } }; @@ -82,14 +81,13 @@ struct static_if } template - __host__ __device__ static constexpr auto Else(F f) + __host__ __device__ static void Else(F f) { // This is 
a trick for compiler: // Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it, // this will make "f" a generic lambda, so that "f" won't be compiled until being // instantiated here f(forwarder{}); - return Type{}; } }; diff --git a/composable_kernel/include/utility/in_memory_operation.amd.hpp.in b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in new file mode 100644 index 00000000..58bd1ee0 --- /dev/null +++ b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in @@ -0,0 +1,78 @@ +#ifndef CK_IN_MEMORY_OPERATION_AMD_HPP +#define CK_IN_MEMORY_OPERATION_AMD_HPP + +#include "float_type.hpp" +#include "amd_buffer_addressing.hpp" + +namespace ck { + +template +__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) +{ + using vector_t = typename vector_type::MemoryType; + +#if CK_USE_AMD_BUFFER_ADDRESSING + // TODO: use static_if::ElseIf + static_if{}([&](auto) { + *reinterpret_cast(&p_dst[dst_offset]) = + amd_intrinsic_buffer_load(p_src, src_offset, 0); + }).Else([&](auto) { + static_if{}([&](auto) { + amd_intrinsic_buffer_store( + *reinterpret_cast(&p_src[src_offset]), p_dst, dst_offset, 0); + }).Else([&](auto) { + *reinterpret_cast(&p_dst[dst_offset]) = + *reinterpret_cast(&p_src[src_offset]); + }); + }); +#else + *reinterpret_cast(&p_dst[dst_offset]) = + *reinterpret_cast(&p_src[src_offset]); +#endif +} + +template +__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) +{ + using vector_t = typename vector_type::MemoryType; + + static_if{}([&](auto) { + atomicAdd(reinterpret_cast(&p_dst[dst_offset]), + *reinterpret_cast(&p_src[src_offset])); + }).Else([&](auto fwd) { + static_assert(fwd(false), "atomic_add doesn't support this memory space"); + }); +} + +template +__device__ void move_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) +{ + static_assert(DstInMemOp == InMemoryDataOperation::none || + DstInMemOp == InMemoryDataOperation::atomic_add, + "wrong! 
InMemoryDataOperation not supported!");
+
+    // TODO: use static_if::ElseIf
+    static_if{}([&](auto) {
+        copy_data(
+            p_src, src_offset, p_dst, dst_offset);
+    });
+
+    static_if{}([&](auto) {
+        atomic_add_data(
+            p_src, src_offset, p_dst, dst_offset);
+    });
+}
+
+} // namespace ck
+#endif
diff --git a/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in b/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in
index d67059df..b9f516e4 100644
--- a/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in
+++ b/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in
@@ -23,14 +23,13 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, in
 {
     using vector_t = typename vector_type::MemoryType;
 
-    static_if{}(
-        [&](auto) {
-            atomicAdd(reinterpret_cast(&p_dst[dst_offset]),
-                      *reinterpret_cast(&p_src[src_offset]));
-        })
-        .Else([&](auto fwd) {
-            static_assert(fwd(false), "atomic_add doesn't support this memory space");
-        });
+    static_if{}([&](auto) {
+        atomicAdd(reinterpret_cast(&p_dst[dst_offset]),
+                  *reinterpret_cast(&p_src[src_offset]));
+    }).Else([&](auto fwd) {
+        static_assert(fwd(false), "atomic_add doesn't support this memory space");
+    });
 }
 
 template 
Date: Mon, 18 Nov 2019 21:46:51 -0600
Subject: [PATCH 10/23] fix issue: unnecessary sgpr usage due to kernel wrapper
 passing unnecessary argument

---
 composable_kernel/include/gridwise_operation_wrapper.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/composable_kernel/include/gridwise_operation_wrapper.hpp b/composable_kernel/include/gridwise_operation_wrapper.hpp
index 60e96264..9c99ee35 100644
--- a/composable_kernel/include/gridwise_operation_wrapper.hpp
+++ b/composable_kernel/include/gridwise_operation_wrapper.hpp
@@ -2,9 +2,9 @@
 #define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
 
 template
-__global__ void run_gridwise_operation(GridwiseOp gridwise_op, Xs... xs)
+__global__ void run_gridwise_operation(GridwiseOp, Xs... xs)
 {
-    gridwise_op.Run(xs...);
+    GridwiseOp{}.Run(xs...);
 }
 
 #endif

From 3b3b96233b9d28ee3aba812e39316f235442272b Mon Sep 17 00:00:00 2001
From: Chao Liu 
Date: Mon, 18 Nov 2019 22:03:07 -0600
Subject: [PATCH 11/23] clean up

---
 .../threadwise_generic_tensor_slice_copy.hpp  | 90 -------------------
 .../include/utility/amd_buffer_addressing.hpp |  8 ++
 .../utility/in_memory_operation.amd.hpp.in    | 10 ++-
 3 files changed, 17 insertions(+), 91 deletions(-)

diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index c8f9352b..6fc5632b 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -117,29 +117,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // has the same padding situation
                     if(src_coord.IsUpperIndexMappedToValidOffset())
                     {
-#if 0 // debug
-                        static_if{}([&](auto fwd) {
-#if CK_USE_AMD_BUFFER_ADDRESSING
-                            *reinterpret_cast(&p_src_long_vector[buffer_offset]) =
-                                amd_intrinsic_buffer_load(
-                                    fwd(p_src), src_coord.GetOffset(), 0);
-#else
-                            *reinterpret_cast(&p_src_long_vector[buffer_offset]) =
-                                *reinterpret_cast(&p_src[src_coord.GetOffset()]);
-#endif
-                        }).Else([&](auto) {
-                            // src can be all kinds of memory-space. 
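// Context for the PATCH 10 change above: a kernel argument can be materialized in
// sgprs at launch, so passing the gridwise functor by value costs sgprs even though
// the object is an empty struct; taking it as an unnamed type-only parameter and
// default-constructing it inside the kernel removes that cost (per the commit
// message). A side-by-side sketch, with illustrative names:
//
//     template <typename GridwiseOp, typename... Xs>
//     __global__ void run_by_value(GridwiseOp op, Xs... xs) { op.Run(xs...); }       // op is a live kernel arg
//
//     template <typename GridwiseOp, typename... Xs>
//     __global__ void run_by_type(GridwiseOp, Xs... xs) { GridwiseOp{}.Run(xs...); } // no state passed in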
- *reinterpret_cast(&p_src_long_vector[buffer_offset]) = - *reinterpret_cast(&p_src[src_coord.GetOffset()]); - }); -#else move_data( p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset); -#endif } } @@ -166,31 +149,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { -#if 0 // debug - static_if{}([&](auto fwd) { -#if CK_USE_AMD_BUFFER_ADDRESSING - amd_intrinsic_buffer_store( - *reinterpret_cast(&p_dst_long_vector[buffer_offset]), - fwd(p_dst), - dst_coord.GetOffset(), - 0); -#else - *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); -#endif - }).Else([&](auto) { - // dst can be all kinds of memory-space - *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); - }); -#else move_data( p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset()); -#endif } } }); @@ -204,9 +168,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 return Sequence<(Mask ? Lengths : 1)...>{}; } - // p_src must be global-memory, p_dst can be any memory-space. - // User should make sure p_src is a block-invariant pointer, because - // buffer_load is used for loading from global-memory into register buffer. // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. // This version is optimized for address calculation of src tensor @@ -308,23 +269,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the src vector has the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { -#if 0 // debug - static_if{}([&](auto) { -#if CK_USE_AMD_BUFFER_ADDRESSING - *reinterpret_cast(&p_src_long_vector[buffer_offset]) = - amd_intrinsic_buffer_load( - p_src, src_nonlinear_coord.GetOffset(), src_linear_offset); -#else - *reinterpret_cast(&p_src_long_vector[buffer_offset]) = - *reinterpret_cast( - &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]); -#endif - }).Else([&](auto) { - *reinterpret_cast(&p_src_long_vector[buffer_offset]) = - *reinterpret_cast( - &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]); - }); -#else move_data(&p_dst[dst_coord.GetOffset()]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); -#else move_data( p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset()); -#endif } } }); }); } - // p_src could be any memory space, d_dst must be global memory. - // User should make sure p_dst is a block-invariant pointer, because - // buffer_load is used for storing data from regsiter buffer into global-memory. // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. 
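// For the "optimized address calculation" below: the slice dimensions are split into
// non-linear ones, whose offset needs the full (merged/embedded) coordinate transform,
// and linear ones, whose contribution is a plain additive offset. The expensive
// transform then runs once per non-linear step and each access only adds an integer,
// which is exactly the p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]
// pattern used in this file. Schematic (illustrative, not the real CK code):
//
//     index_t base = dst_nonlinear_coord.GetOffset(); // costly, once per non-linear step
//     index_t off  = base + dst_linear_offset;        // cheap, once per access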
// This version is optimized for address calculation of dst tensor @@ -476,17 +411,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the src vector has the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { -#if 0 - *reinterpret_cast(&p_src_long_vector[buffer_offset]) = - *reinterpret_cast(&p_src[src_coord.GetOffset()]); -#else move_data( p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset); -#endif } } @@ -525,25 +455,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the dst vector has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { -#if 0 - static_if{}([&](auto) { -#if CK_USE_AMD_BUFFER_ADDRESSING - amd_intrinsic_buffer_store( - *reinterpret_cast(&p_dst_long_vector[buffer_offset]), - p_dst, - dst_nonlinear_coord.GetOffset(), - dst_linear_offset); -#else - *reinterpret_cast( - &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); -#endif - }).Else([&](auto) { - *reinterpret_cast( - &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); - }); -#else move_data __device__ typename vector_type::MemoryType amd_intrinsic_buffer_load( const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset); +// buffer_store requires: +// 1) p_src must be in vgpr space, d_dst must be global memory +// 2) p_dst to be a block-invariant pointer. +// It is user's responsibility to make sure that is true. template __device__ void amd_intrinsic_buffer_store(const typename vector_type::MemoryType& src, diff --git a/composable_kernel/include/utility/in_memory_operation.amd.hpp.in b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in index 58bd1ee0..a31da480 100644 --- a/composable_kernel/include/utility/in_memory_operation.amd.hpp.in +++ b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in @@ -15,12 +15,20 @@ __device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t using vector_t = typename vector_type::MemoryType; #if CK_USE_AMD_BUFFER_ADDRESSING - // TODO: use static_if::ElseIf + // TODO: use static_if::ElseIf, instead of nested static_if static_if{}([&](auto) { + // buffer_load requires: + // 1) p_src must be in global memory space, d_dst must be vgpr + // 2) p_src to be a block-invariant pointer. + // It is user's responsibility to make sure that is true. *reinterpret_cast(&p_dst[dst_offset]) = amd_intrinsic_buffer_load(p_src, src_offset, 0); }).Else([&](auto) { static_if{}([&](auto) { + // buffer_store requires: + // 1) p_src must be in vgpr space, d_dst must be global memory + // 2) p_dst to be a block-invariant pointer. + // It is user's responsibility to make sure that is true. 
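+            // A hedged illustration of the block-invariance requirement above: the
+            // base pointer becomes part of the buffer resource descriptor held in
+            // sgprs, so it must be uniform across the wavefront, and all per-thread
+            // variation must travel through the offset argument (names illustrative):
+            //
+            //     const float* p_base = p_global + block_offset; // uniform per workgroup
+            //     amd_intrinsic_buffer_store(src_val, p_base, thread_offset, 0);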
amd_intrinsic_buffer_store( *reinterpret_cast(&p_src[src_offset]), p_dst, dst_offset, 0); }).Else([&](auto) { From d78fe365f3bfeea608cbf3f7c4bece59d4f44807 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 19 Nov 2019 01:46:36 -0600 Subject: [PATCH 12/23] initial impl of bwd data --- ...mm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 401 ++++++++++++++++++ ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 28 +- .../ConstantMatrixDescriptor.hpp | 2 +- .../tensor_description/tensor_coordinate.hpp | 4 +- .../tensor_coordinate_deprecated.hpp | 4 +- .../tensor_descriptor_helper.hpp | 12 +- .../threadwise_generic_tensor_slice_copy.hpp | 9 - .../utility/in_memory_operation.amd.hpp.in | 15 +- .../utility/in_memory_operation.nvidia.hpp.in | 15 +- driver/CMakeLists.txt | 4 + ...d_data_implicit_gemm_v1_nchw_kcyx_nkhw.hpp | 145 +++++++ driver/include/host_conv_bwd_data.hpp | 71 ++++ driver/src/conv_bwd_data_driver.cpp | 379 +++++++++++++++++ driver/src/conv_bwd_data_driver.cu | 1 + 14 files changed, 1037 insertions(+), 53 deletions(-) create mode 100644 composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp create mode 100644 driver/include/device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw.hpp create mode 100644 driver/include/host_conv_bwd_data.hpp create mode 100644 driver/src/conv_bwd_data_driver.cpp create mode 120000 driver/src/conv_bwd_data_driver.cu diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp new file mode 100644 index 00000000..92606525 --- /dev/null +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -0,0 +1,401 @@ +#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "ConstantMatrixDescriptor.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" +#include "blockwise_gemm.hpp" + +namespace ck { + +template +struct GridwiseConvolutionBackwardDataImplicitGemm_v1_nchw_kcyx_nkhw_lds_double_buffer +{ + __device__ void Run(Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + const Float* const __restrict__ p_out_global) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto True = integral_constant{}; + + constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; + + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLengths()[0]; + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLengths()[1]; + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2]; + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3]; + + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLengths()[1]; + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2]; + constexpr index_t Wo = 
out_n_k_ho_wo_global_desc.GetLengths()[3];
+
+        constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2];
+        constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3];
+
+        constexpr index_t ConvStrideH = ConvStrides{}[0];
+        constexpr index_t ConvStrideW = ConvStrides{}[1];
+
+        constexpr index_t ConvDilationH = ConvDilations{}[0];
+        constexpr index_t ConvDilationW = ConvDilations{}[1];
+
+        constexpr index_t E = C * Y * X;
+        constexpr index_t B = N * Ho * Wo;
+
+        // sanity-check for vectorized memory access
+        static_assert((Wo == 1 || (ConvStrideW == 1 || InThreadCopyDataPerAccess_B == 1)) &&
+                          (X == 1 || ConvDilationW % InThreadCopyDataPerAccess_B == 0),
+                      "wrong! alignment requirement for vectorized global access of input tensor "
+                      "will be violated");
+
+        // lds max alignment
+        constexpr index_t max_lds_align = math::lcm(WeiBlockCopyDataPerAccess_E,
+                                                    OutBlockCopyDataPerAccess_B,
+                                                    GemmDataPerReadA,
+                                                    GemmDataPerReadB);
+
+        // divide block work by [E, B]
+        static_assert(E % EPerBlock == 0 && B % BPerBlock == 0 && K % KPerBlock == 0,
+                      "wrong! cannot divide work evenly among block");
+
+        constexpr index_t EBlockWork = E / EPerBlock;
+        constexpr index_t BBlockWork = B / BPerBlock;
+
+        constexpr auto block_work_desc =
+            make_cluster_descriptor(Sequence{});
+
+        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
+
+        const index_t e_block_data_on_global = block_work_id[0] * EPerBlock;
+        const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;
+
+        // output tensor
+        // global tensor in global memory
+        constexpr auto out_n_k_howo_global_desc =
+            unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3);
+
+        // global tensor in global memory, src of blockwise copy
+        constexpr auto out_k_b_global_desc =
+            transform_tensor_descriptor(out_n_k_howo_global_desc,
+                                        make_tuple(PassThrough{}, Merge>{}),
+                                        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        // block tensor in LDS memory, dst of blockwise copy
+        // be careful of LDS alignment
+        constexpr auto out_k_b_block_desc = make_native_tensor_descriptor_aligned(
+            Sequence{}, Number{});
+
+        // output tensor blockwise copy
+        auto blockwise_out_copy =
+            BlockwiseGenericTensorSliceCopy_v4,
+                                              Sequence<0, 1>,
+                                              Sequence<0, 1>,
+                                              1,
+                                              1,
+                                              OutBlockCopyDataPerAccess_B,
+                                              OutBlockCopyDataPerAccess_B,
+                                              AddressSpace::global,
+                                              AddressSpace::vgpr,
+                                              AddressSpace::lds,
+                                              InMemoryDataOperation::none>(
+                {0, b_block_data_on_global}, {0, 0});
+
+        // weight tensor
+        // global tensor in global memory, src of blockwise copy
+        constexpr auto wei_k_e_global_desc =
+            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);
+
+        // block tensor in LDS memory, dst of blockwise copy
+        // be careful of LDS alignment
+        constexpr auto wei_k_e_block_desc = make_native_tensor_descriptor_aligned(
+            Sequence{}, Number{});
+
+        // weight tensor blockwise copy
+        auto blockwise_wei_copy =
+            BlockwiseGenericTensorSliceCopy_v4,
+                                              Sequence<0, 1>,
+                                              Sequence<0, 1>,
+                                              1,
+                                              1,
+                                              WeiBlockCopyDataPerAccess_E,
+                                              WeiBlockCopyDataPerAccess_E,
+                                              AddressSpace::global,
+                                              AddressSpace::vgpr,
+                                              AddressSpace::lds,
+                                              InMemoryDataOperation::none>(
+                {0, e_block_data_on_global}, {0, 0});
+
+        // GEMM definition
+        //   c_mtx += transpose(a_mtx) * b_mtx
+        //     a_mtx[KPerBlock, EPerBlock] is in LDS
+        //     b_mtx[KPerBlock, BPerBlock] is in LDS
+        //     c_mtx[EPerBlock, BPerBlock] is distributed among threads, and saved in
+        //   
register
+        constexpr auto a_k_e_block_mtx_desc = make_ConstantMatrixDescriptor(wei_k_e_block_desc);
+        constexpr auto b_k_b_block_mtx_desc = make_ConstantMatrixDescriptor(out_k_b_block_desc);
+
+        // sanity check
+        static_assert(
+            EPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == 0 &&
+                BPerBlock % (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0,
+            "wrong!");
+
+        constexpr index_t GemmMRepeat =
+            EPerBlock / (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster);
+
+        constexpr index_t GemmNRepeat =
+            BPerBlock / (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster);
+
+        // c_thread_mtx definition: this is a mess
+        // TODO: find a more elegant way of defining c_thread_mtx
+        constexpr auto c_e0e1_b0b1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
+            Number{}, Number{});
+
+        const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
+            BlockSize,
+            decltype(a_k_e_block_mtx_desc),
+            decltype(b_k_b_block_mtx_desc),
+            decltype(c_e0e1_b0b1_thread_mtx_desc),
+            GemmMPerThreadSubC,
+            GemmNPerThreadSubC,
+            GemmMLevel0Cluster,
+            GemmNLevel0Cluster,
+            GemmMLevel1Cluster,
+            GemmNLevel1Cluster,
+            GemmKPerThreadLoop,
+            GemmDataPerReadA,
+            GemmDataPerReadB>{};
+
+        // LDS allocation for output and weight: be careful of alignment
+        constexpr index_t out_block_space =
+            math::integer_least_multiple(out_k_b_block_desc.GetElementSpace(), max_lds_align);
+
+        constexpr index_t wei_block_space =
+            math::integer_least_multiple(wei_k_e_block_desc.GetElementSpace(), max_lds_align);
+
+        __shared__ Float p_out_block_double[2 * out_block_space];
+        __shared__ Float p_wei_block_double[2 * wei_block_space];
+
+        // register allocation for the input-gradient accumulator
+        AccDataType p_in_thread[c_e0e1_b0b1_thread_mtx_desc.GetElementSpace()];
+
+        // zero out threadwise input
+        threadwise_matrix_set_zero(c_e0e1_b0b1_thread_mtx_desc, p_in_thread);
+
+        // LDS double buffer: preload data into LDS
+        {
+            blockwise_out_copy.Run(p_out_global, p_out_block_double);
+            blockwise_wei_copy.Run(p_wei_global, p_wei_block_double);
+        }
+
+        // LDS double buffer: main body
+        for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K;
+            k_block_data_begin += 2 * KPerBlock)
+        {
+#pragma unroll
+            for(index_t iloop = 0; iloop < 2; ++iloop)
+            {
+                const bool even_loop = (iloop % 2 == 0);
+
+                Float* p_out_block_now =
+                    even_loop ? p_out_block_double : p_out_block_double + out_block_space;
+                Float* p_wei_block_now =
+                    even_loop ? p_wei_block_double : p_wei_block_double + wei_block_space;
+
+                Float* p_out_block_next =
+                    even_loop ? p_out_block_double + out_block_space : p_out_block_double;
+                Float* p_wei_block_next =
+                    even_loop ? 
p_wei_block_double + wei_block_space : p_wei_block_double;
+
+                Float p_out_thread_buffer[blockwise_out_copy.GetThreadBufferSize()];
+                Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];
+
+                blockwise_out_copy.MoveSrcSliceWindow(Sequence{}, True);
+                blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True);
+
+                __syncthreads();
+
+                // LDS double buffer: load next data from device mem
+                blockwise_out_copy.RunLoadThreadBuffer(p_out_global, p_out_thread_buffer);
+                blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer);
+
+                // LDS double buffer: GEMM on current data
+                blockwise_gemm.Run(p_wei_block_now, p_out_block_now, p_in_thread);
+
+                // LDS double buffer: store next data to LDS
+                blockwise_out_copy.RunStoreThreadBuffer(p_out_thread_buffer, p_out_block_next);
+                blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
+            }
+        }
+
+        // LDS double buffer: tail
+        {
+            constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0);
+
+            if(has_two_iteration_left) // if there are two iterations left
+            {
+                Float p_out_thread_buffer[blockwise_out_copy.GetThreadBufferSize()];
+                Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];
+
+                blockwise_out_copy.MoveSrcSliceWindow(Sequence{}, True);
+                blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True);
+
+                __syncthreads();
+
+                // LDS double buffer: load last data from device mem
+                blockwise_out_copy.RunLoadThreadBuffer(p_out_global, p_out_thread_buffer);
+                blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer);
+
+                // LDS double buffer: GEMM on 2nd-last data
+                blockwise_gemm.Run(p_wei_block_double, p_out_block_double, p_in_thread);
+
+                // LDS double buffer: store last data to LDS
+                blockwise_out_copy.RunStoreThreadBuffer(p_out_thread_buffer,
+                                                        p_out_block_double + out_block_space);
+                blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer,
+                                                        p_wei_block_double + wei_block_space);
+
+                __syncthreads();
+
+                // LDS double buffer: GEMM on last data
+                blockwise_gemm.Run(p_wei_block_double + wei_block_space,
+                                   p_out_block_double + out_block_space,
+                                   p_in_thread);
+            }
+            else // if there is one iteration left
+            {
+                __syncthreads();
+
+                // LDS double buffer: GEMM on last data
+                blockwise_gemm.Run(p_wei_block_double, p_out_block_double, p_in_thread);
+            }
+        }
+
+        // input: register to global memory, atomic add
+        {
+            constexpr index_t E1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster;
+            constexpr index_t E0 = E / E1;
+
+            constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster;
+            constexpr index_t B0 = B / B1;
+
+            // define input tensor descriptor for threadwise copy
+            // thread input tensor, src of threadwise copy
+            constexpr auto in_e0_e1_b0_b1_thread_desc = make_native_tensor_descriptor_packed(
+                Sequence{});
+
+            // global input tensor, dst of threadwise copy
+            constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
+                in_n_c_hi_wi_global_desc,
+                make_tuple(PassThrough{},
+                           PassThrough{},
+                           Pad, LeftPads, RightPads>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));
+
+            constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
+                in_n_c_hip_wip_global_desc,
+                make_tuple(PassThrough{},
+                           PassThrough{},
+                           Embed, Sequence>{},
+                           Embed, Sequence>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+
+            constexpr auto in_e_b_global_desc = 
transform_tensor_descriptor( + in_n_c_y_ho_x_wo_global_desc, + make_tuple(Merge>{}, Merge>{}), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + constexpr auto in_e0_e1_b0_b1_global_desc = transform_tensor_descriptor( + in_e_b_global_desc, + make_tuple(UnMerge>{}, UnMerge>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + // calculate origin of thread input tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const index_t e_thread_data_on_global = + e_block_data_on_global + c_thread_mtx_on_block.row; + + const index_t b_thread_data_on_global = + b_block_data_on_global + c_thread_mtx_on_block.col; + + ThreadwiseGenericTensorSliceCopy_v4r2, + 3, + InThreadCopyDataPerAccess_B, + InThreadCopyDataPerAccess_B, + AddressSpace::vgpr, + AddressSpace::global, + InMemoryDataOperation::atomic_add>( + {0, 0, 0, 0}, + {e_thread_data_on_global / E1, + e_thread_data_on_global % E1, + b_thread_data_on_global / B1, + b_thread_data_on_global % B1}) + .Run(p_in_thread, p_in_global); + } + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index fbdd2e44..9cbc2ce1 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -107,16 +107,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr auto True = integral_constant{}; - constexpr auto global_address_space = - integral_constant{}; - - constexpr auto lds_address_space = integral_constant{}; - - constexpr auto vgpr_address_space = integral_constant{}; - - constexpr auto no_inmem_op = - integral_constant{}; - static_assert(ConvDirection == ConvolutionDirection::Forward || ConvDirection == ConvolutionDirection::BackwardWeight, "wrong! 
this kernel only support convolution forward and backward-weight"); @@ -135,17 +125,17 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; - constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); - constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); - constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2); - constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLengths()[0]; + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLengths()[1]; + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2]; + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3]; - constexpr index_t K = out_n_k_ho_wo_global_desc.GetLength(I1); - constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2); - constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLengths()[1]; + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2]; + constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLengths()[3]; - constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); - constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; + constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; constexpr index_t ConvStrideH = ConvStrides{}[0]; constexpr index_t ConvStrideW = ConvStrides{}[1]; diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index 0ebd9dc4..e2a5836e 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -60,7 +60,7 @@ __host__ __device__ constexpr auto template __host__ __device__ constexpr auto - make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) +make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) { using TDesc = ConstantTensorDescriptor_deprecated; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 66dda13c..4e5c5cc8 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -228,7 +228,7 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -236,7 +236,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index 69659445..da02abdd 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated 
private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { return MergedTensorCoordinate_deprecated< ConstantMergedTensorDescriptor_deprecated>(); diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index d7ef3867..1597e4c5 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -64,10 +64,10 @@ template __host__ __device__ constexpr auto - reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, - Sequence, - Sequence, - Sequence) +reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) { return TransformedTensorDescriptor...>, @@ -78,7 +78,7 @@ __host__ __device__ constexpr auto // reorder a NativeTensorDescriptor template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) +reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); @@ -96,7 +96,7 @@ __host__ __device__ constexpr auto // reorder a TransformedTensorDescriptor template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) +reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! 
MapLower2Upper is not a valid map"); diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 6fc5632b..784bc1a3 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -72,9 +72,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 template __device__ void Run(const SrcData* p_src, DstData* p_dst) const { - using src_vector_t = typename vector_type::MemoryType; - using dst_vector_t = typename vector_type::MemoryType; - constexpr auto vector_access_dim = Number{}; constexpr auto src_data_per_access = Number{}; @@ -176,9 +173,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 __device__ void Run_optimized_src_address_calculation(const SrcData* p_src, DstData* p_dst) const { - using src_vector_t = typename vector_type::MemoryType; - using dst_vector_t = typename vector_type::MemoryType; - constexpr auto vector_access_dim = Number{}; constexpr auto src_data_per_access = Number{}; @@ -327,9 +321,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, DstData* p_dst) const { - using src_vector_t = typename vector_type::MemoryType; - using dst_vector_t = typename vector_type::MemoryType; - constexpr auto vector_access_dim = Number{}; constexpr auto src_data_per_access = Number{}; diff --git a/composable_kernel/include/utility/in_memory_operation.amd.hpp.in b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in index a31da480..190e2b61 100644 --- a/composable_kernel/include/utility/in_memory_operation.amd.hpp.in +++ b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in @@ -50,13 +50,14 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, in { using vector_t = typename vector_type::MemoryType; - static_if{}([&](auto) { - atomicAdd(reinterpret_cast(&p_dst[dst_offset]), - *reinterpret_cast(&p_src[src_offset])); - }).Else([&](auto fwd) { - static_assert(fwd(false), "atomic_add doesn't support this memory space"); - }); + static_if{}( + [&](auto) { + atomicAdd(reinterpret_cast(&p_dst[dst_offset]), + *reinterpret_cast(&p_src[src_offset])); + }) + .Else([&](auto fwd) { + static_assert(fwd(false), "atomic_add doesn't support this memory space"); + }); } template ::MemoryType; - static_if{}([&](auto) { - atomicAdd(reinterpret_cast(&p_dst[dst_offset]), - *reinterpret_cast(&p_src[src_offset])); - }).Else([&](auto fwd) { - static_assert(fwd(false), "atomic_add doesn't support this memory space"); - }); + static_if{}( + [&](auto) { + atomicAdd(reinterpret_cast(&p_dst[dst_offset]), + *reinterpret_cast(&p_src[src_offset])); + }) + .Else([&](auto fwd) { + static_assert(fwd(false), "atomic_add doesn't support this memory space"); + }); } template +#include "device.hpp" +#include "tensor.hpp" +#include "gridwise_operation_wrapper.hpp" +#include "gridwise_convolution_backward_data_implicit_gemm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp" + +template +void device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw(InDesc in_nchw_desc, + Tensor& in_nchw, + WeiDesc wei_kcyx_desc, + const Tensor& wei_kcyx, + OutDesc out_nkhw_desc, + const Tensor& out_nkhw, + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + std::size_t nrepeat) +{ + using namespace ck; + + constexpr index_t N = out_nkhw_desc.GetLengths()[0]; + constexpr index_t K = 
out_nkhw_desc.GetLengths()[1]; + constexpr index_t Ho = out_nkhw_desc.GetLengths()[2]; + constexpr index_t Wo = out_nkhw_desc.GetLengths()[3]; + + constexpr index_t C = wei_kcyx_desc.GetLengths()[1]; + constexpr index_t Y = wei_kcyx_desc.GetLengths()[2]; + constexpr index_t X = wei_kcyx_desc.GetLengths()[3]; + + std::size_t data_sz = sizeof(T); + DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace()); + + in_nchw_device_buf.ToDevice(in_nchw.mData.data()); + wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); + out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); + +#if 1 + // BlockSize = 256, each thread hold 64 data + constexpr index_t BlockSize = 256; + + constexpr index_t EPerBlock = 128; + constexpr index_t BPerBlock = 128; + constexpr index_t KPerBlock = 8; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using OutBlockCopySubLengths_K_B = Sequence<4, 1>; + using OutBlockCopyClusterLengths_K_B = Sequence<2, 128>; + + constexpr index_t OutBlockCopyDataPerAccess_B = 1; + + using WeiBlockCopySubLengths_K_E = Sequence<1, 4>; + using WeiBlockCopyClusterLengths_K_E = Sequence<8, 32>; + + constexpr index_t WeiBlockCopyDataPerAccess_E = 4; + + constexpr index_t InThreadCopyDataPerAccess_B = 1; +#endif + + constexpr index_t E = C * Y * X; + constexpr index_t B = (N * Ho * Wo); + + constexpr index_t GridSize = + ((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + constexpr auto gridwise_conv = + GridwiseConvolutionBackwardDataImplicitGemm_v1_nchw_kcyx_nkhw_lds_double_buffer< + GridSize, + BlockSize, + T, + T, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + EPerBlock, + BPerBlock, + KPerBlock, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB, + OutBlockCopySubLengths_K_B, + OutBlockCopyClusterLengths_K_B, + OutBlockCopyDataPerAccess_B, + WeiBlockCopySubLengths_K_E, + WeiBlockCopyClusterLengths_K_E, + WeiBlockCopyDataPerAccess_E, + InThreadCopyDataPerAccess_B>{}; + + for(index_t i = 0; i < nrepeat; ++i) + { + float time = launch_kernel(run_gridwise_operation, + dim3(GridSize), + dim3(BlockSize), + 0, + gridwise_conv, + const_cast( + static_cast(in_nchw_device_buf.GetDeviceBuffer())), + const_cast( + static_cast(wei_kcyx_device_buf.GetDeviceBuffer())), + const_cast( + static_cast(out_nkhw_device_buf.GetDeviceBuffer()))); + + printf("Elapsed time : %f ms, %f TFlop/s\n", + time, + (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) / + (std::size_t(1000) * 1000 * 1000) / time); + usleep(std::min(time * 1000, float(10000))); + } + + in_nchw_device_buf.FromDevice(in_nchw.mData.data()); +} diff --git a/driver/include/host_conv_bwd_data.hpp b/driver/include/host_conv_bwd_data.hpp new file mode 100644 index 00000000..fa6df727 --- /dev/null +++ 
b/driver/include/host_conv_bwd_data.hpp @@ -0,0 +1,71 @@ +#pragma once +#include "tensor.hpp" + +template +void host_direct_convolution_bwd_data(Tensor& in_nchw, + const Tensor& wei_kcyx, + const Tensor& out_nkhw, + ConvStrides, + ConvDilations, + LeftPads, + RightPads) +{ + using namespace ck; + + int N = in_nchw.mDesc.GetLengths()[0]; + int C = in_nchw.mDesc.GetLengths()[1]; + int HI = in_nchw.mDesc.GetLengths()[2]; + int WI = in_nchw.mDesc.GetLengths()[3]; + + std::size_t K = wei_kcyx.mDesc.GetLengths()[0]; + std::size_t Y = wei_kcyx.mDesc.GetLengths()[2]; + std::size_t X = wei_kcyx.mDesc.GetLengths()[3]; + + std::size_t HO = out_nkhw.mDesc.GetLengths()[2]; + std::size_t WO = out_nkhw.mDesc.GetLengths()[3]; + + auto f = [&](auto n, auto c, auto hi, auto wi) { + double v = 0; + + for(int y = 0; y < Y; ++y) + { + int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0]; + + if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0) + { + int ho = h_tmp / ConvStrides{}[0]; + + for(int x = 0; x < X; ++x) + { + int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1]; + + if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0) + { + int wo = w_tmp / ConvStrides{}[1]; + + for(int k = 0; k < K; ++k) + { + v += out_nkhw(n, k, ho, wo) * wei_kcyx(k, c, y, x); + } + } + } + } + } + + in_nchw(n, c, hi, wi) = v; + }; + + auto f_par = make_ParallelTensorFunctor(f, + in_nchw.mDesc.GetLengths()[0], + in_nchw.mDesc.GetLengths()[1], + in_nchw.mDesc.GetLengths()[2], + in_nchw.mDesc.GetLengths()[3]); + + f_par(std::thread::hardware_concurrency()); +} diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/src/conv_bwd_data_driver.cpp new file mode 100644 index 00000000..ed7334f9 --- /dev/null +++ b/driver/src/conv_bwd_data_driver.cpp @@ -0,0 +1,379 @@ +#include +#include +#include +#include +#include +#include "config.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "print_array.hpp" +#include "print_sequence.hpp" +#include "device.hpp" +#include "tensor_generator.hpp" +#include "device_tensor.hpp" +#include "conv_common.hpp" +#include "host_conv_bwd_data.hpp" +#include "device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw.hpp" + +int main(int argc, char* argv[]) +{ + using namespace ck; + +#if 0 + constexpr index_t N = 2; + constexpr index_t C = 8; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 128; + constexpr index_t Y = 4; + constexpr index_t X = 4; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<1, 1>; + using RightPads = Sequence<2, 2>; +#elif 0 + // 3x3, 34x34 + constexpr index_t N = 64; + constexpr index_t C = 256; + constexpr index_t HI = 34; + constexpr index_t WI = 34; + constexpr index_t K = 128; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% + constexpr index_t N = 64; + constexpr index_t C = 1536; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51% + constexpr 
index_t N = 128; + constexpr index_t C = 2048; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65% + constexpr index_t N = 128; + constexpr index_t C = 1280; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50% + constexpr index_t N = 128; + constexpr index_t C = 512; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 8x8 image + // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61% + constexpr index_t N = 64; + constexpr index_t C = 1536; + constexpr index_t HI = 8; + constexpr index_t WI = 8; + constexpr index_t K = 384; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 28x28 image + // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69% + constexpr index_t N = 128; + constexpr index_t C = 256; + constexpr index_t HI = 28; + constexpr index_t WI = 28; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 17x17 input + // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76% + constexpr index_t N = 128; + constexpr index_t C = 768; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 
64% + constexpr index_t N = 128; + constexpr index_t C = 528; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 14x14 image + // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75% + constexpr index_t N = 128; + constexpr index_t C = 528; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 256; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 1x1 filter, 7x7 image + // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52% + constexpr index_t N = 128; + constexpr index_t C = 832; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output + // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% + constexpr index_t N = 128; + constexpr index_t C = 288; + constexpr index_t HI = 35; + constexpr index_t WI = 35; + constexpr index_t K = 384; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<2, 2>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; +#elif 0 + // 5x5 filter, 2x2 pad, 7x7 input + constexpr index_t N = 128; + constexpr index_t C = 48; + constexpr index_t HI = 7; + constexpr index_t WI = 7; + constexpr index_t K = 128; + constexpr index_t Y = 5; + constexpr index_t X = 5; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<2, 2>; + using RightPads = Sequence<2, 2>; +#elif 0 + // 7x1 filter, 3x0 pad, 17x17 input + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 7; + constexpr index_t X = 1; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<3, 0>; + using RightPads = Sequence<3, 0>; +#elif 1 + // 1x7 filter, 0x3 pad, 17x17 input + constexpr index_t N = 128; + constexpr index_t C = 128; + constexpr index_t HI = 17; + constexpr index_t WI = 17; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 7; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + using LeftPads = Sequence<0, 3>; + using RightPads = Sequence<0, 3>; +#endif + + constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor( + in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{}); + + ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: "); + ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); + ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); + print_sequence("LeftPads", LeftPads{}); 
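+    // The default output descriptor above presumably follows the standard
+    // convolution output-size formula; a minimal sketch for one spatial
+    // dimension (hypothetical helper, kept as a comment rather than driver code):
+    //
+    //     index_t out_len(index_t in, index_t pad_l, index_t pad_r,
+    //                     index_t filter, index_t dilation, index_t stride)
+    //     {
+    //         const index_t eff_filter = (filter - 1) * dilation + 1;
+    //         return (in + pad_l + pad_r - eff_filter) / stride + 1;
+    //     }
+    //
+    // e.g. for the active 1x7 case: Wo = (17 + 3 + 3 - 7) / 1 + 1 = 17.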
+ print_sequence("LeftPads", LeftPads{}); + print_sequence("RightPads", RightPads{}); + print_sequence("ConvStrides", ConvStrides{}); + print_sequence("ConvDilations", ConvDilations{}); + + Tensor in_nchw_device(make_TensorDescriptor(in_nchw_desc)); + Tensor in_nchw_host(make_TensorDescriptor(in_nchw_desc)); + Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); + Tensor out_nkhw(make_TensorDescriptor(out_nkhw_desc)); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(argc != 3) + { + printf("arg1: do_verification, arg2: nrepeat\n"); + exit(1); + } + + bool do_verification = atoi(argv[1]); + std::size_t nrepeat = atoi(argv[2]); + + if(do_verification) + { +#if 0 + out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + wei_kcyx.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); +#else + out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); +#endif + } + +#if 1 + device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw(in_nchw_desc, + in_nchw_device, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}, + nrepeat); +#endif + + if(do_verification) + { + host_direct_convolution_bwd_data(in_nchw_host, + wei_kcyx, + out_nkhw, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}); + + check_error(in_nchw_host, in_nchw_device); + +#if 0 + LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl; + LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl; + LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl; +#endif + } +} diff --git a/driver/src/conv_bwd_data_driver.cu b/driver/src/conv_bwd_data_driver.cu new file mode 120000 index 00000000..bf6baa8d --- /dev/null +++ b/driver/src/conv_bwd_data_driver.cu @@ -0,0 +1 @@ +conv_bwd_data_driver.cpp \ No newline at end of file From d2490b490b3edc32258955874b54a71bb9a69d4a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 19 Nov 2019 17:22:46 -0600 Subject: [PATCH 13/23] rename --- ...v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp} | 6 ++-- ...ata_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp} | 26 ++++++++--------- driver/src/conv_bwd_data_driver.cpp | 28 +++++++++---------- 3 files changed, 29 insertions(+), 31 deletions(-) rename composable_kernel/include/kernel_algorithm/{gridwise_convolution_backward_data_implicit_gemm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp => gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp} (98%) rename driver/include/{device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw.hpp => device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp} (90%) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp similarity index 98% rename from composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp rename to composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 92606525..140a2a8c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -1,5 +1,5 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" #include "tensor_descriptor.hpp" @@ -41,7 +41,7 @@ template -struct GridwiseConvolutionBackwardDataImplicitGemm_v1_nchw_kcyx_nkhw_lds_double_buffer +struct GridwiseConvolutionBackwardDataImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer { __device__ void Run(Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, diff --git a/driver/include/device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp similarity index 90% rename from driver/include/device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw.hpp rename to driver/include/device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index 2175a512..e85caac2 100644 --- a/driver/include/device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_operation_wrapper.hpp" -#include "gridwise_convolution_backward_data_implicit_gemm_v1_nchw_kcyx_nkhw_lds_double_buffer.hpp" +#include "gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp" template -void device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw(InDesc in_nchw_desc, - Tensor& in_nchw, - WeiDesc wei_kcyx_desc, - const Tensor& wei_kcyx, - OutDesc out_nkhw_desc, - const Tensor& out_nkhw, - ConvStrides, - ConvDilations, - LeftPads, - RightPads, - std::size_t nrepeat) +void device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc in_nchw_desc, + Tensor& in_nchw, + WeiDesc wei_kcyx_desc, + const Tensor& wei_kcyx, + OutDesc out_nkhw_desc, + const Tensor& out_nkhw, + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + std::size_t nrepeat) { using namespace ck; @@ -85,7 +85,7 @@ void device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw(InDesc in_nchw_ printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); constexpr auto gridwise_conv = - GridwiseConvolutionBackwardDataImplicitGemm_v1_nchw_kcyx_nkhw_lds_double_buffer< + GridwiseConvolutionBackwardDataImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer< GridSize, BlockSize, T, diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/src/conv_bwd_data_driver.cpp index ed7334f9..b6d8ee22 100644 --- a/driver/src/conv_bwd_data_driver.cpp +++ b/driver/src/conv_bwd_data_driver.cpp @@ -13,7 +13,7 @@ #include "device_tensor.hpp" #include "conv_common.hpp" #include "host_conv_bwd_data.hpp" -#include "device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw.hpp" +#include "device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" int main(int argc, char* argv[]) { @@ -96,7 +96,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 1x1 filter, 8x8 image // cudnn@V100 83%, ck@V100 75%, ck@P100 
78%, ck@VII 65% constexpr index_t N = 128; @@ -344,19 +344,17 @@ int main(int argc, char* argv[]) #endif } -#if 1 - device_convolution_bwd_data_implicit_gemm_v1_nchw_kcyx_nkhw(in_nchw_desc, - in_nchw_device, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}, - nrepeat); -#endif + device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, + in_nchw_device, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}, + nrepeat); if(do_verification) { From 2f0f26d3aaa601cd8b0ce70c7b90fd0cf5c3bad8 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 20 Nov 2019 14:53:12 -0600 Subject: [PATCH 14/23] adding bwd data --- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 2 +- ..._v4r5_nchw_kcyx_nkhw_lds_double_buffer.hpp | 438 ++++++++++++++++++ ...data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 2 +- ...data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp | 149 ++++++ driver/src/conv_bwd_data_driver.cpp | 28 +- 5 files changed, 606 insertions(+), 13 deletions(-) create mode 100644 composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer.hpp create mode 100644 driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 140a2a8c..82b4e086 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -22,8 +22,8 @@ template +struct GridwiseConvolutionBackwardDataImplicitGemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer +{ + __device__ void Run(Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + const Float* const __restrict__ p_out_global) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto True = integral_constant{}; + + constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; + + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLengths()[0]; + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLengths()[1]; + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2]; + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3]; + + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLengths()[1]; + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2]; + constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLengths()[3]; + + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; + constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; + + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + constexpr index_t C0 = GemmMPerThreadSubC; + constexpr index_t N0 = GemmNPerThreadSubC; + + 
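+        // v4r5 splits C = C0 * C1 and N = N0 * N1 (C0 = GemmMPerThreadSubC,
+        // N0 = GemmNPerThreadSubC) so that each thread's GEMM sub-tile lands on
+        // contiguous C0/N0 slices; a rough sketch of the GEMM view assumed by
+        // the descriptors built below (not code from this kernel):
+        //
+        //     GemmM = E * C0, with E = C1 * Y * X
+        //     GemmN = B * N0, with B = N1 * Ho * Wo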
static_assert(C % C0 == 0 && N % N0 == 0, "wrong!"); + + constexpr index_t C1 = C / C0; + constexpr index_t N1 = N / N0; + + constexpr index_t E = C1 * Y * X; + constexpr index_t B = N1 * Ho * Wo; + + // sanity-check for vectorized memory load + static_assert((Wo == 1 || (ConvStrideW == 1 || InThreadCopyDstDataPerWrite_B == 1)) && + (X == 1 || ConvDilationW % InThreadCopyDstDataPerWrite_B == 0), + "wrong! aligment requirement for vectorized global load of input tensor will " + "be violated"); + + // divide block work by [K, B] + static_assert(E % EPerBlock == 0 && B % BPerBlock == 0 && K % KPerBlock == 0, + "wrong! cannot divide work evenly among block"); + + constexpr index_t EBlockWork = E / EPerBlock; + constexpr index_t BBlockWork = B / BPerBlock; + + constexpr auto block_work_desc = + make_cluster_descriptor(Sequence{}); + + const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); + + const index_t e_block_data_on_global = block_work_id[0] * EPerBlock; + const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; + + // output tensor + // global tensor in global memory, src of blockwise copy + constexpr auto out_n_k_howo_global_desc = + unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3); + + constexpr auto out_n0_n1_k_howo_global_desc = transform_tensor_descriptor( + out_n_k_howo_global_desc, + make_tuple(UnMerge>{}, PassThrough{}, PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{})); + + constexpr auto out_k_b_n0_global_desc = transform_tensor_descriptor( + out_n0_n1_k_howo_global_desc, + make_tuple(PassThrough{}, Merge>{}, PassThrough{}), + make_tuple(Sequence<2>{}, Sequence<1, 3>{}, Sequence<0>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // block tensor in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto out_k_b_n0_block_desc = make_native_tensor_descriptor_aligned( + Sequence{}, Number{}); + + // output tensor blockwise copy + auto blockwise_out_copy = + BlockwiseGenericTensorSliceCopy_v4, + Sequence<0, 1, 2>, + Sequence<0, 1, 2>, + 1, + 2, + OutBlockCopySrcDataPerRead_B, + OutBlockCopyDstDataPerWrite_N0, + AddressSpace::global, + AddressSpace::vgpr, + AddressSpace::lds, + InMemoryDataOperation::none>( + {0, b_block_data_on_global, 0}, {0, 0, 0}); + + // weight tensor + // global tensor in global memory, src of blockwise copy + constexpr auto wei_k_cyx_global_desc = + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3); + + constexpr auto wei_k_c0_e_global_desc = + transform_tensor_descriptor(wei_k_cyx_global_desc, + make_tuple(PassThrough{}, UnMerge>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{})); + + constexpr auto wei_k_e_c0_global_desc = reorder_tensor_descriptor_given_lower2upper( + wei_k_c0_e_global_desc, Sequence<0, 2, 1>{}); + + // block tensor in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto wei_k_e_c0_block_desc = make_native_tensor_descriptor_aligned( + Sequence{}, Number{}); + + // weight tensor blockwise copy + auto blockwise_wei_copy = + BlockwiseGenericTensorSliceCopy_v4, + Sequence<0, 1, 2>, + Sequence<0, 1, 2>, + 1, + 2, + WeiBlockCopySrcDataPerRead_E, + WeiBlockCopyDstDataPerWrite_C0, + AddressSpace::global, + AddressSpace::vgpr, + AddressSpace::lds, + InMemoryDataOperation::none>( + {0, e_block_data_on_global, 0}, {0, 0, 0}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx 
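+        // (in this backward-data kernel, a_mtx holds the weight tile, b_mtx the
+        // output tile, and c_mtx accumulates the input gradient in registers
+        // until the atomic-add writeback at the end of Run)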
+ // a_mtx[KPerBlock, EPerBlock*C0] is in LDS + // b_mtx[KPerBlocl, BPerBlock*N0] is in LDS + // c_mtx[EPerBlock*C0, BPerBlock*N0] is distributed among threads, and saved in + // register + constexpr auto a_k_ec0_block_mtx_desc = make_ConstantMatrixDescriptor( + wei_k_e_c0_block_desc.GetLength(I0), + wei_k_e_c0_block_desc.GetLength(I1) * wei_k_e_c0_block_desc.GetLength(I2), + wei_k_e_c0_block_desc.GetStride(I0)); + constexpr auto b_k_bn0_block_mtx_desc = make_ConstantMatrixDescriptor( + out_k_b_n0_block_desc.GetLength(I0), + out_k_b_n0_block_desc.GetLength(I1) * out_k_b_n0_block_desc.GetLength(I2), + out_k_b_n0_block_desc.GetStride(I0)); + + // sanity check alignment + // TODO: this check is ad-hoc, should enforce it by enforcing alignment of + // wei_k_e_c0_block_desc and out_k_b_n0_block_desc + static_assert(a_k_ec0_block_mtx_desc.RowStride() % GemmDataPerReadB == 0, "wrong!"); + static_assert(b_k_bn0_block_mtx_desc.RowStride() % GemmDataPerReadA == 0, "wrong!"); + + // sanity check + static_assert(EPerBlock % (GemmMLevel0Cluster * GemmMLevel1Cluster) == 0 && + BPerBlock % (GemmNLevel0Cluster * GemmNLevel1Cluster) == 0, + "wrong!"); + + constexpr index_t GemmMRepeat = EPerBlock / (GemmMLevel0Cluster * GemmMLevel1Cluster); + constexpr index_t GemmNRepeat = BPerBlock / (GemmNLevel0Cluster * GemmNLevel1Cluster); + + // c_thread_mtx definition: this is a mess + // TODO:: more elegent way of defining c_thread_mtx + constexpr auto c_e0e1c0_b0b1n0_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( + Number{}, Number{}); + + const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< + BlockSize, + decltype(a_k_ec0_block_mtx_desc), + decltype(b_k_bn0_block_mtx_desc), + decltype(c_e0e1c0_b0b1n0_thread_mtx_desc), + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB>{}; + + // LDS allocation for input and weight: be careful of alignment + constexpr index_t max_lds_align = math::lcm(WeiBlockCopyDstDataPerWrite_C0, + OutBlockCopyDstDataPerWrite_N0, + GemmDataPerReadA, + GemmDataPerReadB); + + constexpr index_t out_block_space = + math::integer_least_multiple(out_k_b_n0_block_desc.GetElementSpace(), max_lds_align); + + constexpr index_t wei_block_space = + math::integer_least_multiple(wei_k_e_c0_block_desc.GetElementSpace(), max_lds_align); + + __shared__ Float p_out_block_double[2 * out_block_space]; + __shared__ Float p_wei_block_double[2 * wei_block_space]; + + // register allocation for output + AccFloat p_in_thread[c_e0e1c0_b0b1n0_thread_mtx_desc.GetElementSpace()]; + + // zero out threadwise output + threadwise_matrix_set_zero(c_e0e1c0_b0b1n0_thread_mtx_desc, p_in_thread); + + // LDS double buffer: preload data into LDS + { + blockwise_out_copy.Run(p_out_global, p_out_block_double); + blockwise_wei_copy.Run(p_wei_global, p_wei_block_double); + } + + // LDS double buffer: main body + for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K; + k_block_data_begin += 2 * KPerBlock) + { +#pragma unroll + for(index_t iloop = 0; iloop < 2; ++iloop) + { + const bool even_loop = (iloop % 2 == 0); + + Float* p_out_block_now = + even_loop ? p_out_block_double : p_out_block_double + out_block_space; + Float* p_wei_block_now = + even_loop ? p_wei_block_double : p_wei_block_double + wei_block_space; + + Float* p_out_block_next = + even_loop ? 
p_out_block_double + out_block_space : p_out_block_double; + Float* p_wei_block_next = + even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double; + + Float p_out_thread_buffer[blockwise_out_copy.GetThreadBufferSize()]; + Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; + + blockwise_out_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); + + __syncthreads(); + + // LDS doubel buffer: load next data from device mem + blockwise_out_copy.RunLoadThreadBuffer(p_out_global, p_out_thread_buffer); + blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(p_wei_block_now, p_out_block_now, p_in_thread); + + // LDS double buffer: store next data to LDS + blockwise_out_copy.RunStoreThreadBuffer(p_out_thread_buffer, p_out_block_next); + blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next); + } + } + + // LDS double buffer: tail + { + constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0); + + if(has_two_iteration_left) // if has 2 iteration left + { + Float p_out_thread_buffer[blockwise_out_copy.GetThreadBufferSize()]; + Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; + + blockwise_out_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); + + __syncthreads(); + + // LDS double buffer: load last data from device mem + blockwise_out_copy.RunLoadThreadBuffer(p_out_global, p_out_thread_buffer); + blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run(p_wei_block_double, p_out_block_double, p_in_thread); + + // LDS double buffer: store last data to LDS + blockwise_out_copy.RunStoreThreadBuffer(p_out_thread_buffer, + p_out_block_double + out_block_space); + blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, + p_wei_block_double + wei_block_space); + + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(p_wei_block_double + wei_block_space, + p_out_block_double + out_block_space, + p_in_thread); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(p_wei_block_double, p_out_block_double, p_in_thread); + } + } + + // input: register to global memory, atomic add + { + constexpr index_t E1 = GemmMLevel0Cluster * GemmMLevel1Cluster; + constexpr index_t E0 = E / E1; + + constexpr index_t B1 = GemmNLevel0Cluster * GemmNLevel1Cluster; + constexpr index_t B0 = B / B1; + + // define input tensor descriptor for threadwise copy + // thread input tensor, src of threadwise copy + constexpr auto in_e0_e1_c0_b0_b1_n0_thread_desc = make_native_tensor_descriptor_packed( + Sequence{}); + + // global input tensor, dst of threadwise copy + constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Pad, LeftPads, RightPads>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + + constexpr auto in_n0_n1_c0_c1_y_ho_x_wo_global_desc = transform_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(UnMerge>{}, + UnMerge>{}, + Embed, Sequence>{}, + Embed, Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, 
Sequence<4, 5>{}, Sequence<6, 7>{})); + + constexpr auto in_e_c0_b_n0_global_desc = transform_tensor_descriptor( + in_n0_n1_c0_c1_y_ho_x_wo_global_desc, + make_tuple(Merge>{}, + PassThrough{}, + Merge>{}, + PassThrough{}), + make_tuple(Sequence<3, 4, 6>{}, Sequence<2>{}, Sequence<1, 5, 7>{}, Sequence<0>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + constexpr auto in_e0_e1_c0_b0_b1_n0_global_desc = transform_tensor_descriptor( + in_e_c0_b_n0_global_desc, + make_tuple(UnMerge>{}, + PassThrough{}, + UnMerge>{}, + PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + // calculate origin of thread input tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const index_t e_thread_data_on_global = + e_block_data_on_global + c_thread_mtx_on_block.row / GemmMPerThreadSubC; + + const index_t b_thread_data_on_global = + b_block_data_on_global + c_thread_mtx_on_block.col / GemmNPerThreadSubC; + + ThreadwiseGenericTensorSliceCopy_v4r2< + decltype(in_e0_e1_c0_b0_b1_n0_thread_desc), + decltype(in_e0_e1_c0_b0_b1_n0_global_desc), + decltype(in_e0_e1_c0_b0_b1_n0_thread_desc.GetLengths()), + Sequence<0, 1, 2, 3, 4, 5>, + 4, + 1, + InThreadCopyDstDataPerWrite_B, + AddressSpace::vgpr, + AddressSpace::global, + InMemoryDataOperation::atomic_add>({0, 0, 0, 0, 0, 0}, + {e_thread_data_on_global / E1, + e_thread_data_on_global % E1, + 0, + b_thread_data_on_global / B1, + b_thread_data_on_global % B1, + 0}) + .Run(p_in_thread, p_in_global); + } + } +}; + +} // namespace ck +#endif diff --git a/driver/include/device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index e85caac2..37fd847c 100644 --- a/driver/include/device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -97,8 +97,8 @@ void device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc i ConvDilations, LeftPads, RightPads, - EPerBlock, BPerBlock, + EPerBlock, KPerBlock, GemmMPerThreadSubC, GemmNPerThreadSubC, diff --git a/driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp new file mode 100644 index 00000000..464d6004 --- /dev/null +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp @@ -0,0 +1,149 @@ +#pragma once +#include +#include "device.hpp" +#include "tensor.hpp" +#include "gridwise_operation_wrapper.hpp" +#include "gridwise_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer.hpp" + +template +void device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw(InDesc in_nchw_desc, + Tensor& in_nchw, + WeiDesc wei_kcyx_desc, + const Tensor& wei_kcyx, + OutDesc out_nkhw_desc, + const Tensor& out_nkhw, + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + std::size_t nrepeat) +{ + using namespace ck; + + constexpr index_t N = out_nkhw_desc.GetLengths()[0]; + constexpr index_t K = out_nkhw_desc.GetLengths()[1]; + constexpr index_t Ho = out_nkhw_desc.GetLengths()[2]; + constexpr index_t Wo = out_nkhw_desc.GetLengths()[3]; + + constexpr index_t C = 
wei_kcyx_desc.GetLengths()[1]; + constexpr index_t Y = wei_kcyx_desc.GetLengths()[2]; + constexpr index_t X = wei_kcyx_desc.GetLengths()[3]; + + std::size_t data_sz = sizeof(T); + DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace()); + + in_nchw_device_buf.ToDevice(in_nchw.mData.data()); + wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); + out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); + +#if 1 + // BlockSize = 256, each thread hold 64 data + constexpr index_t BlockSize = 256; + + constexpr index_t BPerBlock = 32; + constexpr index_t EPerBlock = 32; + constexpr index_t KPerBlock = 8; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using OutBlockCopySubLengths_K_B_N0 = Sequence<1, 1, 4>; + using OutBlockCopyClusterLengths_K_B_N0 = Sequence<8, 32, 1>; + + constexpr index_t OutBlockCopySrcDataPerRead_B = 1; + constexpr index_t OutBlockCopyDstDataPerWrite_N0 = 4; + + using WeiBlockCopySubLengths_K_E_C0 = Sequence<1, 4, 1>; + using WeiBlockCopyClusterLengths_K_E_C0 = Sequence<8, 8, 4>; + + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_C0 = 1; + + constexpr index_t InThreadCopyDstDataPerWrite_B = 1; +#endif + + constexpr index_t E = C * Y * X; + constexpr index_t B = (N * Ho * Wo); + + constexpr index_t GridSize = + ((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + constexpr auto gridwise_conv = + GridwiseConvolutionBackwardDataImplicitGemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer< + GridSize, + BlockSize, + T, + T, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + BPerBlock, + EPerBlock, + KPerBlock, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB, + OutBlockCopySubLengths_K_B_N0, + OutBlockCopyClusterLengths_K_B_N0, + OutBlockCopySrcDataPerRead_B, + OutBlockCopyDstDataPerWrite_N0, + WeiBlockCopySubLengths_K_E_C0, + WeiBlockCopyClusterLengths_K_E_C0, + WeiBlockCopySrcDataPerRead_E, + WeiBlockCopyDstDataPerWrite_C0, + InThreadCopyDstDataPerWrite_B>{}; + + for(index_t i = 0; i < nrepeat; ++i) + { + float time = launch_kernel(run_gridwise_operation, + dim3(GridSize), + dim3(BlockSize), + 0, + gridwise_conv, + const_cast( + static_cast(in_nchw_device_buf.GetDeviceBuffer())), + const_cast( + static_cast(wei_kcyx_device_buf.GetDeviceBuffer())), + const_cast( + static_cast(out_nkhw_device_buf.GetDeviceBuffer()))); + + printf("Elapsed time : %f ms, %f TFlop/s\n", + time, + (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) / + (std::size_t(1000) * 1000 * 1000) / time); + usleep(std::min(time * 1000, float(10000))); + } + + in_nchw_device_buf.FromDevice(in_nchw.mData.data()); +} diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/src/conv_bwd_data_driver.cpp index 
b6d8ee22..48ec40c8 100644 --- a/driver/src/conv_bwd_data_driver.cpp +++ b/driver/src/conv_bwd_data_driver.cpp @@ -14,6 +14,7 @@ #include "conv_common.hpp" #include "host_conv_bwd_data.hpp" #include "device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" +#include "device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp" int main(int argc, char* argv[]) { @@ -344,17 +345,22 @@ int main(int argc, char* argv[]) #endif } - device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, - in_nchw_device, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}, - nrepeat); +#if 0 + device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw +#else + device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw +#endif + (in_nchw_desc, + in_nchw_device, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}, + nrepeat); if(do_verification) { From a7a1e3c1e2feb05d7eca3446de38f0ea111b3dcb Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 20 Nov 2019 19:16:03 -0600 Subject: [PATCH 15/23] add bwd-data v4r5 --- ...data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp | 10 ++++-- driver/src/conv_bwd_data_driver.cpp | 35 ++++++++++--------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp index 464d6004..86631baa 100644 --- a/driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp @@ -78,8 +78,14 @@ void device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw(InDesc i constexpr index_t InThreadCopyDstDataPerWrite_B = 1; #endif - constexpr index_t E = C * Y * X; - constexpr index_t B = (N * Ho * Wo); + constexpr index_t C0 = GemmMPerThreadSubC; + constexpr index_t N0 = GemmNPerThreadSubC; + + constexpr index_t C1 = C / C0; + constexpr index_t N1 = N / N0; + + constexpr index_t E = C1 * Y * X; + constexpr index_t B = (N1 * Ho * Wo); constexpr index_t GridSize = ((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock); diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/src/conv_bwd_data_driver.cpp index 48ec40c8..1e54401e 100644 --- a/driver/src/conv_bwd_data_driver.cpp +++ b/driver/src/conv_bwd_data_driver.cpp @@ -21,19 +21,19 @@ int main(int argc, char* argv[]) using namespace ck; #if 0 - constexpr index_t N = 2; - constexpr index_t C = 8; - constexpr index_t HI = 8; - constexpr index_t WI = 8; - constexpr index_t K = 128; - constexpr index_t Y = 4; - constexpr index_t X = 4; + constexpr index_t N = 128; + constexpr index_t C = 256; + constexpr index_t HI = 35; + constexpr index_t WI = 35; + constexpr index_t K = 384; + constexpr index_t Y = 3; + constexpr index_t X = 3; - using ConvStrides = Sequence<1, 1>; + using ConvStrides = Sequence<2, 2>; using ConvDilations = Sequence<1, 1>; - using LeftPads = Sequence<1, 1>; - using RightPads = Sequence<2, 2>; + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; #elif 0 // 3x3, 34x34 constexpr index_t N = 64; @@ -49,7 +49,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% 
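    // (the percentages above presumably denote the achieved fraction of device
    // peak throughput for the cudnn and ck implementations of this layer)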
constexpr index_t N = 64; @@ -97,7 +97,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 1x1 filter, 8x8 image // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65% constexpr index_t N = 128; @@ -241,7 +241,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -287,7 +287,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -375,9 +375,10 @@ int main(int argc, char* argv[]) check_error(in_nchw_host, in_nchw_device); #if 0 - LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl; - LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl; - LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl; + LogRange(std::cout << "out_nkhw : ", out_nkhw.mData, ",") << std::endl; + LogRange(std::cout << "wei_kcyx : ", wei_kcyx.mData, ",") << std::endl; + LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl; + LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl; #endif } } From 528051d23c693f507527dda7bf21cf8183873189 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 21 Nov 2019 14:16:06 -0600 Subject: [PATCH 16/23] hand tune some params for v4r1 and v4r4 --- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 41 +++++- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 123 +++++++++++------- driver/src/driver.cpp | 20 +-- 3 files changed, 128 insertions(+), 56 deletions(-) diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index ccff9e72..c4b9b7de 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -54,8 +54,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 1 - // BlockSize = 256, each thread hold 64 data +#if 0 + // BlockSize = 256, EPerBlock = 8, each thread holds 64 data values constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; @@ -89,6 +89,43 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; +#elif 1 + // BlockSize = 256, EPerBlock = 16, each thread holds 64 data values + constexpr index_t BlockSize = 256; + + constexpr index_t BPerBlock = 16; + constexpr index_t KPerBlock = 128; + constexpr index_t EPerBlock = 16; + + constexpr index_t GemmNRepeat = 2; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using
InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 2, 1, 4>; + using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<16, 1, 16, 1>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2] + using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] + + constexpr index_t InBlockCopySrcDataPerRead_B = 1; + constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; + + using WeiBlockCopySubLengths_E_K = Sequence<4, 2>; + using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #elif 0 diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index df807391..53573004 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -50,7 +50,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 1 +#if 0 + // BlockSize = 256, EPerBlock = 8 constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 128; @@ -85,7 +86,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; constexpr index_t OutThreadCopyDataPerAccess_B = 1; -#elif 1 +#elif 0 + // BlockSize = 256, EPerBlock = 8 // 1x1 filter, 8x8 image constexpr index_t BlockSize = 256; @@ -121,7 +123,44 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; constexpr index_t OutThreadCopyDataPerAccess_B = 4; -#elif 0 +#elif 1 + // BlockSize = 256, EPerBlock = 16 + // 1x1 filter, 8x8 image + constexpr index_t BlockSize = 256; + + constexpr index_t BPerBlock = 128; + constexpr index_t KPerBlock = 128; + constexpr index_t EPerBlock = 16; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_B = Sequence<2, 4>; + using InBlockCopyClusterLengths_E_B = Sequence<8, 32>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B] + using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B] + using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B] + + constexpr index_t InBlockCopyDataPerAccess_B = 4; + + using WeiBlockCopySubLengths_E_K = Sequence<4, 2>; + using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; + + constexpr index_t OutThreadCopyDataPerAccess_B = 4; +#elif 1 // 1x1 filter, 14x14 
image constexpr index_t BlockSize = 256; @@ -167,47 +206,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); constexpr auto gridwise_conv = -#if 0 - GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded -#else - GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer -#endif - {}; + GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer< + GridSize, + BlockSize, + T, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + BPerBlock, + KPerBlock, + EPerBlock, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB, + InBlockCopySubLengths_E_B, + InBlockCopyClusterLengths_E_B, + InBlockCopyThreadClusterArrangeOrder, + InBlockCopySrcAccessOrder, + InBlockCopyDstAccessOrder, + InBlockCopyDataPerAccess_B, + WeiBlockCopySubLengths_E_K, + WeiBlockCopyClusterLengths_E_K, + WeiBlockCopyThreadClusterArrangeOrder, + WeiBlockCopySrcAccessOrder, + WeiBlockCopyDstAccessOrder, + WeiBlockCopySrcDataPerRead_E, + WeiBlockCopyDstDataPerWrite_K, + OutThreadCopyDataPerAccess_B>{}; for(index_t i = 0; i < nrepeat; ++i) { diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 720d5920..ccbc81a7 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -76,20 +76,20 @@ int main(int argc, char* argv[]) { using namespace ck; -#if 0 +#if 1 constexpr index_t N = 128; - constexpr index_t C = 128; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 128; + constexpr index_t C = 1024; + constexpr index_t HI = 14; + constexpr index_t WI = 14; + constexpr index_t K = 256; constexpr index_t Y = 1; - constexpr index_t X = 7; + constexpr index_t X = 1; using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - using LeftPads = Sequence<0, 3>; - using RightPads = Sequence<0, 3>; + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; #elif 0 // 3x3, 34x34 constexpr index_t N = 64; @@ -105,7 +105,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 64; @@ -492,7 +492,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, From 72d5b7993ec3815fbb4e86bab9bcb5b60d5c0af5 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 21 Nov 2019 15:40:04 -0600 Subject: [PATCH 17/23] fix host bug for bwd data --- driver/include/host_conv_bwd_data.hpp | 36 ++++++++++++++++----------- driver/src/conv_bwd_data_driver.cpp | 36 +++++++++++++-------------- script/compile-hip.sh | 2 +- script/docker-cuda.sh | 3 +++ script/ds_read_offset.sh | 12 --------- 5 files changed, 43 insertions(+), 46 deletions(-) create mode 100755 script/docker-cuda.sh delete mode 100755 script/ds_read_offset.sh diff --git a/driver/include/host_conv_bwd_data.hpp b/driver/include/host_conv_bwd_data.hpp index fa6df727..ce0fb789 100644 --- a/driver/include/host_conv_bwd_data.hpp +++ b/driver/include/host_conv_bwd_data.hpp @@ -8,13 +8,13 @@ template -void host_direct_convolution_bwd_data(Tensor& in_nchw, - const Tensor& wei_kcyx, - const Tensor& out_nkhw, - 
ConvStrides, - ConvDilations, - LeftPads, - RightPads) +void host_direct_convolution_backward_data(Tensor& in_nchw, + const Tensor& wei_kcyx, + const Tensor& out_nkhw, + ConvStrides, + ConvDilations, + LeftPads, + RightPads) { using namespace ck; @@ -37,21 +37,27 @@ void host_direct_convolution_bwd_data(Tensor& in_nchw, { int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0]; - if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0) + if(h_tmp % ConvStrides{}[0] == 0) { int ho = h_tmp / ConvStrides{}[0]; - for(int x = 0; x < X; ++x) + if(ho >= 0 && ho < HO) { - int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1]; - - if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0) + for(int x = 0; x < X; ++x) { - int wo = w_tmp / ConvStrides{}[1]; + int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1]; - for(int k = 0; k < K; ++k) + if(w_tmp % ConvStrides{}[1] == 0) { - v += out_nkhw(n, k, ho, wo) * wei_kcyx(k, c, y, x); + int wo = w_tmp / ConvStrides{}[1]; + + if(wo >= 0 && wo < WO) + { + for(int k = 0; k < K; ++k) + { + v += out_nkhw(n, k, ho, wo) * wei_kcyx(k, c, y, x); + } + } } } } diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/src/conv_bwd_data_driver.cpp index 1e54401e..679fd7eb 100644 --- a/driver/src/conv_bwd_data_driver.cpp +++ b/driver/src/conv_bwd_data_driver.cpp @@ -21,15 +21,15 @@ int main(int argc, char* argv[]) using namespace ck; #if 0 - constexpr index_t N = 128; - constexpr index_t C = 256; - constexpr index_t HI = 35; - constexpr index_t WI = 35; - constexpr index_t K = 384; - constexpr index_t Y = 3; - constexpr index_t X = 3; + constexpr index_t N = 4; + constexpr index_t C = 8; + constexpr index_t HI = 11; + constexpr index_t WI = 11; + constexpr index_t K = 8; + constexpr index_t Y = 4; + constexpr index_t X = 4; - using ConvStrides = Sequence<2, 2>; + using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; using LeftPads = Sequence<0, 0>; @@ -49,7 +49,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 64; @@ -241,7 +241,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -287,7 +287,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 0 +#elif 1 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -364,13 +364,13 @@ int main(int argc, char* argv[]) if(do_verification) { - host_direct_convolution_bwd_data(in_nchw_host, - wei_kcyx, - out_nkhw, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}); + host_direct_convolution_backward_data(in_nchw_host, + wei_kcyx, + out_nkhw, + ConvStrides{}, + ConvDilations{}, + LeftPads{}, + RightPads{}); check_error(in_nchw_host, in_nchw_device); diff --git a/script/compile-hip.sh b/script/compile-hip.sh index bae4d677..0aebc1dd 100755 --- a/script/compile-hip.sh +++ b/script/compile-hip.sh @@ -4,5 +4,5 @@ export KMDUMPLLVM=1 export KMDUMPDIR=$PWD - make -j driver + make -j $1 #/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm diff --git a/script/docker-cuda.sh b/script/docker-cuda.sh new file mode 100755 index 00000000..508774b6 --- 
/dev/null +++ b/script/docker-cuda.sh @@ -0,0 +1,3 @@ +WORKSPACE=$1 +echo "workspace: " $WORKSPACE +sudo docker run -it -v $WORKSPACE:/root/workspace --group-add sudo --runtime=nvidia asroy/cuda:10.1-cudnn7-devel-ubuntu18.04-latest /bin/bash diff --git a/script/ds_read_offset.sh b/script/ds_read_offset.sh deleted file mode 100755 index 22e756bf..00000000 --- a/script/ds_read_offset.sh +++ /dev/null @@ -1,12 +0,0 @@ -for((i=0;i<=4096;i=i+64)) -do - OFFSET=$i - echo "if(offset == $OFFSET)" - echo "{" - echo " asm volatile(\"\\n \\" - echo " ds_read_b128 %0, %1 offset:$OFFSET\n \\" - echo " \"" - echo " : \"=v\"(r)" - echo " : \"v\"(__to_local(lds)));" - echo "}" -done From bec35fbc5a5d8123dd10579bb51558a1ba97a886 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 21 Nov 2019 15:49:18 -0600 Subject: [PATCH 18/23] rename --- ...plicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp} | 6 +++--- ...plicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp} | 6 +++--- ...n_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp} | 6 +++--- ...n_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp} | 6 +++--- driver/src/conv_bwd_data_driver.cpp | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) rename composable_kernel/include/kernel_algorithm/{gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp => gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp} (99%) rename composable_kernel/include/kernel_algorithm/{gridwise_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer.hpp => gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp} (99%) rename driver/include/{device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp => device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp} (96%) rename driver/include/{device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp => device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp} (97%) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp similarity index 99% rename from composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp rename to composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 82b4e086..94487a4b 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -1,5 +1,5 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" #include "tensor_descriptor.hpp" @@ -41,7 +41,7 @@ template -struct 
GridwiseConvolutionBackwardDataImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer +struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer { __device__ void Run(Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp similarity index 99% rename from composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer.hpp rename to composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp index 9589505f..7cf94266 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -1,5 +1,5 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V4R5_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V4R5_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1R2_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1R2_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" #include "tensor_descriptor.hpp" @@ -43,7 +43,7 @@ template -struct GridwiseConvolutionBackwardDataImplicitGemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer +struct GridwiseConvolutionBackwardDataImplicitGemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer { __device__ void Run(Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, diff --git a/driver/include/device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp similarity index 96% rename from driver/include/device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp rename to driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp index 37fd847c..8527a786 100644 --- a/driver/include/device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_operation_wrapper.hpp" -#include "gridwise_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp" +#include "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp" template -void device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc in_nchw_desc, +void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc in_nchw_desc, Tensor& in_nchw, WeiDesc wei_kcyx_desc, const Tensor& wei_kcyx, @@ -85,7 +85,7 @@ void device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc i printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); constexpr auto gridwise_conv = - GridwiseConvolutionBackwardDataImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer< + GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer< 
GridSize, BlockSize, T, diff --git a/driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp similarity index 97% rename from driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp rename to driver/include/device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp index 86631baa..e443a2b7 100644 --- a/driver/include/device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_operation_wrapper.hpp" -#include "gridwise_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer.hpp" +#include "gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp" template -void device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw(InDesc in_nchw_desc, +void device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw(InDesc in_nchw_desc, Tensor& in_nchw, WeiDesc wei_kcyx_desc, const Tensor& wei_kcyx, @@ -93,7 +93,7 @@ void device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw(InDesc i printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); constexpr auto gridwise_conv = - GridwiseConvolutionBackwardDataImplicitGemm_v4r5_nchw_kcyx_nkhw_lds_double_buffer< + GridwiseConvolutionBackwardDataImplicitGemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer< GridSize, BlockSize, T, diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/src/conv_bwd_data_driver.cpp index 679fd7eb..ee1e125c 100644 --- a/driver/src/conv_bwd_data_driver.cpp +++ b/driver/src/conv_bwd_data_driver.cpp @@ -13,8 +13,8 @@ #include "device_tensor.hpp" #include "conv_common.hpp" #include "host_conv_bwd_data.hpp" -#include "device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" -#include "device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp" +#include "device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp" +#include "device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp" int main(int argc, char* argv[]) { @@ -346,9 +346,9 @@ int main(int argc, char* argv[]) } #if 0 - device_convolution_backward_data_implicit_gemm_v4r4_nchw_kcyx_nkhw + device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw #else - device_convolution_backward_data_implicit_gemm_v4r5_nchw_kcyx_nkhw + device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw #endif (in_nchw_desc, in_nchw_device, From ecd3240ba1c760114af4f8203aea97fa9457dcfb Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 30 Nov 2019 00:53:38 -0600 Subject: [PATCH 19/23] refactor --- ...data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp | 157 +++++++ ..._v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 401 ------------------ .../tensor_operation/gridwise_gemm.hpp | 330 ++++++++++++++ driver/src/conv_bwd_data_driver.cpp | 2 +- 4 files changed, 488 insertions(+), 402 deletions(-) create mode 100644 composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp delete mode 100644 composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp create mode 100644 composable_kernel/include/tensor_operation/gridwise_gemm.hpp diff --git 
a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp new file mode 100644 index 00000000..f2325aab --- /dev/null +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp @@ -0,0 +1,157 @@ +#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1R1_NCHW_KCYX_NKHW_HPP +#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1R1_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "gridwise_gemm.hpp" + +namespace ck { + +template +struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw +{ + __device__ void Run(Float* __restrict__ p_in_global, + const Float* __restrict__ p_wei_global, + const Float* __restrict__ p_out_global) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto True = integral_constant{}; + + constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; + + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLengths()[0]; + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLengths()[1]; + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2]; + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3]; + + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLengths()[1]; + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2]; + constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLengths()[3]; + + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; + constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; + + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + constexpr index_t E = C * Y * X; + constexpr index_t B = N * Ho * Wo; + + // sanity-check for vectorized memory load + static_assert((Wo == 1 || (ConvStrideW == 1 || InThreadCopyDataPerAccess_B == 1)) && + (X == 1 || ConvDilationW % InThreadCopyDataPerAccess_B == 0), + "wrong! 
alignment requirement for vectorized global load of input tensor will " + "be violated"); + + // output tensor + constexpr auto out_n_k_howo_global_desc = + unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3); + + constexpr auto out_k_b_global_desc = + transform_tensor_descriptor(out_n_k_howo_global_desc, + make_tuple(PassThrough{}, Merge>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // weight tensor + constexpr auto wei_k_e_global_desc = + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3); + + // input tensor + constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple( + PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + + constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Embed, Sequence>{}, + Embed, Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + constexpr auto in_e_b_global_desc = transform_tensor_descriptor( + in_n_c_y_ho_x_wo_global_desc, + make_tuple(Merge>{}, Merge>{}), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // GEMM: atomic add + constexpr auto gridwise_gemm = + GridwiseGemmTransposedANormalBNormalC_v1r1{}; + + gridwise_gemm.Run(p_wei_global, p_out_global, p_in_global); + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp deleted file mode 100644 index 94487a4b..00000000 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ /dev/null @@ -1,401 +0,0 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_BACKWARD_DATA_IMPLICIT_GEMM_V1R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP - -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy.hpp" -#include "threadwise_generic_tensor_slice_copy.hpp" -#include "blockwise_gemm.hpp" - -namespace ck { - -template -struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer -{ - __device__ void Run(Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - const Float* const __restrict__ p_out_global) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto True = integral_constant{}; - - constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; - constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; - constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; - - constexpr index_t N = in_n_c_hi_wi_global_desc.GetLengths()[0]; - constexpr index_t C = in_n_c_hi_wi_global_desc.GetLengths()[1]; - constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2]; - constexpr
index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3]; - - constexpr index_t K = out_n_k_ho_wo_global_desc.GetLengths()[1]; - constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2]; - constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLengths()[3]; - - constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; - constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; - - constexpr index_t ConvStrideH = ConvStrides{}[0]; - constexpr index_t ConvStrideW = ConvStrides{}[1]; - - constexpr index_t ConvDilationH = ConvDilations{}[0]; - constexpr index_t ConvDilationW = ConvDilations{}[1]; - - constexpr index_t E = C * Y * X; - constexpr index_t B = N * Ho * Wo; - - // sanity-check for vectorized memory load - static_assert((Wo == 1 || (ConvStrideW == 1 || InThreadCopyDataPerAccess_B == 1)) && - (X == 1 || ConvDilationW % InThreadCopyDataPerAccess_B == 0), - "wrong! aligment requirement for vectorized global load of input tensor will " - "be violated"); - - // lds max alignment - constexpr index_t max_lds_align = math::lcm(WeiBlockCopyDataPerAccess_E, - OutBlockCopyDataPerAccess_B, - GemmDataPerReadA, - GemmDataPerReadB); - - // divide block work by [K, B] - static_assert(E % EPerBlock == 0 && B % BPerBlock == 0 && K % KPerBlock == 0, - "wrong! cannot divide work evenly among block"); - - constexpr index_t EBlockWork = E / EPerBlock; - constexpr index_t BBlockWork = B / BPerBlock; - - constexpr auto block_work_desc = - make_cluster_descriptor(Sequence{}); - - const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - - const index_t e_block_data_on_global = block_work_id[0] * EPerBlock; - const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; - - // output tensor - // global tensor in global memory - constexpr auto out_n_k_howo_global_desc = - unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3); - - // global tensor in global memory, src of blockwise copy - constexpr auto out_k_b_global_desc = - transform_tensor_descriptor(out_n_k_howo_global_desc, - make_tuple(PassThrough{}, Merge>{}), - make_tuple(Sequence<1>{}, Sequence<0, 2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - // block tensor in LDS memory, dst of blockwise copy - // be careful of LDS alignment - constexpr auto out_k_b_block_desc = make_native_tensor_descriptor_aligned( - Sequence{}, Number{}); - - // input tensor blockwise copy - auto blockwise_out_copy = - BlockwiseGenericTensorSliceCopy_v4, - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - OutBlockCopyDataPerAccess_B, - OutBlockCopyDataPerAccess_B, - AddressSpace::global, - AddressSpace::vgpr, - AddressSpace::lds, - InMemoryDataOperation::none>( - {0, b_block_data_on_global}, {0, 0}); - - // weight tensor - // global tensor in global memory, src of blockwise copy - // It is constructed differently, depending on whether forward or backward weight - // convolution - constexpr auto wei_k_e_global_desc = - unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3); - - // block tensor in LDS memory, dst of blockwise copy - // be careful of LDS alignment - constexpr auto wei_k_e_block_desc = make_native_tensor_descriptor_aligned( - Sequence{}, Number{}); - - // weight tensor blockwise copy - auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v4, - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - WeiBlockCopyDataPerAccess_E, - WeiBlockCopyDataPerAccess_E, - AddressSpace::global, - AddressSpace::vgpr, - AddressSpace::lds, - InMemoryDataOperation::none>( - {0, e_block_data_on_global}, {0, 0}); - - // GEMM 
definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[KPerBlock, EPerBlock] is in LDS - // b_mtx[KPerBlocl, BPerBlock] is in LDS - // c_mtx[EPerBlock, BPerBlock] is distributed among threads, and saved in - // register - constexpr auto a_k_e_block_mtx_desc = make_ConstantMatrixDescriptor(wei_k_e_block_desc); - constexpr auto b_k_b_block_mtx_desc = make_ConstantMatrixDescriptor(out_k_b_block_desc); - - // sanity check - static_assert( - EPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == 0 && - BPerBlock % (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0, - "wrong!"); - - constexpr index_t GemmMRepeat = - EPerBlock / (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster); - - constexpr index_t GemmNRepeat = - BPerBlock / (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster); - - // c_thread_mtx definition: this is a mess - // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_e0e1_b0b1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( - Number{}, Number{}); - - const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< - BlockSize, - decltype(a_k_e_block_mtx_desc), - decltype(b_k_b_block_mtx_desc), - decltype(c_e0e1_b0b1_thread_mtx_desc), - GemmMPerThreadSubC, - GemmNPerThreadSubC, - GemmMLevel0Cluster, - GemmNLevel0Cluster, - GemmMLevel1Cluster, - GemmNLevel1Cluster, - GemmKPerThreadLoop, - GemmDataPerReadA, - GemmDataPerReadB>{}; - - // LDS allocation for input and weight: be careful of alignment - constexpr index_t out_block_space = - math::integer_least_multiple(out_k_b_block_desc.GetElementSpace(), max_lds_align); - - constexpr index_t wei_block_space = - math::integer_least_multiple(wei_k_e_block_desc.GetElementSpace(), max_lds_align); - - __shared__ Float p_out_block_double[2 * out_block_space]; - __shared__ Float p_wei_block_double[2 * wei_block_space]; - - // register allocation for output - AccDataType p_in_thread[c_e0e1_b0b1_thread_mtx_desc.GetElementSpace()]; - - // zero out threadwise output - threadwise_matrix_set_zero(c_e0e1_b0b1_thread_mtx_desc, p_in_thread); - - // LDS double buffer: preload data into LDS - { - blockwise_out_copy.Run(p_out_global, p_out_block_double); - blockwise_wei_copy.Run(p_wei_global, p_wei_block_double); - } - - // LDS double buffer: main body - for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K; - k_block_data_begin += 2 * KPerBlock) - { -#pragma unroll - for(index_t iloop = 0; iloop < 2; ++iloop) - { - const bool even_loop = (iloop % 2 == 0); - - Float* p_out_block_now = - even_loop ? p_out_block_double : p_out_block_double + out_block_space; - Float* p_wei_block_now = - even_loop ? p_wei_block_double : p_wei_block_double + wei_block_space; - - Float* p_out_block_next = - even_loop ? p_out_block_double + out_block_space : p_out_block_double; - Float* p_wei_block_next = - even_loop ? 
p_wei_block_double + wei_block_space : p_wei_block_double; - - Float p_out_thread_buffer[blockwise_out_copy.GetThreadBufferSize()]; - Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; - - blockwise_out_copy.MoveSrcSliceWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); - - __syncthreads(); - - // LDS doubel buffer: load next data from device mem - blockwise_out_copy.RunLoadThreadBuffer(p_out_global, p_out_thread_buffer); - blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer); - - // LDS double buffer: GEMM on current data - blockwise_gemm.Run(p_wei_block_now, p_out_block_now, p_in_thread); - - // LDS double buffer: store next data to LDS - blockwise_out_copy.RunStoreThreadBuffer(p_out_thread_buffer, p_out_block_next); - blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next); - } - } - - // LDS double buffer: tail - { - constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0); - - if(has_two_iteration_left) // if has 2 iteration left - { - Float p_out_thread_buffer[blockwise_out_copy.GetThreadBufferSize()]; - Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; - - blockwise_out_copy.MoveSrcSliceWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); - - __syncthreads(); - - // LDS double buffer: load last data from device mem - blockwise_out_copy.RunLoadThreadBuffer(p_out_global, p_out_thread_buffer); - blockwise_wei_copy.RunLoadThreadBuffer(p_wei_global, p_wei_thread_buffer); - - // LDS double buffer: GEMM on 2nd-last data - blockwise_gemm.Run(p_wei_block_double, p_out_block_double, p_in_thread); - - // LDS double buffer: store last data to LDS - blockwise_out_copy.RunStoreThreadBuffer(p_out_thread_buffer, - p_out_block_double + out_block_space); - blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, - p_wei_block_double + wei_block_space); - - __syncthreads(); - - // LDS double buffer: GEMM on last data - blockwise_gemm.Run(p_wei_block_double + wei_block_space, - p_out_block_double + out_block_space, - p_in_thread); - } - else // if has 1 iteration left - { - __syncthreads(); - - // LDS double buffer: GEMM on last data - blockwise_gemm.Run(p_wei_block_double, p_out_block_double, p_in_thread); - } - } - - // input: register to global memory, atomic add - { - constexpr index_t E1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; - constexpr index_t E0 = E / E1; - - constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t B0 = B / B1; - - // define input tensor descriptor for threadwise copy - // thread input tensor, src of threadwise copy - constexpr auto in_e0_e1_b0_b1_thread_desc = make_native_tensor_descriptor_packed( - Sequence{}); - - // global input tensor, dst of threadwise copy - constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( - in_n_c_hi_wi_global_desc, - make_tuple(PassThrough{}, - PassThrough{}, - Pad, LeftPads, RightPads>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); - - constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( - in_n_c_hip_wip_global_desc, - make_tuple(PassThrough{}, - PassThrough{}, - Embed, Sequence>{}, - Embed, Sequence>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); - - constexpr auto in_e_b_global_desc = 
transform_tensor_descriptor( - in_n_c_y_ho_x_wo_global_desc, - make_tuple(Merge>{}, Merge>{}), - make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - constexpr auto in_e0_e1_b0_b1_global_desc = transform_tensor_descriptor( - in_e_b_global_desc, - make_tuple(UnMerge>{}, UnMerge>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); - - // calculate origin of thread input tensor on global memory - // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = - blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - - const index_t e_thread_data_on_global = - e_block_data_on_global + c_thread_mtx_on_block.row; - - const index_t b_thread_data_on_global = - b_block_data_on_global + c_thread_mtx_on_block.col; - - ThreadwiseGenericTensorSliceCopy_v4r2, - 3, - InThreadCopyDataPerAccess_B, - InThreadCopyDataPerAccess_B, - AddressSpace::vgpr, - AddressSpace::global, - InMemoryDataOperation::atomic_add>( - {0, 0, 0, 0}, - {e_thread_data_on_global / E1, - e_thread_data_on_global % E1, - b_thread_data_on_global / B1, - b_thread_data_on_global % B1}) - .Run(p_in_thread, p_in_global); - } - } -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_gemm.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm.hpp new file mode 100644 index 00000000..6727b36d --- /dev/null +++ b/composable_kernel/include/tensor_operation/gridwise_gemm.hpp @@ -0,0 +1,330 @@ +#ifndef CK_GRIDWISE_GEMM_HPP +#define CK_GRIDWISE_GEMM_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "ConstantMatrixDescriptor.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" +#include "blockwise_gemm.hpp" + +namespace ck { + +template +struct GridwiseGemmTransposedANormalBNormalC_v1r1 +{ + __device__ void Run(const Float* __restrict__ p_a_global, + const Float* __restrict__ p_b_global, + Float* __restrict__ p_c_global) const + { + constexpr auto True = integral_constant{}; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto a_k_m_global_desc = AGlobalDesc{}; + constexpr auto b_k_n_global_desc = BGlobalDesc{}; + constexpr auto c_m_n_global_desc = CGlobalDesc{}; + + constexpr auto K = a_k_m_global_desc.GetLength(I0); + constexpr auto M = a_k_m_global_desc.GetLength(I1); + constexpr auto N = b_k_n_global_desc.GetLength(I1); + + // lds max alignment + constexpr index_t max_lds_align = math::lcm(ABlockCopyDataPerAccess_M, + BBlockCopyDataPerAccess_N, + ThreadGemmDataPerReadM, + ThreadGemmDataPerReadN); + + // divide block work by [M, N] + static_assert(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0, + "wrong! 
cannot divide work evenly among blocks"); + + constexpr index_t MBlockWork = M / MPerBlock; + constexpr index_t NBlockWork = N / NPerBlock; + + constexpr auto block_work_desc = + make_cluster_descriptor(Sequence{}); + + const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); + + const index_t m_block_data_on_global = block_work_id[0] * MPerBlock; + const index_t n_block_data_on_global = block_work_id[1] * NPerBlock; + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_k_m_block_desc = make_native_tensor_descriptor_aligned( + Sequence{}, Number{}); + + // A matrix blockwise copy + auto a_blockwise_copy = + BlockwiseGenericTensorSliceCopy_v4, + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + ABlockCopyDataPerAccess_M, + ABlockCopyDataPerAccess_M, + AddressSpace::global, + AddressSpace::vgpr, + AddressSpace::lds, + InMemoryDataOperation::none>( + {0, m_block_data_on_global}, {0, 0}); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_k_n_block_desc = make_native_tensor_descriptor_aligned( + Sequence{}, Number{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + BlockwiseGenericTensorSliceCopy_v4, + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + BBlockCopyDataPerAccess_N, + BBlockCopyDataPerAccess_N, + AddressSpace::global, + AddressSpace::vgpr, + AddressSpace::lds, + InMemoryDataOperation::none>( + {0, n_block_data_on_global}, {0, 0}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[KPerBlock, MPerBlock] is in LDS + // b_mtx[KPerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + constexpr auto a_k_m_block_mtx_desc = make_ConstantMatrixDescriptor(a_k_m_block_desc); + constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor(b_k_n_block_desc); + + // sanity check + static_assert(MPerBlock % (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster) == 0 && + NPerBlock % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0, + "wrong!"); + + constexpr index_t GemmMRepeat = + MPerBlock / (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster); + + constexpr index_t GemmNRepeat = + NPerBlock / (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster); + + // c_thread_mtx definition: this is a mess + // TODO: more elegant way of defining c_thread_mtx + constexpr auto c_m0m1_n0n1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( + Number{}, Number{}); + + const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< + BlockSize, + decltype(a_k_m_block_mtx_desc), + decltype(b_k_n_block_mtx_desc), + decltype(c_m0m1_n0n1_thread_mtx_desc), + MPerThreadSubC, + NPerThreadSubC, + MLevel0Cluster, + NLevel0Cluster, + MLevel1Cluster, + NLevel1Cluster, + KPerThreadLoop, + ThreadGemmDataPerReadM, + ThreadGemmDataPerReadN>{}; + + // LDS allocation for A and B: be careful of alignment + constexpr index_t a_block_space = + math::integer_least_multiple(a_k_m_block_desc.GetElementSpace(), max_lds_align); + + constexpr index_t b_block_space = + math::integer_least_multiple(b_k_n_block_desc.GetElementSpace(), max_lds_align); + + __shared__ Float p_a_block_double[2 * a_block_space]; + __shared__ Float p_b_block_double[2 * b_block_space]; + + // register allocation for output + AccFloat p_c_thread[c_m0m1_n0n1_thread_mtx_desc.GetElementSpace()]; + + // zero out threadwise output + threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread); + + // LDS double
buffer: preload data into LDS + { + a_blockwise_copy.Run(p_a_global, p_a_block_double); + b_blockwise_copy.Run(p_b_global, p_b_block_double); + } + + // LDS double buffer: main body + for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K; + k_block_data_begin += 2 * KPerBlock) + { +#pragma unroll + for(index_t iloop = 0; iloop < 2; ++iloop) + { + const bool even_loop = (iloop % 2 == 0); + + Float* p_a_block_now = + even_loop ? p_a_block_double : p_a_block_double + a_block_space; + Float* p_b_block_now = + even_loop ? p_b_block_double : p_b_block_double + b_block_space; + + Float* p_a_block_next = + even_loop ? p_a_block_double + a_block_space : p_a_block_double; + Float* p_b_block_next = + even_loop ? p_b_block_double + b_block_space : p_b_block_double; + + Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; + Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; + + a_blockwise_copy.MoveSrcSliceWindow(Sequence{}, True); + b_blockwise_copy.MoveSrcSliceWindow(Sequence{}, True); + + __syncthreads(); + + // LDS double buffer: load next data from device mem + a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); + b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(p_a_block_now, p_b_block_now, p_c_thread); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, p_a_block_next); + b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block_next); + } + } + + // LDS double buffer: tail + { + constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0); + + if(has_two_iteration_left) // if has 2 iteration left + { + Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; + Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; + + a_blockwise_copy.MoveSrcSliceWindow(Sequence{}, True); + b_blockwise_copy.MoveSrcSliceWindow(Sequence{}, True); + + __syncthreads(); + + // LDS double buffer: load last data from device mem + a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); + b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread); + + // LDS double buffer: store last data to LDS + a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, + p_a_block_double + a_block_space); + b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, + p_b_block_double + b_block_space); + + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + p_a_block_double + a_block_space, p_b_block_double + b_block_space, p_c_thread); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread); + } + } + + // C matrix: register to global memory + { + constexpr index_t M1 = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster; + constexpr index_t M0 = M / M1; + + constexpr index_t N1 = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; + constexpr index_t N0 = N / N1; + + // define C tensor descriptors for threadwise copy + // thread C tensor, src of threadwise copy + constexpr auto c_m0_m1_n0_n1_thread_desc = make_native_tensor_descriptor_packed( + Sequence{}); + + constexpr auto c_m0_m1_n0_n1_global_desc = transform_tensor_descriptor( + c_m_n_global_desc, + make_tuple(UnMerge>{}, UnMerge>{}), +
make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + // calculate origin of thread input tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const index_t m_thread_data_on_global = + m_block_data_on_global + c_thread_mtx_on_block.row; + + const index_t n_thread_data_on_global = + n_block_data_on_global + c_thread_mtx_on_block.col; + + ThreadwiseGenericTensorSliceCopy_v4r2, + 3, + CThreadCopyDataPerAccess_N, + CThreadCopyDataPerAccess_N, + AddressSpace::vgpr, + AddressSpace::global, + CGlobalMemoryDataOperation>( + {0, 0, 0, 0}, + {m_thread_data_on_global / M1, + m_thread_data_on_global % M1, + n_thread_data_on_global / N1, + n_thread_data_on_global % N1}) + .Run(p_c_thread, p_c_global); + } + } +}; + +} // namespace ck +#endif diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/src/conv_bwd_data_driver.cpp index ee1e125c..3d828698 100644 --- a/driver/src/conv_bwd_data_driver.cpp +++ b/driver/src/conv_bwd_data_driver.cpp @@ -49,7 +49,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 64; From cfff66cd08200ca537caa870a72ce9c60ae1ffba Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 30 Nov 2019 00:56:15 -0600 Subject: [PATCH 20/23] refactor --- ...data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp | 65 +++++++++---------- driver/src/conv_bwd_data_driver.cpp | 2 +- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp index 8527a786..a287a27e 100644 --- a/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_operation_wrapper.hpp" -#include "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw_lds_double_buffer.hpp" +#include "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp" template {}; + constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw< + GridSize, + BlockSize, + T, + T, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + BPerBlock, + EPerBlock, + KPerBlock, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB, + OutBlockCopySubLengths_K_B, + OutBlockCopyClusterLengths_K_B, + OutBlockCopyDataPerAccess_B, + WeiBlockCopySubLengths_K_E, + WeiBlockCopyClusterLengths_K_E, + WeiBlockCopyDataPerAccess_E, + InThreadCopyDataPerAccess_B>{}; for(index_t i = 0; i < nrepeat; ++i) { diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/src/conv_bwd_data_driver.cpp index 3d828698..6d0ab98b 100644 --- a/driver/src/conv_bwd_data_driver.cpp +++ b/driver/src/conv_bwd_data_driver.cpp @@ -345,7 +345,7 @@ int main(int argc, char* argv[]) #endif } -#if 0 +#if 1 device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw #else 
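// ============================================================================
// [Editor's note: illustrative, self-contained sketch; not part of the patch
// series. Function and variable names here are hypothetical.]
//
// The v1r1 driver selected above lowers backward data to a transposed-A GEMM:
// with A = the weight viewed as a K x E matrix (E = C*Y*X) and B = the output
// gradient viewed as a K x B matrix (B = N*Ho*Wo), the gridwise kernel
// computes C = transpose(A) * B and scatters C, which is the im2col view of
// the input gradient, back to NCHW using atomic adds. The contraction itself,
// as a naive host-side reference:
#include <vector>

void gemm_transposed_a(const std::vector<float>& wei_ke, // K x E, row-major
                       const std::vector<float>& out_kb, // K x B, row-major
                       std::vector<float>& in_eb,        // E x B, accumulated into
                       int K, int E, int B)
{
    for(int e = 0; e < E; ++e)
        for(int b = 0; b < B; ++b)
        {
            float v = 0;
            for(int k = 0; k < K; ++k)
                v += wei_ke[k * E + e] * out_kb[k * B + b];
            in_eb[e * B + b] += v; // the GPU kernel performs this add atomically
        }
}
// ============================================================================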
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw From b7992190f40dd5d294ff7a2ba8f5945f78c83ba6 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 2 Dec 2019 10:38:35 -0600 Subject: [PATCH 21/23] adding bwd data v2r1 --- ...data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp | 16 +- ..._v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp | 2 +- ...data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp | 201 ++++++++++++++++++ ...data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp | 81 ++++--- ...data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp | 2 +- ...data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp | 182 ++++++++++++++++ driver/src/conv_bwd_data_driver.cpp | 18 +- 7 files changed, 445 insertions(+), 57 deletions(-) create mode 100644 composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp create mode 100644 driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp index f2325aab..06d41327 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp @@ -19,8 +19,8 @@ template struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw { @@ -139,8 +139,8 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw GemmMLevel1Cluster, GemmNLevel1Cluster, GemmKPerThreadLoop, - GemmDataPerReadA, - GemmDataPerReadB, + GemmThreadGemmDataPerReadM, + GemmThreadGemmDataPerReadN, WeiBlockCopySubLengths_K_E, WeiBlockCopyClusterLengths_K_E, WeiBlockCopyDataPerAccess_E, diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp index 7cf94266..f53cfaa2 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -22,8 +22,8 @@ template +struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw +{ + __device__ void Run(Float* __restrict__ p_in_global, + const Float* __restrict__ p_wei_global, + const Float* __restrict__ p_out_global) const + { + constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; + + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLengths()[0]; + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLengths()[1]; + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2]; + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3]; + + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLengths()[1]; + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2]; + constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLengths()[3]; + + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; + constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; + + constexpr index_t 
ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + // sanity-check for vectorized memory load + static_assert((Wo == 1 || (ConvStrideW == 1 || GemmCThreadCopyDataPerAccess == 1)) && + (X == 1 || ConvDilationW % GemmCThreadCopyDataPerAccess == 0), + "wrong! aligment requirement for vectorized global load of input tensor will " + "be violated"); + + // TODO: this algo support any stride and dilation. But for now, let's fix them to be 1 for + // simplicity + static_assert(ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 && + ConvDilationW == 1, + "wrong! not supported yet"); + + // TODO: these logic are only for stride = 1, dilation = 1 + constexpr index_t Ydot = Y; + constexpr index_t Ytilda = 1; + constexpr index_t Htilda = Ho + Y - 1; + + constexpr index_t Xdot = X; + constexpr index_t Xtilda = 1; + constexpr index_t Wtilda = Wo + X - 1; + + constexpr index_t GemmK = K * Ydot * Xdot; + constexpr index_t GemmM = C * Ytilda * Xtilda; + constexpr index_t GemmN = N * Htilda * Wtilda; + + // weight tensor + constexpr auto wei_k_c_ydot_ytilda_xdot_xtilda_global_desc = transform_tensor_descriptor( + wei_k_c_y_x_global_desc, + make_tuple( + PassThrough{}, + PassThrough{}, + Embed, Sequence<1, 1, 0>>{}, // coefficient may be wrong + Embed, Sequence<1, 1, 0>>{}), // coefficient may be wrong + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + constexpr auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor( + wei_k_c_ydot_ytilda_xdot_xtilda_global_desc, + make_tuple(Merge>{}, Merge>{}), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // output tensor + constexpr auto out_n_k_hop_wop_global_desc = transform_tensor_descriptor( + out_n_k_ho_wo_global_desc, + make_tuple( + PassThrough{}, + PassThrough{}, + Pad, Sequence<0, 0>, Sequence>{}), // coefficient may + // be wrong + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + + constexpr auto out_n_k_ydot_htilda_xdot_wtilda_global_desc = transform_tensor_descriptor( + out_n_k_hop_wop_global_desc, + make_tuple( + PassThrough{}, + PassThrough{}, + Embed, Sequence<0, 1, 0>>{}, // coefficient may be wrong + Embed, Sequence<0, 1, 0>>{}), // coefficient may be wrong + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + constexpr auto out_gemmk_gemmn_global_desc = transform_tensor_descriptor( + out_n_k_ydot_htilda_xdot_wtilda_global_desc, + make_tuple(Merge>{}, Merge>{}), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // input tensor + constexpr auto eff_left_pads = LeftPads{} + Sequence{}; + constexpr auto eff_right_pads = RightPads{} + Sequence{}; + + constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Pad, decltype(eff_left_pads), decltype(eff_right_pads)>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + + constexpr auto in_n_c_ytilda_htilda_xtilda_wtilda_global_desc = transform_tensor_descriptor( + 
in_n_c_hip_wip_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Embed, Sequence>{}, + Embed, Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor( + in_n_c_ytilda_htilda_xtilda_wtilda_global_desc, + make_tuple(Merge>{}, Merge>{}), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // GEMM + constexpr auto gridwise_gemm = + GridwiseGemmTransposedANormalBNormalC_v1r1{}; + + gridwise_gemm.Run(p_wei_global, p_out_global, p_in_global); + } +}; + +} // namespace ck +#endif diff --git a/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp index a287a27e..526abe37 100644 --- a/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp @@ -49,38 +49,37 @@ void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc i // BlockSize = 256, each thread hold 64 data constexpr index_t BlockSize = 256; - constexpr index_t EPerBlock = 128; - constexpr index_t BPerBlock = 128; - constexpr index_t KPerBlock = 8; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 4; - constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 4; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using OutBlockCopySubLengths_K_B = Sequence<4, 1>; - using OutBlockCopyClusterLengths_K_B = Sequence<2, 128>; - - constexpr index_t OutBlockCopyDataPerAccess_B = 1; - - using WeiBlockCopySubLengths_K_E = Sequence<1, 4>; - using WeiBlockCopyClusterLengths_K_E = Sequence<8, 32>; - - constexpr index_t WeiBlockCopyDataPerAccess_E = 4; - - constexpr index_t InThreadCopyDataPerAccess_B = 1; + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 8; + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmThreadGemmDataPerReadM = 4; + constexpr index_t GemmThreadGemmDataPerReadN = 4; + + using GemmABlockCopySubLengths = Sequence<1, 4>; // Gemm-K, Gemm-M + using GemmABlockCopyClusterLengths = Sequence<8, 32>; // Gemm-K, Gemm-M + + constexpr index_t GemmABlockCopyDataPerAccess = 4; // Gemm-M + + using GemmBBlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-N + using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N + + constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N + + constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N #endif - constexpr index_t E = C * Y * X; - constexpr index_t B = (N * Ho * Wo); + constexpr index_t GemmM = C * Y * X; + constexpr index_t GemmN = N * Ho * Wo; - constexpr index_t GridSize = - ((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock); + constexpr index_t GridSize = ((GemmM + GemmMPerBlock - 1) / 
GemmMPerBlock) * + ((GemmN + GemmNPerBlock - 1) / GemmNPerBlock); printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); @@ -96,9 +95,9 @@ void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc i ConvDilations, LeftPads, RightPads, - BPerBlock, - EPerBlock, - KPerBlock, + GemmMPerBlock, + GemmNPerBlock, + GemmKPerBlock, GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, @@ -106,15 +105,15 @@ void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc i GemmMLevel1Cluster, GemmNLevel1Cluster, GemmKPerThreadLoop, - GemmDataPerReadA, - GemmDataPerReadB, - OutBlockCopySubLengths_K_B, - OutBlockCopyClusterLengths_K_B, - OutBlockCopyDataPerAccess_B, - WeiBlockCopySubLengths_K_E, - WeiBlockCopyClusterLengths_K_E, - WeiBlockCopyDataPerAccess_E, - InThreadCopyDataPerAccess_B>{}; + GemmThreadGemmDataPerReadM, + GemmThreadGemmDataPerReadN, + GemmABlockCopySubLengths, + GemmABlockCopyClusterLengths, + GemmABlockCopyDataPerAccess, + GemmBBlockCopySubLengths, + GemmBBlockCopyClusterLengths, + GemmBBlockCopyDataPerAccess, + GemmCThreadCopyDataPerAccess>{}; for(index_t i = 0; i < nrepeat; ++i) { diff --git a/driver/include/device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp index e443a2b7..affd41a0 100644 --- a/driver/include/device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp @@ -105,8 +105,8 @@ void device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw(InDesc i ConvDilations, LeftPads, RightPads, - BPerBlock, EPerBlock, + BPerBlock, KPerBlock, GemmMPerThreadSubC, GemmNPerThreadSubC, diff --git a/driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp new file mode 100644 index 00000000..2074bdc4 --- /dev/null +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp @@ -0,0 +1,182 @@ +#pragma once +#include +#include "device.hpp" +#include "tensor.hpp" +#include "gridwise_operation_wrapper.hpp" +#include "gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp" + +template +void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc in_nchw_desc, + Tensor& in_nchw, + WeiDesc wei_kcyx_desc, + const Tensor& wei_kcyx, + OutDesc out_nkhw_desc, + const Tensor& out_nkhw, + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + std::size_t nrepeat) +{ + using namespace ck; + + constexpr index_t N = out_nkhw_desc.GetLengths()[0]; + constexpr index_t K = out_nkhw_desc.GetLengths()[1]; + constexpr index_t Ho = out_nkhw_desc.GetLengths()[2]; + constexpr index_t Wo = out_nkhw_desc.GetLengths()[3]; + + constexpr index_t C = wei_kcyx_desc.GetLengths()[1]; + constexpr index_t Y = wei_kcyx_desc.GetLengths()[2]; + constexpr index_t X = wei_kcyx_desc.GetLengths()[3]; + + std::size_t data_sz = sizeof(T); + DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace()); + + in_nchw_device_buf.ToDevice(in_nchw.mData.data()); + wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); + out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); + +#if 1 + // BlockSize = 256, each thread hold 
64 data + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 8; + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmThreadGemmDataPerReadM = 4; + constexpr index_t GemmThreadGemmDataPerReadN = 4; + + using GemmABlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-M + using GemmABlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-M + + constexpr index_t GemmABlockCopyDataPerAccess = 1; // Gemm-M + + using GemmBBlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-N + using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N + + constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N + + constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N +#elif 0 + // BlockSize = 256, each thread hold 64 data + constexpr index_t BlockSize = 256; + + constexpr index_t GemmMPerBlock = 128; + constexpr index_t GemmNPerBlock = 128; + constexpr index_t GemmKPerBlock = 8; + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmThreadGemmDataPerReadM = 4; + constexpr index_t GemmThreadGemmDataPerReadN = 4; + + using GemmABlockCopySubLengths = Sequence<1, 4>; // Gemm-K, Gemm-M + using GemmABlockCopyClusterLengths = Sequence<8, 32>; // Gemm-K, Gemm-M + + constexpr index_t GemmABlockCopyDataPerAccess = 4; // Gemm-M + + using GemmBBlockCopySubLengths = Sequence<4, 1>; // Gemm-K, Gemm-N + using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N + + constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N + + constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N +#endif + + // TODO: this algo support any stride and dilation. 
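These tuning blocks obey a fixed set of invariants: each block-copy tile must be covered exactly by SubLengths times ClusterLengths, the cluster product must equal BlockSize, and the thread-level GEMM decomposition must divide the block tile. Below is a quick compile-time consistency check for the "#if 1" branch above, written as a standalone sketch with the literal values rather than the kernel's template parameters; the k-prefixed names are local to the sketch:

// Values copied from the "#if 1" tuning block above.
constexpr unsigned kBlockSize     = 256;
constexpr unsigned kGemmMPerBlock = 128;
constexpr unsigned kGemmNPerBlock = 128;
constexpr unsigned kGemmKPerBlock = 8;

// GemmBBlockCopySubLengths = Sequence<4, 1>, ClusterLengths = Sequence<2, 128>
// (the A copy in this branch uses the same shape over Gemm-K x Gemm-M):
static_assert(4 * 2   == kGemmKPerBlock, "Gemm-K tile covered exactly");
static_assert(1 * 128 == kGemmNPerBlock, "Gemm-N tile covered exactly");
static_assert(2 * 128 == kBlockSize,     "copy cluster uses the whole workgroup");

// Thread GEMM: 4x4 sub-tile, 4x4 level-0 cluster, 4x4 level-1 cluster, so each
// thread repeats 128 / (4 * 4 * 4) = 2 times in M and in N and therefore owns
// an 8x8 = 64-element C tile -- the "each thread hold 64 data" comment above.
static_assert(kGemmMPerBlock % (4 * 4 * 4) == 0, "M repeat is integral");
static_assert((kGemmMPerBlock / (4 * 4)) * (kGemmNPerBlock / (4 * 4)) == 64,
              "64 accumulators per thread");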
But for now, let's fix them to be 1 for + // simplicity + constexpr index_t Ydot = 1; + constexpr index_t Ytilda = Y; + constexpr index_t Htilda = Ho + Y - 1; + + constexpr index_t Xdot = 1; + constexpr index_t Xtilda = X; + constexpr index_t Wtilda = Wo + X - 1; + + constexpr index_t GemmK = K * Ydot * Xdot; + constexpr index_t GemmM = C * Ytilda * Xtilda; + constexpr index_t GemmN = N * Htilda * Wtilda; + + constexpr index_t GridSize = ((GemmM + GemmMPerBlock - 1) / GemmMPerBlock) * + ((GemmN + GemmNPerBlock - 1) / GemmNPerBlock); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw< + GridSize, + BlockSize, + T, + T, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + GemmMPerBlock, + GemmNPerBlock, + GemmKPerBlock, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmThreadGemmDataPerReadM, + GemmThreadGemmDataPerReadN, + GemmABlockCopySubLengths, + GemmABlockCopyClusterLengths, + GemmABlockCopyDataPerAccess, + GemmBBlockCopySubLengths, + GemmBBlockCopyClusterLengths, + GemmBBlockCopyDataPerAccess, + GemmCThreadCopyDataPerAccess>{}; + + for(index_t i = 0; i < nrepeat; ++i) + { + float time = launch_kernel(run_gridwise_operation, + dim3(GridSize), + dim3(BlockSize), + 0, + gridwise_conv, + const_cast( + static_cast(in_nchw_device_buf.GetDeviceBuffer())), + const_cast( + static_cast(wei_kcyx_device_buf.GetDeviceBuffer())), + const_cast( + static_cast(out_nkhw_device_buf.GetDeviceBuffer()))); + + printf("Elapsed time : %f ms, %f TFlop/s\n", + time, + (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) / + (std::size_t(1000) * 1000 * 1000) / time); + usleep(std::min(time * 1000, float(10000))); + } + + in_nchw_device_buf.FromDevice(in_nchw.mData.data()); +} diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/src/conv_bwd_data_driver.cpp index 6d0ab98b..f677326c 100644 --- a/driver/src/conv_bwd_data_driver.cpp +++ b/driver/src/conv_bwd_data_driver.cpp @@ -15,6 +15,7 @@ #include "host_conv_bwd_data.hpp" #include "device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp" #include "device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw.hpp" +#include "device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp" int main(int argc, char* argv[]) { @@ -34,7 +35,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; @@ -49,7 +50,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 64; @@ -337,18 +338,23 @@ int main(int argc, char* argv[]) if(do_verification) { #if 0 - out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); +#elif 0 + wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); #else - out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 
5}, num_thread); + out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); #endif } -#if 1 +#if 0 device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw -#else +#elif 0 device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw +#else + device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw #endif (in_nchw_desc, in_nchw_device, From 157491ab2518bbacc44bf3bb6c20e7c794287082 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 3 Dec 2019 01:08:50 -0600 Subject: [PATCH 22/23] added bwd data v2r1: no need for atomic --- ...data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp | 90 ++++++++++--------- .../multi_index_transform.hpp | 33 ++----- .../tensor_operation/gridwise_gemm.hpp | 11 +-- composable_kernel/include/utility/math.hpp | 55 ++++++++++-- ...data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp | 25 ++++-- driver/src/conv_bwd_data_driver.cpp | 34 ++----- driver/src/conv_driver.cpp | 6 +- 7 files changed, 140 insertions(+), 114 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp index 5577c0ec..2aab909e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp @@ -8,6 +8,9 @@ namespace ck { +// GemmK = K * Ydot * Xdot; +// GemmM = C * Ytilda * Xtilda; +// GemmN = N * Htilda * Wtilda; template {}, - PassThrough{}, - Embed, Sequence<1, 1, 0>>{}, // coefficient may be wrong - Embed, Sequence<1, 1, 0>>{}), // coefficient may be wrong + make_tuple(PassThrough{}, + PassThrough{}, + Pad, + Sequence<0, 0>, + Sequence, + true>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + + constexpr auto wei_k_c_ydot_ytilda_xdot_xtilda_global_desc = transform_tensor_descriptor( + wei_k_c_yp_xp_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Embed, + Sequence>{}, + Embed, + Sequence>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); @@ -110,23 +121,25 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw make_tuple(Sequence<0>{}, Sequence<1>{})); // output tensor - constexpr auto out_n_k_hop_wop_global_desc = transform_tensor_descriptor( - out_n_k_ho_wo_global_desc, - make_tuple( - PassThrough{}, - PassThrough{}, - Pad, Sequence<0, 0>, Sequence>{}), // coefficient may - // be wrong - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + constexpr auto out_n_k_hop_wop_global_desc = + transform_tensor_descriptor(out_n_k_ho_wo_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Pad, + Sequence<0, 0>, + Sequence, + true>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); constexpr auto out_n_k_ydot_htilda_xdot_wtilda_global_desc = transform_tensor_descriptor( out_n_k_hop_wop_global_desc, - make_tuple( - PassThrough{}, - PassThrough{}, - Embed, Sequence<0, 1, 0>>{}, // coefficient may be wrong - Embed, Sequence<0, 1, 0>>{}), // coefficient may be wrong + make_tuple(PassThrough{}, + PassThrough{}, + Embed, + 
Sequence<-ConvDilationH / hcf_stride_dilation_h, 1, 0>>{}, + Embed, + Sequence<-ConvDilationW / hcf_stride_dilation_w, 1, 0>>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); @@ -137,14 +150,11 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw make_tuple(Sequence<0>{}, Sequence<1>{})); // input tensor - constexpr auto eff_left_pads = LeftPads{} + Sequence{}; - constexpr auto eff_right_pads = RightPads{} + Sequence{}; - constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( in_n_c_hi_wi_global_desc, make_tuple(PassThrough{}, PassThrough{}, - Pad, decltype(eff_left_pads), decltype(eff_right_pads)>{}), + Pad, LeftPads, RightPads, true>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); @@ -160,7 +170,7 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor( in_n_c_ytilda_htilda_xtilda_wtilda_global_desc, make_tuple(Merge>{}, Merge>{}), - make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); // GEMM diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index 103904a9..bd69c402 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -84,35 +84,14 @@ struct Pad __host__ __device__ constexpr bool IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const { -#if 0 - struct lambda_no_pad - { - __host__ __device__ constexpr bool operator()(index_t x) const { return x == 0; } - }; + bool flag = true; - if(sequence_all_of(LeftPads{}, lambda_no_pad{}) && - sequence_all_of(RightPads{}, lambda_no_pad{})) - { - return true; - } - else -#endif - { - bool flag = true; + static_for<0, nDim, 1>{}([&](auto idim) { + flag = flag && (idx_up[idim] >= LeftPads::At(idim)) && + (idx_up[idim] < LeftPads::At(idim) + LowerLengths::At(idim)); + }); - static_for<0, nDim, 1>{}([&](auto idim) { - // only check if there is left-padding - static_if<(LeftPads::At(idim) != 0)>{}( - [&](auto) { flag = flag && idx_up[idim] >= LeftPads::At(idim); }); - - // only check if there is right-padding - static_if<(RightPads::At(idim) != 0)>{}([&](auto) { - flag = flag && (idx_up[idim] < LeftPads::At(idim) + LowerLengths::At(idim)); - }); - }); - - return flag; - } + return flag; } }; diff --git a/composable_kernel/include/tensor_operation/gridwise_gemm.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm.hpp index 6727b36d..2f203f0e 100644 --- a/composable_kernel/include/tensor_operation/gridwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_gemm.hpp @@ -46,18 +46,13 @@ struct GridwiseGemmTransposedANormalBNormalC_v1r1 { constexpr auto True = integral_constant{}; - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto a_k_m_global_desc = AGlobalDesc{}; constexpr auto b_k_n_global_desc = BGlobalDesc{}; constexpr auto c_m_n_global_desc = CGlobalDesc{}; - constexpr auto K = a_k_m_global_desc.GetLength(I0); - constexpr auto M = a_k_m_global_desc.GetLength(I1); - constexpr auto N = 
b_k_n_global_desc.GetLength(I1); + constexpr auto K = a_k_m_global_desc.GetLengths()[0]; + constexpr auto M = a_k_m_global_desc.GetLengths()[1]; + constexpr auto N = b_k_n_global_desc.GetLengths()[1]; // lds max alignment constexpr index_t max_lds_align = math::lcm(ABlockCopyDataPerAccess_M, diff --git a/composable_kernel/include/utility/math.hpp b/composable_kernel/include/utility/math.hpp index ba70e7ab..feb73939 100644 --- a/composable_kernel/include/utility/math.hpp +++ b/composable_kernel/include/utility/math.hpp @@ -97,12 +97,57 @@ __host__ __device__ constexpr T min(T x, Ts... xs) return x < y ? x : y; } -// this is WRONG -// TODO: implement least common multiple properly, instead of calling max() -template -__host__ __device__ constexpr T lcm(T x, Ts... xs) +// highest common factor +template +__host__ __device__ constexpr T hcf(T x, T y) +{ + if(x == 0) + { + return y; + } + + if(y == 0) + { + return x; + } + + if(x == y) + { + return x; + } + + if(x > y) + { + return hcf(x - y, y); + } + + return hcf(x, y - x); +} + +template +__host__ __device__ constexpr auto hcf(Number, Number) +{ + constexpr auto result = hcf(X, Y); + return Number{}; +} + +template +__host__ __device__ constexpr auto hcf(X x, Ys... ys) +{ + return hcf(x, ys...); +} + +// least common multiple +template +__host__ __device__ constexpr T lcm(T x, T y) +{ + return (x * y) / hcf(x, y); +} + +template +__host__ __device__ constexpr auto lcm(X x, Y y, Zs... zs) { - return max(x, xs...); + return lcm(x, lcm(y, zs...)); } template diff --git a/driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp index 2074bdc4..c0b1fe97 100644 --- a/driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp @@ -36,6 +36,12 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i constexpr index_t Y = wei_kcyx_desc.GetLengths()[2]; constexpr index_t X = wei_kcyx_desc.GetLengths()[3]; + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + std::size_t data_sz = sizeof(T); DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace()); DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace()); @@ -105,13 +111,20 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i // TODO: this algo support any stride and dilation. 
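The hcf()/lcm() pair added to math.hpp above replaces the placeholder lcm() that simply returned max(), and hcf feeds directly into the backward-data tiling in this driver: Ytilda = ConvStrideH / hcf(ConvStrideH, ConvDilationH), Ydot = ceil(Y / Ytilda), and Htilda = Ho + (ConvDilationH / hcf) * (Y - Ytilda). A standalone sketch of the same contract, using the remainder form of Euclid's algorithm (the subtraction form in the patch is equivalent but recurses O(x/y) deep for skewed inputs such as hcf(1, 1000000)); hcf_mod/lcm_mod are illustrative names, not the patch's API:

constexpr int hcf_mod(int x, int y) { return y == 0 ? x : hcf_mod(y, x % y); }
constexpr int lcm_mod(int x, int y) { return x / hcf_mod(x, y) * y; }

static_assert(hcf_mod(12, 8) == 4, "");
static_assert(lcm_mod(4, 6) == 12, "old max()-based stub would give 6");

// Worked example for the tiling below: stride 2, dilation 1, Y = 3, Ho = 8
// => hcf = 1, Ytilda = 2 / 1 = 2, Ydot = ceil(3 / 2) = 2,
//    right_pad_ho = (1 / 1) * (3 - 2) = 1, Htilda = 8 + 1 = 9.
static_assert(hcf_mod(2, 1) == 1 && 2 / hcf_mod(2, 1) == 2, "");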
But for now, let's fix them to be 1 for // simplicity - constexpr index_t Ydot = 1; - constexpr index_t Ytilda = Y; - constexpr index_t Htilda = Ho + Y - 1; + constexpr index_t hcf_stride_dilation_h = math::hcf(ConvStrideH, ConvDilationH); + constexpr index_t hcf_stride_dilation_w = math::hcf(ConvStrideW, ConvDilationW); + + constexpr index_t Ytilda = ConvStrideH / hcf_stride_dilation_h; // may be wrong + constexpr index_t Xtilda = ConvStrideW / hcf_stride_dilation_w; // may be wrong + + constexpr index_t Ydot = math::integer_divide_ceil(Y, Ytilda); + constexpr index_t Xdot = math::integer_divide_ceil(X, Xtilda); + + constexpr index_t right_pad_ho = (ConvDilationH / hcf_stride_dilation_h) * (Y - Ytilda); + constexpr index_t right_pad_wo = (ConvDilationW / hcf_stride_dilation_w) * (X - Xtilda); - constexpr index_t Xdot = 1; - constexpr index_t Xtilda = X; - constexpr index_t Wtilda = Wo + X - 1; + constexpr index_t Htilda = Ho + right_pad_ho; + constexpr index_t Wtilda = Wo + right_pad_wo; constexpr index_t GemmK = K * Ydot * Xdot; constexpr index_t GemmM = C * Ytilda * Xtilda; diff --git a/driver/src/conv_bwd_data_driver.cpp b/driver/src/conv_bwd_data_driver.cpp index f677326c..2f0df590 100644 --- a/driver/src/conv_bwd_data_driver.cpp +++ b/driver/src/conv_bwd_data_driver.cpp @@ -22,20 +22,20 @@ int main(int argc, char* argv[]) using namespace ck; #if 0 - constexpr index_t N = 4; - constexpr index_t C = 8; - constexpr index_t HI = 11; - constexpr index_t WI = 11; + constexpr index_t N = 8; + constexpr index_t C = 128; + constexpr index_t HI = 16; + constexpr index_t WI = 16; constexpr index_t K = 8; - constexpr index_t Y = 4; - constexpr index_t X = 4; + constexpr index_t Y = 2; + constexpr index_t X = 2; - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; + using ConvStrides = Sequence<4, 4>; + using ConvDilations = Sequence<2, 2>; using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; @@ -52,7 +52,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 8x8 image - // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 64; constexpr index_t C = 1536; constexpr index_t HI = 8; @@ -68,7 +67,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 8x8 image - // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51% constexpr index_t N = 128; constexpr index_t C = 2048; constexpr index_t HI = 8; @@ -84,7 +82,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 7x7 image - // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64% constexpr index_t N = 128; constexpr index_t C = 832; constexpr index_t HI = 7; @@ -100,7 +97,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 8x8 image - // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65% constexpr index_t N = 128; constexpr index_t C = 1280; constexpr index_t HI = 8; @@ -116,7 +112,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 14x14 image - // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50% constexpr index_t N = 128; constexpr index_t C = 512; constexpr index_t HI = 14; @@ -132,7 +127,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 8x8 image - // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61% constexpr index_t N = 64; constexpr index_t C 
= 1536; constexpr index_t HI = 8; @@ -148,7 +142,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 28x28 image - // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69% constexpr index_t N = 128; constexpr index_t C = 256; constexpr index_t HI = 28; @@ -164,7 +157,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 7x7 image - // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62% constexpr index_t N = 128; constexpr index_t C = 832; constexpr index_t HI = 7; @@ -180,7 +172,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 17x17 input - // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76% constexpr index_t N = 128; constexpr index_t C = 768; constexpr index_t HI = 17; @@ -196,7 +187,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 14x14 image - // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64% constexpr index_t N = 128; constexpr index_t C = 528; constexpr index_t HI = 14; @@ -212,7 +202,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 14x14 image - // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75% constexpr index_t N = 128; constexpr index_t C = 528; constexpr index_t HI = 14; @@ -228,7 +217,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 1x1 filter, 7x7 image - // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52% constexpr index_t N = 128; constexpr index_t C = 832; constexpr index_t HI = 7; @@ -244,7 +232,6 @@ int main(int argc, char* argv[]) using RightPads = Sequence<0, 0>; #elif 0 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output - // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; constexpr index_t C = 288; constexpr index_t HI = 35; @@ -340,9 +327,6 @@ int main(int argc, char* argv[]) #if 0 wei_kcyx.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); -#elif 0 - wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); - out_nkhw.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); #else wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); diff --git a/driver/src/conv_driver.cpp b/driver/src/conv_driver.cpp index 5646ffc2..5f87b738 100644 --- a/driver/src/conv_driver.cpp +++ b/driver/src/conv_driver.cpp @@ -58,7 +58,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 64; @@ -250,7 +250,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -296,7 +296,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; From 8c42225c2ee8976353dbbc2e251412d7094e9fe8 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 3 Dec 2019 01:42:19 -0600 Subject: [PATCH 23/23] minor bug fix --- ...data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp | 27 ++-- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 134 
++++++------------ 2 files changed, 57 insertions(+), 104 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp index 2aab909e..ecd501b2 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp @@ -20,8 +20,8 @@ template {}, Pad, Sequence<0, 0>, - Sequence, - true>{}), + Sequence>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); @@ -121,16 +120,14 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw make_tuple(Sequence<0>{}, Sequence<1>{})); // output tensor - constexpr auto out_n_k_hop_wop_global_desc = - transform_tensor_descriptor(out_n_k_ho_wo_global_desc, - make_tuple(PassThrough{}, - PassThrough{}, - Pad, - Sequence<0, 0>, - Sequence, - true>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + constexpr auto out_n_k_hop_wop_global_desc = transform_tensor_descriptor( + out_n_k_ho_wo_global_desc, + make_tuple( + PassThrough{}, + PassThrough{}, + Pad, Sequence<0, 0>, Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); constexpr auto out_n_k_ydot_htilda_xdot_wtilda_global_desc = transform_tensor_descriptor( out_n_k_hop_wop_global_desc, @@ -154,7 +151,7 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw in_n_c_hi_wi_global_desc, make_tuple(PassThrough{}, PassThrough{}, - Pad, LeftPads, RightPads, true>{}), + Pad, InputLeftPads, InputRightPads>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index d739ebd9..fcf96f39 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -1,19 +1,20 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" -#include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy.hpp" -#include "threadwise_generic_tensor_slice_copy.hpp" -#include "blockwise_gemm.hpp" +#include "gridwise_gemm.hpp" namespace ck { -// B = merge(N, Ho, Wo) + +// GEMM_M = K +// GEMM_N = N * Ho * Wo +// GEMM_K = C * Y * X template struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer { @@ -58,8 +59,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer 
constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto True = integral_constant{}; - constexpr auto in_n_c_hi_wi_global_desc = make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); constexpr auto wei_k_c_y_x_global_desc = @@ -94,23 +93,11 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); - // divide block work by [K, B] - static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % EPerBlock == 0, - "wrong! cannot divide work evenly among block"); - - constexpr index_t KBlockWork = K / KPerBlock; - constexpr index_t BBlockWork = B / BPerBlock; - - constexpr auto block_work_desc = - make_cluster_descriptor(Sequence{}); - - const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - - const index_t k_block_data_on_global = block_work_id[0] * KPerBlock; - const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; + // weight tensor + constexpr auto wei_gemmk_gemmm_global_desc = reorder_tensor_descriptor_given_upper2lower( + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); // input tensor - // global mem constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( in_n_c_hi_wi_global_desc, make_tuple( @@ -127,54 +114,23 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); - constexpr auto in_e_b_global_desc = transform_tensor_descriptor( + constexpr auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor( in_n_c_y_ho_x_wo_global_desc, make_tuple(Merge>{}, Merge>{}), make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - // LDS mem - // be careful of LDS alignment - constexpr auto in_e_b_block_desc = - make_native_tensor_descriptor_packed(Sequence{}); - - // input blockwise copy - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v4( - {0, b_block_data_on_global}, {0, 0}); - - // weight tensor - // global mem - constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( - unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); - // LDS // be careful of LDS alignment constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( - Sequence{}, - Number{}); + Sequence{}, + Number{}); // this check is ad-hoc // TODO: need to properly implement tensor descriptor with multiple alignment // requirements - static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, - "GemmDataPerReadA alignment requirement is not satisfied"); + static_assert(wei_e_k_block_desc.GetStride(I0) % GemmThreadGemmDataPerReadM == 0, + "GemmThreadGemmDataPerReadM alignment requirement is not satisfied"); // weight blockwise copy auto blockwise_wei_copy = @@ -199,24 +155,24 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[EPerBlock, KPerBlock] is in LDS - // b_mtx[EPerBlocl, BPerBlock] is in LDS - // c_mtx[KPerBlock, BPerBlock] is distributed among threads, and saved in + // a_mtx[GemmKPerBlock, GemmMPerBlock] is in LDS + // b_mtx[EPerBlocl, GemmNPerBlock] is in LDS + // c_mtx[GemmMPerBlock, GemmNPerBlock] is distributed among threads, and saved in // register constexpr 
auto a_e_k_block_mtx_desc = make_ConstantMatrixDescriptor(wei_e_k_block_desc); constexpr auto b_e_b_block_mtx_desc = make_ConstantMatrixDescriptor(in_e_b_block_desc); // sanity check static_assert( - KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == 0 && - BPerBlock % (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0, + GemmMPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == 0 && + GemmNPerBlock % (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0, "wrong!"); constexpr index_t GemmMRepeat = - KPerBlock / (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster); + GemmMPerBlock / (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster); constexpr index_t GemmNRepeat = - BPerBlock / (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster); + GemmNPerBlock / (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster); // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx @@ -235,14 +191,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer GemmMLevel1Cluster, GemmNLevel1Cluster, GemmKPerThreadLoop, - GemmDataPerReadA, - GemmDataPerReadB>{}; + GemmThreadGemmDataPerReadM, + GemmThreadGemmDataPerReadN>{}; // LDS allocation for input and weight: be careful of alignment constexpr index_t max_align = math::lcm(InBlockCopyDataPerAccess_B, WeiBlockCopyDstDataPerWrite_K, - GemmDataPerReadA, - GemmDataPerReadB); + GemmThreadGemmDataPerReadM, + GemmThreadGemmDataPerReadN); constexpr index_t in_block_space = math::integer_least_multiple(in_e_b_block_desc.GetElementSpace(), max_align); @@ -266,8 +222,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer } // LDS double buffer: main body - for(index_t e_block_data_begin = 0; e_block_data_begin + 2 * EPerBlock < E; - e_block_data_begin += 2 * EPerBlock) + for(index_t e_block_data_begin = 0; e_block_data_begin + 2 * GemmKPerBlock < E; + e_block_data_begin += 2 * GemmKPerBlock) { #pragma unroll for(index_t iloop = 0; iloop < 2; ++iloop) @@ -287,8 +243,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; - blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); __syncthreads(); @@ -307,15 +263,15 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: tail { - constexpr bool has_two_iteration_left = (E % (2 * EPerBlock) == 0); + constexpr bool has_two_iteration_left = (E % (2 * GemmKPerBlock) == 0); if(has_two_iteration_left) // if has 2 iteration left { Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; - blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); __syncthreads();
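The double-buffered main body above reduces to a ping-pong schedule: while the blockwise GEMM consumes one LDS buffer, the next E-slice is prefetched from global memory into registers and committed to the other buffer, with __syncthreads() separating LDS writes from LDS reads. The following host-side sketch traces that schedule only; every function here is an illustrative stub, not CK API, and real data movement is replaced by prints:

#include <cstdio>

static void load_to_register(int slice) { std::printf("load slice %d\n", slice); }
static void commit_to_lds(int buf)      { std::printf("commit -> lds[%d]\n", buf); }
static void block_gemm_on_lds(int buf)  { std::printf("gemm   <- lds[%d]\n", buf); }
static void barrier()                   { std::printf("__syncthreads()\n"); }

int main()
{
    const int E = 48, EPerBlock = 8; // example: six E-slices of eight
    int slice = 0;

    load_to_register(slice); // prologue: stage and commit slice 0 to buffer 0
    commit_to_lds(0);

    // main body: two slices per trip, ping-ponging buffers, mirroring the
    // `e_block_data_begin + 2 * EPerBlock < E` loop in the kernel above
    for(int e = 0; e + 2 * EPerBlock < E; e += 2 * EPerBlock)
        for(int i = 0; i < 2; ++i)
        {
            barrier();                 // make lds[i] visible to all threads
            load_to_register(++slice); // prefetch next slice into registers
            block_gemm_on_lds(i);      // compute on the current buffer
            commit_to_lds(1 - i);      // stage prefetched slice for next trip
        }

    // tail: E % (2 * EPerBlock) == 0 leaves two slices (as in this example),
    // matching the `has_two_iteration_left` branch above; otherwise one
    barrier();
    load_to_register(++slice);
    block_gemm_on_lds(0);
    commit_to_lds(1);
    barrier();
    block_gemm_on_lds(1);
}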