From 0b1743460958ef0179047d1c1d871624d7d43ec8 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 17 Oct 2024 16:17:13 +0800 Subject: [PATCH 1/5] bang_add --- src/ops/add/bang/add_bang.cc | 56 ++++++++++++++++++ src/ops/add/bang/add_bang.h | 33 +++++++++++ src/ops/add/bang/add_bang.mlu | 103 ++++++++++++++++++++++++++++++++++ src/ops/add/operator.cc | 16 +++++- 4 files changed, 205 insertions(+), 3 deletions(-) create mode 100644 src/ops/add/bang/add_bang.cc create mode 100644 src/ops/add/bang/add_bang.h create mode 100644 src/ops/add/bang/add_bang.mlu diff --git a/src/ops/add/bang/add_bang.cc b/src/ops/add/bang/add_bang.cc new file mode 100644 index 00000000..e90ea55d --- /dev/null +++ b/src/ops/add/bang/add_bang.cc @@ -0,0 +1,56 @@ +#include "add_bang.h" +#include "../../../devices/bang/common_bang.h" +#include "../../utils.h" +#include +infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle, + AddBangDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b) { + uint64_t ndim = c->ndim; + if (!isValidBroadcastShape(a, b, c)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (!dtype_eq(c->dt, F16) || c->dt != a->dt || c->dt != b->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies()); + + // get the adjusted strides for a and b + uint64_t *a_strides = new uint64_t[ndim]; + uint64_t *b_strides = new uint64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; + b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; + } + uint64_t *c_shape, *a_strides_d, *b_strides_d; + cnrtMalloc((void **) &c_shape, ndim * sizeof(uint64_t)); + cnrtMalloc((void **) &a_strides_d, ndim * sizeof(uint64_t)); + cnrtMalloc((void **) &b_strides_d, ndim * sizeof(uint64_t)); + cnrtMemcpy(c_shape, c->shape, ndim * sizeof(uint64_t), cnrtMemcpyHostToDev); + cnrtMemcpy(a_strides_d, a_strides, ndim * sizeof(uint64_t), cnrtMemcpyHostToDev); + cnrtMemcpy(b_strides_d, b_strides, ndim * sizeof(uint64_t), cnrtMemcpyHostToDev); + *desc_ptr = new AddBangDescriptor{ + handle->device, + handle->device_id, + c->dt, + ndim, + c_data_size, + c_shape, + a_strides_d, + b_strides_d}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t bangDestroyAddDescriptor(AddBangDescriptor_t desc) { + cnrtFree(desc->c_shape); + cnrtFree(desc->a_strides_d); + cnrtFree(desc->b_strides_d); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/add/bang/add_bang.h b/src/ops/add/bang/add_bang.h new file mode 100644 index 00000000..25cc56c1 --- /dev/null +++ b/src/ops/add/bang/add_bang.h @@ -0,0 +1,33 @@ +#ifndef __BANG_ADD_H__ +#define __BANG_ADD_H__ + +#include "../../../devices/bang/bang_handle.h" +#include "../../utils.h" +#include "operators.h" + +struct AddBangDescriptor { + Device device; + int device_id; + DT dtype; + uint64_t ndim; + uint64_t c_data_size; + uint64_t *c_shape; + uint64_t *a_strides_d; + uint64_t *b_strides_d; +}; + +typedef struct AddBangDescriptor *AddBangDescriptor_t; + +infiniopStatus_t bangCreateAddDescriptor(BangHandle_t, + AddBangDescriptor_t *, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +infiniopStatus_t bangAdd(AddBangDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream); + +infiniopStatus_t bangDestroyAddDescriptor(AddBangDescriptor_t desc); + +#endif diff --git a/src/ops/add/bang/add_bang.mlu b/src/ops/add/bang/add_bang.mlu new file mode 100644 index 00000000..35326171 --- /dev/null +++ b/src/ops/add/bang/add_bang.mlu @@ -0,0 +1,103 @@ +#include "bang.h" +#include "bang_device_functions.h" +#include "cnrt.h" +#include "add_bang.h" +#include "../../../devices/bang/common_bang.h" +#include + +const uint64_t SRC_MAX_SIZE = 1024 * 64; +__nram__ char nram_buffer[NRAM_MAX_SIZE]; +template +__mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){ + const uint64_t maxNum = SRC_MAX_SIZE/sizeof(T); + uint64_t taskSize = taskDim * maxNum; + + uint64_t remain = length % taskSize; + + uint64_t repeat = (length - remain) / taskSize; + + uint64_t remainT = remain % taskDim; + uint64_t stepEasy = (remain - remainT) / taskDim; + uint64_t stepHard = stepEasy + 1; + uint64_t step = (taskId < remainT ? stepHard : stepEasy); + uint64_t indStart = repeat * taskSize + (taskId < remainT ? taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy); + //__bang_printf("%ld, %ld, %ld\n", repeat, indStart, step); + T *c_src = (T *)nram_buffer; + T *a_src = c_src + maxNum; + T *b_src = a_src + maxNum; + for(uint64_t r = 0; r < repeat; r++){ + __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for(uint64_t i = r * taskSize + taskId * maxNum; i < r * taskSize + (taskId + 1) * maxNum; i++){ + uint64_t a_tid = 0; + uint64_t b_tid = 0; + uint64_t indi = i; + for(uint64_t s = ndim - 1; s >= 0; s -= 1){ + a_tid += (indi % c_shape[s]) * a_strides_d[s]; + b_tid += (indi % c_shape[s]) * b_strides_d[s]; + indi /= c_shape[s]; + } + c_src[i] = a[a_tid] + b[b_tid]; + } + __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); + } + if(step){ + + __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM); + for(uint64_t i = indStart; i < indStart + step; i++){ + uint64_t a_tid = 0; + uint64_t b_tid = 0; + uint64_t indi = i; + + for(uint64_t s = ndim - 1; s > 0; s -= 1){ + a_tid += (indi % c_shape[s]) * a_strides_d[s]; + b_tid += (indi % c_shape[s]) * b_strides_d[s]; + indi /= c_shape[s]; + } + + __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM); + __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM); + c_src[i] = a_src[0] + b_src[0]; + __bang_printf("%ld, %ld, %ld, %ld, %.4f, %.4f, %.4f\n", taskId, i, a_tid, b_tid, static_cast(a[a_tid]), static_cast(b[b_tid]), static_cast(c_src[i])); + } + __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); + } + +} +template +void addUnion(cnrtQueue_t queue, void *c, void const *a, void const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){ + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = CNRT_FUNC_TYPE_UNION1; + addKernel<<>>(c_, a_, b_, c_shape, a_strides_d, b_strides_d, ndim, length); + //cnrtQueueSync(queue); +} +void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const *b, + void *stream) { + auto queue = reinterpret_cast(stream); + uint64_t ndim = desc->ndim; + uint64_t length = desc->c_data_size; + addUnion(queue, c, a, b, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length); + +} + +infiniopStatus_t bangAdd(AddBangDescriptor_t desc, + void *c, void const *a, void const *b, + void *stream) { + if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { + return STATUS_BAD_DEVICE; + } + if (dtype_eq(desc->dtype, F16)) { + add_bang_f16(desc, c, a, b, stream); + return STATUS_SUCCESS; + } + return STATUS_BAD_TENSOR_DTYPE; +} + + diff --git a/src/ops/add/operator.cc b/src/ops/add/operator.cc index c2a30ea8..81952965 100644 --- a/src/ops/add/operator.cc +++ b/src/ops/add/operator.cc @@ -9,6 +9,10 @@ #include "../../devices/cuda/cuda_handle.h" #include "cuda/add.cuh" #endif +#ifdef ENABLE_CAMBRICON_MLU +#include "../../devices/bang/bang_handle.h" +#include "bang/add_bang.h" +#endif __C infiniopStatus_t infiniopCreateAddDescriptor( infiniopHandle_t handle, @@ -28,7 +32,9 @@ __C infiniopStatus_t infiniopCreateAddDescriptor( #endif #ifdef ENABLE_CAMBRICON_MLU - // TODO + case DevCambriconMlu: { + return bangCreateAddDescriptor((BangHandle_t) handle, (AddBangDescriptor_t *) desc_ptr, c, a, b); + } #endif } return STATUS_BAD_DEVICE; @@ -47,7 +53,9 @@ __C infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, void *c, void con #endif #ifdef ENABLE_CAMBRICON_MLU - // TODO + case DevCambriconMlu: { + return bangAdd((AddBangDescriptor_t) desc, c, a, b, stream); + } #endif } return STATUS_BAD_DEVICE; @@ -66,7 +74,9 @@ __C infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) #endif #ifdef ENABLE_CAMBRICON_MLU - // TODO + case DevCambriconMlu: { + return bangDestroyAddDescriptor((AddBangDescriptor_t) desc); + } #endif } return STATUS_BAD_DEVICE; From 6b6e1d6e40315cb49bf2c1af3aa3c4be077e39ce Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Mon, 21 Oct 2024 15:35:58 +0800 Subject: [PATCH 2/5] modified add_bang.mlu --- src/ops/add/bang/add_bang.cc | 3 ++- src/ops/add/bang/add_bang.mlu | 31 +++++++++++++++++-------------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/ops/add/bang/add_bang.cc b/src/ops/add/bang/add_bang.cc index e90ea55d..3e4c8168 100644 --- a/src/ops/add/bang/add_bang.cc +++ b/src/ops/add/bang/add_bang.cc @@ -27,7 +27,8 @@ infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle, a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; } - uint64_t *c_shape, *a_strides_d, *b_strides_d; + uint64_t *c_shape, + *a_strides_d, *b_strides_d; cnrtMalloc((void **) &c_shape, ndim * sizeof(uint64_t)); cnrtMalloc((void **) &a_strides_d, ndim * sizeof(uint64_t)); cnrtMalloc((void **) &b_strides_d, ndim * sizeof(uint64_t)); diff --git a/src/ops/add/bang/add_bang.mlu b/src/ops/add/bang/add_bang.mlu index 35326171..eaeac923 100644 --- a/src/ops/add/bang/add_bang.mlu +++ b/src/ops/add/bang/add_bang.mlu @@ -27,11 +27,11 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u T *b_src = a_src + maxNum; for(uint64_t r = 0; r < repeat; r++){ __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - for(uint64_t i = r * taskSize + taskId * maxNum; i < r * taskSize + (taskId + 1) * maxNum; i++){ + for(uint64_t i = 0; i < maxNum; i++){ uint64_t a_tid = 0; uint64_t b_tid = 0; - uint64_t indi = i; - for(uint64_t s = ndim - 1; s >= 0; s -= 1){ + uint64_t indi = i + r * taskSize + taskId * maxNum; + for(int s = ndim - 1; s >= 0; s -= 1){ a_tid += (indi % c_shape[s]) * a_strides_d[s]; b_tid += (indi % c_shape[s]) * b_strides_d[s]; indi /= c_shape[s]; @@ -43,12 +43,12 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u if(step){ __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM); - for(uint64_t i = indStart; i < indStart + step; i++){ + for(uint64_t i = 0; i < step; i++){ uint64_t a_tid = 0; uint64_t b_tid = 0; - uint64_t indi = i; + uint64_t indi = i + indStart; - for(uint64_t s = ndim - 1; s > 0; s -= 1){ + for(int s = ndim - 1; s >= 0; --s){ a_tid += (indi % c_shape[s]) * a_strides_d[s]; b_tid += (indi % c_shape[s]) * b_strides_d[s]; indi /= c_shape[s]; @@ -57,17 +57,17 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM); __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM); c_src[i] = a_src[0] + b_src[0]; - __bang_printf("%ld, %ld, %ld, %ld, %.4f, %.4f, %.4f\n", taskId, i, a_tid, b_tid, static_cast(a[a_tid]), static_cast(b[b_tid]), static_cast(c_src[i])); + } + __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); + } } template -void addUnion(cnrtQueue_t queue, void *c, void const *a, void const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){ - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); +void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){ + cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -75,15 +75,18 @@ void addUnion(cnrtQueue_t queue, void *c, void const *a, void const *b, uint64_t k_dim.y = 1; k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - addKernel<<>>(c_, a_, b_, c_shape, a_strides_d, b_strides_d, ndim, length); - //cnrtQueueSync(queue); + addKernel<<>>(c, a, b, c_shape, a_strides_d, b_strides_d, ndim, length); + cnrtQueueSync(queue); } void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { auto queue = reinterpret_cast(stream); uint64_t ndim = desc->ndim; uint64_t length = desc->c_data_size; - addUnion(queue, c, a, b, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); + addUnion(queue, c_, a_, b_, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length); } From 84b0dc2c8d094717ed42da885f5fedeb79105f11 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Wed, 23 Oct 2024 14:19:50 +0800 Subject: [PATCH 3/5] special shape --- src/ops/add/bang/add_bang.cc | 8 +++- src/ops/add/bang/add_bang.h | 1 + src/ops/add/bang/add_bang.mlu | 90 +++++++++++++++++++++-------------- 3 files changed, 63 insertions(+), 36 deletions(-) diff --git a/src/ops/add/bang/add_bang.cc b/src/ops/add/bang/add_bang.cc index 3e4c8168..d5e8414e 100644 --- a/src/ops/add/bang/add_bang.cc +++ b/src/ops/add/bang/add_bang.cc @@ -27,6 +27,11 @@ infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle, a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; } + bool condition = false; + for (size_t i = 0; i < ndim; ++i) { + condition = (a_strides[i] == 0 || b_strides[i] == 0); + } + uint64_t *c_shape, *a_strides_d, *b_strides_d; cnrtMalloc((void **) &c_shape, ndim * sizeof(uint64_t)); @@ -43,7 +48,8 @@ infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle, c_data_size, c_shape, a_strides_d, - b_strides_d}; + b_strides_d, + condition}; return STATUS_SUCCESS; } diff --git a/src/ops/add/bang/add_bang.h b/src/ops/add/bang/add_bang.h index 25cc56c1..886da5b5 100644 --- a/src/ops/add/bang/add_bang.h +++ b/src/ops/add/bang/add_bang.h @@ -14,6 +14,7 @@ struct AddBangDescriptor { uint64_t *c_shape; uint64_t *a_strides_d; uint64_t *b_strides_d; + bool condition; }; typedef struct AddBangDescriptor *AddBangDescriptor_t; diff --git a/src/ops/add/bang/add_bang.mlu b/src/ops/add/bang/add_bang.mlu index eaeac923..26812747 100644 --- a/src/ops/add/bang/add_bang.mlu +++ b/src/ops/add/bang/add_bang.mlu @@ -8,7 +8,7 @@ const uint64_t SRC_MAX_SIZE = 1024 * 64; __nram__ char nram_buffer[NRAM_MAX_SIZE]; template -__mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){ +__mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool condition){ const uint64_t maxNum = SRC_MAX_SIZE/sizeof(T); uint64_t taskSize = taskDim * maxNum; @@ -25,48 +25,68 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u T *c_src = (T *)nram_buffer; T *a_src = c_src + maxNum; T *b_src = a_src + maxNum; - for(uint64_t r = 0; r < repeat; r++){ - __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - for(uint64_t i = 0; i < maxNum; i++){ - uint64_t a_tid = 0; - uint64_t b_tid = 0; - uint64_t indi = i + r * taskSize + taskId * maxNum; - for(int s = ndim - 1; s >= 0; s -= 1){ - a_tid += (indi % c_shape[s]) * a_strides_d[s]; - b_tid += (indi % c_shape[s]) * b_strides_d[s]; - indi /= c_shape[s]; + if(condition){ + for(uint64_t r = 0; r < repeat; r++){ + __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for(uint64_t i = 0; i < maxNum; i++){ + uint64_t a_tid = 0; + uint64_t b_tid = 0; + uint64_t indi = i + r * taskSize + taskId * maxNum; + for(int s = ndim - 1; s >= 0; s -= 1){ + a_tid += (indi % c_shape[s]) * a_strides_d[s]; + b_tid += (indi % c_shape[s]) * b_strides_d[s]; + indi /= c_shape[s]; + } + c_src[i] = a[a_tid] + b[b_tid]; } - c_src[i] = a[a_tid] + b[b_tid]; + __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); } - __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); - } - if(step){ - - __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM); - for(uint64_t i = 0; i < step; i++){ - uint64_t a_tid = 0; - uint64_t b_tid = 0; - uint64_t indi = i + indStart; - - for(int s = ndim - 1; s >= 0; --s){ - a_tid += (indi % c_shape[s]) * a_strides_d[s]; - b_tid += (indi % c_shape[s]) * b_strides_d[s]; - indi /= c_shape[s]; + if(step){ + + __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM); + for(uint64_t i = 0; i < step; i++){ + uint64_t a_tid = 0; + uint64_t b_tid = 0; + uint64_t indi = i + indStart; + + for(int s = ndim - 1; s >= 0; --s){ + a_tid += (indi % c_shape[s]) * a_strides_d[s]; + b_tid += (indi % c_shape[s]) * b_strides_d[s]; + indi /= c_shape[s]; + } + + __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM); + __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM); + c_src[i] = a_src[0] + b_src[0]; + } - __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM); - __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM); - c_src[i] = a_src[0] + b_src[0]; + __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); + + } + } + else{ + for(uint64_t r = 0; r < repeat; r++){ + __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(a_src, a + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(b_src, b + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_add(c_src, a_src, b_src, maxNum); + __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); + } + if(step){ + + __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM); + __memcpy(a_src, a + indStart, step * sizeof(T), GDRAM2NRAM); + __memcpy(b_src, b + indStart, step * sizeof(T), GDRAM2NRAM); + __bang_add(c_src, a_src, b_src, step); + __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); } - - __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); - } } template -void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){ +void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool condition){ cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -75,7 +95,7 @@ void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape k_dim.y = 1; k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - addKernel<<>>(c, a, b, c_shape, a_strides_d, b_strides_d, ndim, length); + addKernel<<>>(c, a, b, c_shape, a_strides_d, b_strides_d, ndim, length, condition); cnrtQueueSync(queue); } void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const *b, @@ -86,7 +106,7 @@ void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const * auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); - addUnion(queue, c_, a_, b_, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length); + addUnion(queue, c_, a_, b_, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length, desc->condition); } From 61adc971dc67464ccf769fff72b9984b365a66d2 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Wed, 23 Oct 2024 15:07:09 +0800 Subject: [PATCH 4/5] condition a and b --- src/ops/add/bang/add_bang.cc | 18 +++-- src/ops/add/bang/add_bang.h | 3 +- src/ops/add/bang/add_bang.mlu | 123 +++++++++++++++++++++++++++------- 3 files changed, 114 insertions(+), 30 deletions(-) diff --git a/src/ops/add/bang/add_bang.cc b/src/ops/add/bang/add_bang.cc index d5e8414e..9a1d4d96 100644 --- a/src/ops/add/bang/add_bang.cc +++ b/src/ops/add/bang/add_bang.cc @@ -27,11 +27,20 @@ infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle, a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim]; } - bool condition = false; + bool a_condition = false; + bool b_condition = false; for (size_t i = 0; i < ndim; ++i) { - condition = (a_strides[i] == 0 || b_strides[i] == 0); + a_condition = (a_strides[i] == 0); + if (a_condition) { + break; + } + } + for (size_t i = 0; i < ndim; ++i) { + b_condition = (b_strides[i] == 0); + if (b_condition) { + break; + } } - uint64_t *c_shape, *a_strides_d, *b_strides_d; cnrtMalloc((void **) &c_shape, ndim * sizeof(uint64_t)); @@ -49,7 +58,8 @@ infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle, c_shape, a_strides_d, b_strides_d, - condition}; + a_condition, + b_condition}; return STATUS_SUCCESS; } diff --git a/src/ops/add/bang/add_bang.h b/src/ops/add/bang/add_bang.h index 886da5b5..d53bcd36 100644 --- a/src/ops/add/bang/add_bang.h +++ b/src/ops/add/bang/add_bang.h @@ -14,7 +14,8 @@ struct AddBangDescriptor { uint64_t *c_shape; uint64_t *a_strides_d; uint64_t *b_strides_d; - bool condition; + bool a_condition; + bool b_condition; }; typedef struct AddBangDescriptor *AddBangDescriptor_t; diff --git a/src/ops/add/bang/add_bang.mlu b/src/ops/add/bang/add_bang.mlu index 26812747..2b9b9761 100644 --- a/src/ops/add/bang/add_bang.mlu +++ b/src/ops/add/bang/add_bang.mlu @@ -8,7 +8,7 @@ const uint64_t SRC_MAX_SIZE = 1024 * 64; __nram__ char nram_buffer[NRAM_MAX_SIZE]; template -__mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool condition){ +__mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool a_condition, bool b_condition){ const uint64_t maxNum = SRC_MAX_SIZE/sizeof(T); uint64_t taskSize = taskDim * maxNum; @@ -25,14 +25,14 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u T *c_src = (T *)nram_buffer; T *a_src = c_src + maxNum; T *b_src = a_src + maxNum; - if(condition){ + if(a_condition && b_condition){ for(uint64_t r = 0; r < repeat; r++){ __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); for(uint64_t i = 0; i < maxNum; i++){ uint64_t a_tid = 0; uint64_t b_tid = 0; uint64_t indi = i + r * taskSize + taskId * maxNum; - for(int s = ndim - 1; s >= 0; s -= 1){ + for(int s = ndim - 1; s >= 0; --s){ a_tid += (indi % c_shape[s]) * a_strides_d[s]; b_tid += (indi % c_shape[s]) * b_strides_d[s]; indi /= c_shape[s]; @@ -42,7 +42,6 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); } if(step){ - __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM); for(uint64_t i = 0; i < step; i++){ uint64_t a_tid = 0; @@ -54,39 +53,113 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u b_tid += (indi % c_shape[s]) * b_strides_d[s]; indi /= c_shape[s]; } - __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM); __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM); - c_src[i] = a_src[0] + b_src[0]; + c_src[i] = a[a_tid] + b[b_tid]; } - __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); - } } else{ - for(uint64_t r = 0; r < repeat; r++){ - __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - __memcpy(a_src, a + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - __memcpy(b_src, b + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); - __bang_add(c_src, a_src, b_src, maxNum); - __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); + if(a_condition){ + for(uint64_t r = 0; r < repeat; r++){ + __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(b_src, b + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for(uint64_t i = 0; i < maxNum; i++){ + uint64_t a_tid = 0; + + uint64_t indi = i + r * taskSize + taskId * maxNum; + for(int s = ndim - 1; s >= 0; --s){ + a_tid += (indi % c_shape[s]) * a_strides_d[s]; + + indi /= c_shape[s]; + } + c_src[i] = a[a_tid] + b_src[i]; + } + __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); + } + if(step){ + __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM); + __memcpy(b_src, b + indStart, step * sizeof(T), GDRAM2NRAM); + for(uint64_t i = 0; i < step; i++){ + uint64_t a_tid = 0; + + uint64_t indi = i + indStart; + + for(int s = ndim - 1; s >= 0; --s){ + a_tid += (indi % c_shape[s]) * a_strides_d[s]; + + indi /= c_shape[s]; + } + + c_src[i] = a[a_tid] + b_src[i]; + + } + __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); + } } - if(step){ - - __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM); - __memcpy(a_src, a + indStart, step * sizeof(T), GDRAM2NRAM); - __memcpy(b_src, b + indStart, step * sizeof(T), GDRAM2NRAM); - __bang_add(c_src, a_src, b_src, step); - __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); - + else if (b_condition){ + for(uint64_t r = 0; r < repeat; r++){ + __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(a_src, a + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + for(uint64_t i = 0; i < maxNum; i++){ + uint64_t b_tid = 0; + + uint64_t indi = i + r * taskSize + taskId * maxNum; + for(int s = ndim - 1; s >= 0; --s){ + b_tid += (indi % c_shape[s]) * b_strides_d[s]; + + indi /= c_shape[s]; + } + c_src[i] = a_src[i] + b[b_tid]; + } + __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); + } + if(step){ + __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM); + __memcpy(b_src, b + indStart, step * sizeof(T), GDRAM2NRAM); + for(uint64_t i = 0; i < step; i++){ + uint64_t b_tid = 0; + + uint64_t indi = i + indStart; + + for(int s = ndim - 1; s >= 0; --s){ + b_tid += (indi % c_shape[s]) * b_strides_d[s]; + + indi /= c_shape[s]; + } + + c_src[i] = a_src[i] + b[b_tid]; + + } + __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); + } + } + else{ + for(uint64_t r = 0; r < repeat; r++){ + __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(a_src, a + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __memcpy(b_src, b + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM); + __bang_add(c_src, a_src, b_src, maxNum); + __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); + } + if(step){ + + __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM); + __memcpy(a_src, a + indStart, step * sizeof(T), GDRAM2NRAM); + __memcpy(b_src, b + indStart, step * sizeof(T), GDRAM2NRAM); + __bang_add(c_src, a_src, b_src, step); + __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); + + } } + } } template -void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool condition){ +void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool a_condition, bool b_condition){ cnrtDim3_t k_dim; cnrtFunctionType_t k_type; @@ -95,7 +168,7 @@ void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape k_dim.y = 1; k_dim.z = 1; k_type = CNRT_FUNC_TYPE_UNION1; - addKernel<<>>(c, a, b, c_shape, a_strides_d, b_strides_d, ndim, length, condition); + addKernel<<>>(c, a, b, c_shape, a_strides_d, b_strides_d, ndim, length, a_condition, b_condition); cnrtQueueSync(queue); } void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const *b, @@ -106,7 +179,7 @@ void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const * auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); - addUnion(queue, c_, a_, b_, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length, desc->condition); + addUnion(queue, c_, a_, b_, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length, desc->a_condition, desc->b_condition); } From 6966f89bc4d6b4e7f97990ad882654c2e20c9ddd Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Wed, 23 Oct 2024 16:28:52 +0800 Subject: [PATCH 5/5] bang_add --- src/ops/add/bang/add_bang.mlu | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/ops/add/bang/add_bang.mlu b/src/ops/add/bang/add_bang.mlu index 2b9b9761..ae8622fa 100644 --- a/src/ops/add/bang/add_bang.mlu +++ b/src/ops/add/bang/add_bang.mlu @@ -37,8 +37,10 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u b_tid += (indi % c_shape[s]) * b_strides_d[s]; indi /= c_shape[s]; } - c_src[i] = a[a_tid] + b[b_tid]; + a_src[i] = a[a_tid]; + b_src[i] = b[b_tid]; } + __bang_add(c_src, a_src, b_src, maxNum); __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); } if(step){ @@ -53,11 +55,10 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u b_tid += (indi % c_shape[s]) * b_strides_d[s]; indi /= c_shape[s]; } - __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM); - __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM); - c_src[i] = a[a_tid] + b[b_tid]; - + a_src[i] = a[a_tid]; + b_src[i] = b[b_tid]; } + __bang_add(c_src, a_src, b_src, maxNum); __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); } } @@ -75,8 +76,9 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u indi /= c_shape[s]; } - c_src[i] = a[a_tid] + b_src[i]; + a_src[i] = a[a_tid]; } + __bang_add(c_src, a_src, b_src, maxNum); __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); } if(step){ @@ -93,9 +95,9 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u indi /= c_shape[s]; } - c_src[i] = a[a_tid] + b_src[i]; - + a_src[i] = a[a_tid]; } + __bang_add(c_src, a_src, b_src, maxNum); __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); } } @@ -112,8 +114,9 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u indi /= c_shape[s]; } - c_src[i] = a_src[i] + b[b_tid]; + b_src[i] = b[b_tid]; } + __bang_add(c_src, a_src, b_src, maxNum); __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM); } if(step){ @@ -130,9 +133,10 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u indi /= c_shape[s]; } - c_src[i] = a_src[i] + b[b_tid]; + b_src[i] = b[b_tid]; } + __bang_add(c_src, a_src, b_src, maxNum); __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM); } }