From 0b1743460958ef0179047d1c1d871624d7d43ec8 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Thu, 17 Oct 2024 16:17:13 +0800
Subject: [PATCH 1/5] bang_add: add Cambricon MLU (BANG) backend for the add operator

---
 src/ops/add/bang/add_bang.cc  |  56 ++++++++++++++++++
 src/ops/add/bang/add_bang.h   |  33 +++++++++++
 src/ops/add/bang/add_bang.mlu | 103 ++++++++++++++++++++++++++++++++++
 src/ops/add/operator.cc       |  16 +++++-
 4 files changed, 205 insertions(+), 3 deletions(-)
 create mode 100644 src/ops/add/bang/add_bang.cc
 create mode 100644 src/ops/add/bang/add_bang.h
 create mode 100644 src/ops/add/bang/add_bang.mlu
diff --git a/src/ops/add/bang/add_bang.cc b/src/ops/add/bang/add_bang.cc
new file mode 100644
index 00000000..e90ea55d
--- /dev/null
+++ b/src/ops/add/bang/add_bang.cc
@@ -0,0 +1,56 @@
+#include "add_bang.h"
+#include "../../../devices/bang/common_bang.h"
+#include "../../utils.h"
+#include <cmath>
+infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle,
+                                         AddBangDescriptor_t *desc_ptr,
+                                         infiniopTensorDescriptor_t c,
+                                         infiniopTensorDescriptor_t a,
+                                         infiniopTensorDescriptor_t b) {
+    uint64_t ndim = c->ndim;
+    if (!isValidBroadcastShape(a, b, c)) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (!dtype_eq(c->dt, F16) || c->dt != a->dt || c->dt != b->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies<uint64_t>());
+
+    // get the adjusted strides for a and b
+    uint64_t *a_strides = new uint64_t[ndim];
+    uint64_t *b_strides = new uint64_t[ndim];
+    for (size_t i = 0; i < ndim; ++i) {
+        a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim];
+        b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim];
+    }
+    uint64_t *c_shape, *a_strides_d, *b_strides_d;
+    cnrtMalloc((void **) &c_shape, ndim * sizeof(uint64_t));
+    cnrtMalloc((void **) &a_strides_d, ndim * sizeof(uint64_t));
+    cnrtMalloc((void **) &b_strides_d, ndim * sizeof(uint64_t));
+    cnrtMemcpy(c_shape, c->shape, ndim * sizeof(uint64_t), cnrtMemcpyHostToDev);
+    cnrtMemcpy(a_strides_d, a_strides, ndim * sizeof(uint64_t), cnrtMemcpyHostToDev);
+    cnrtMemcpy(b_strides_d, b_strides, ndim * sizeof(uint64_t), cnrtMemcpyHostToDev);
+    *desc_ptr = new AddBangDescriptor{
+        handle->device,
+        handle->device_id,
+        c->dt,
+        ndim,
+        c_data_size,
+        c_shape,
+        a_strides_d,
+        b_strides_d};
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t bangDestroyAddDescriptor(AddBangDescriptor_t desc) {
+    cnrtFree(desc->c_shape);
+    cnrtFree(desc->a_strides_d);
+    cnrtFree(desc->b_strides_d);
+    delete desc;
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/add/bang/add_bang.h b/src/ops/add/bang/add_bang.h
new file mode 100644
index 00000000..25cc56c1
--- /dev/null
+++ b/src/ops/add/bang/add_bang.h
@@ -0,0 +1,33 @@
+#ifndef __BANG_ADD_H__
+#define __BANG_ADD_H__
+
+#include "../../../devices/bang/bang_handle.h"
+#include "../../utils.h"
+#include "operators.h"
+
+struct AddBangDescriptor {
+    Device device;
+    int device_id;
+    DT dtype;
+    uint64_t ndim;
+    uint64_t c_data_size;
+    uint64_t *c_shape;
+    uint64_t *a_strides_d;
+    uint64_t *b_strides_d;
+};
+
+typedef struct AddBangDescriptor *AddBangDescriptor_t;
+
+infiniopStatus_t bangCreateAddDescriptor(BangHandle_t,
+                                         AddBangDescriptor_t *,
+                                         infiniopTensorDescriptor_t c,
+                                         infiniopTensorDescriptor_t a,
+                                         infiniopTensorDescriptor_t b);
+
+infiniopStatus_t bangAdd(AddBangDescriptor_t desc,
+                         void *c, void const *a, void const *b,
+                         void *stream);
+
+infiniopStatus_t bangDestroyAddDescriptor(AddBangDescriptor_t desc);
+
+#endif
diff --git a/src/ops/add/bang/add_bang.mlu b/src/ops/add/bang/add_bang.mlu
new file mode 100644
index 00000000..35326171
--- /dev/null
+++ b/src/ops/add/bang/add_bang.mlu
@@ -0,0 +1,103 @@
+#include "bang.h"
+#include "bang_device_functions.h"
+#include "cnrt.h"
+#include "add_bang.h"
+#include "../../../devices/bang/common_bang.h"
+#include <stdlib.h>
+
+const uint64_t SRC_MAX_SIZE = 1024 * 64;
+__nram__ char nram_buffer[NRAM_MAX_SIZE];
+template<typename T>
+__mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){
+    const uint64_t maxNum = SRC_MAX_SIZE/sizeof(T);
+    uint64_t taskSize = taskDim * maxNum;
+
+    uint64_t remain = length % taskSize;
+    
+    uint64_t repeat = (length - remain) / taskSize;
+
+    uint64_t remainT = remain % taskDim;
+    uint64_t stepEasy = (remain - remainT) / taskDim;
+    uint64_t stepHard = stepEasy + 1;
+    uint64_t step = (taskId < remainT ? stepHard : stepEasy);
+    uint64_t indStart = repeat * taskSize + (taskId < remainT ? taskId * stepHard : remainT * stepHard + (taskId - remainT) * stepEasy);
+    //__bang_printf("%ld, %ld, %ld\n", repeat, indStart, step);
+    T *c_src = (T *)nram_buffer;
+    T *a_src = c_src + maxNum;
+    T *b_src = a_src + maxNum;
+    for(uint64_t r = 0; r < repeat; r++){
+        __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+        for(uint64_t i = r * taskSize + taskId * maxNum; i < r * taskSize + (taskId + 1) * maxNum; i++){
+            uint64_t a_tid = 0;
+            uint64_t b_tid = 0;
+            uint64_t indi = i;
+            for(uint64_t s = ndim - 1; s >= 0; s -= 1){
+                a_tid += (indi % c_shape[s]) * a_strides_d[s];
+                b_tid += (indi % c_shape[s]) * b_strides_d[s];
+                indi /= c_shape[s];
+            }
+            c_src[i] = a[a_tid] + b[b_tid];
+        }
+        __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
+    }
+    if(step){
+        
+        __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM);
+        for(uint64_t i = indStart; i < indStart + step; i++){
+            uint64_t a_tid = 0;
+            uint64_t b_tid = 0;
+            uint64_t indi = i;
+           
+            for(uint64_t s = ndim - 1; s > 0; s -= 1){
+                a_tid += (indi % c_shape[s]) * a_strides_d[s];
+                b_tid += (indi % c_shape[s]) * b_strides_d[s];
+                indi /= c_shape[s];
+            }
+            
+            __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM);
+            __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM);
+            c_src[i] = a_src[0] + b_src[0];
+            __bang_printf("%ld, %ld, %ld, %ld, %.4f, %.4f, %.4f\n", taskId, i, a_tid, b_tid, static_cast<float>(a[a_tid]), static_cast<float>(b[b_tid]), static_cast<float>(c_src[i]));
+        }
+        __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
+    }
+    
+}
+template<typename T>
+void addUnion(cnrtQueue_t queue, void *c, void const *a, void const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){
+    auto a_ = reinterpret_cast<T const *>(a);
+    auto b_ = reinterpret_cast<T const *>(b);
+    auto c_ = reinterpret_cast<T *>(c);
+    cnrtDim3_t k_dim;
+    cnrtFunctionType_t k_type;
+
+    k_dim.x = 4;
+    k_dim.y = 1;
+    k_dim.z = 1;
+    k_type = CNRT_FUNC_TYPE_UNION1;
+    addKernel<T><<<k_dim, k_type, queue>>>(c_, a_, b_, c_shape, a_strides_d, b_strides_d, ndim, length);
+    //cnrtQueueSync(queue);
+}
+void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const *b,
+                        void *stream) {
+    auto queue = reinterpret_cast<cnrtQueue_t>(stream);
+    uint64_t ndim = desc->ndim;
+    uint64_t length = desc->c_data_size;
+    addUnion<half>(queue, c, a, b, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length);
+    
+}
+
+infiniopStatus_t bangAdd(AddBangDescriptor_t desc,
+                        void *c, void const *a, void const *b,
+                        void *stream) {
+    if (cnrtSetDevice(desc->device_id) != cnrtSuccess) {
+        return STATUS_BAD_DEVICE;
+    }
+    if (dtype_eq(desc->dtype, F16)) {
+        add_bang_f16(desc, c, a, b, stream);
+        return STATUS_SUCCESS;
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
+
+
diff --git a/src/ops/add/operator.cc b/src/ops/add/operator.cc
index c2a30ea8..81952965 100644
--- a/src/ops/add/operator.cc
+++ b/src/ops/add/operator.cc
@@ -9,6 +9,10 @@
 #include "../../devices/cuda/cuda_handle.h"
 #include "cuda/add.cuh"
 #endif
+#ifdef ENABLE_CAMBRICON_MLU
+#include "../../devices/bang/bang_handle.h"
+#include "bang/add_bang.h"
+#endif
 
 __C infiniopStatus_t infiniopCreateAddDescriptor(
     infiniopHandle_t handle,
@@ -28,7 +32,9 @@ __C infiniopStatus_t infiniopCreateAddDescriptor(
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        // TODO
+        case DevCambriconMlu: {
+            return bangCreateAddDescriptor((BangHandle_t) handle, (AddBangDescriptor_t *) desc_ptr, c, a, b);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -47,7 +53,9 @@ __C infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, void *c, void con
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        // TODO
+        case DevCambriconMlu: {
+            return bangAdd((AddBangDescriptor_t) desc, c, a, b, stream);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;
@@ -66,7 +74,9 @@ __C infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc)
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        // TODO
+        case DevCambriconMlu: {
+            return bangDestroyAddDescriptor((AddBangDescriptor_t) desc);
+        }
 #endif
     }
     return STATUS_BAD_DEVICE;

From 6b6e1d6e40315cb49bf2c1af3aa3c4be077e39ce Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Mon, 21 Oct 2024 15:35:58 +0800
Subject: [PATCH 2/5] modified add_bang.mlu: fix stride-index loops and NRAM offsets, drop debug printf

---
 src/ops/add/bang/add_bang.cc  |  3 ++-
 src/ops/add/bang/add_bang.mlu | 31 +++++++++++++++++--------------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/ops/add/bang/add_bang.cc b/src/ops/add/bang/add_bang.cc
index e90ea55d..3e4c8168 100644
--- a/src/ops/add/bang/add_bang.cc
+++ b/src/ops/add/bang/add_bang.cc
@@ -27,7 +27,8 @@ infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle,
         a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim];
         b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim];
     }
-    uint64_t *c_shape, *a_strides_d, *b_strides_d;
+    uint64_t *c_shape,
+        *a_strides_d, *b_strides_d;
     cnrtMalloc((void **) &c_shape, ndim * sizeof(uint64_t));
     cnrtMalloc((void **) &a_strides_d, ndim * sizeof(uint64_t));
     cnrtMalloc((void **) &b_strides_d, ndim * sizeof(uint64_t));
diff --git a/src/ops/add/bang/add_bang.mlu b/src/ops/add/bang/add_bang.mlu
index 35326171..eaeac923 100644
--- a/src/ops/add/bang/add_bang.mlu
+++ b/src/ops/add/bang/add_bang.mlu
@@ -27,11 +27,11 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
     T *b_src = a_src + maxNum;
     for(uint64_t r = 0; r < repeat; r++){
         __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
-        for(uint64_t i = r * taskSize + taskId * maxNum; i < r * taskSize + (taskId + 1) * maxNum; i++){
+        for(uint64_t i = 0; i < maxNum; i++){
             uint64_t a_tid = 0;
             uint64_t b_tid = 0;
-            uint64_t indi = i;
-            for(uint64_t s = ndim - 1; s >= 0; s -= 1){
+            uint64_t indi = i + r * taskSize + taskId * maxNum;
+            for(int s = ndim - 1; s >= 0; s -= 1){
                 a_tid += (indi % c_shape[s]) * a_strides_d[s];
                 b_tid += (indi % c_shape[s]) * b_strides_d[s];
                 indi /= c_shape[s];
@@ -43,12 +43,12 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
     if(step){
         
         __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM);
-        for(uint64_t i = indStart; i < indStart + step; i++){
+        for(uint64_t i = 0; i < step; i++){
             uint64_t a_tid = 0;
             uint64_t b_tid = 0;
-            uint64_t indi = i;
+            uint64_t indi = i + indStart;
            
-            for(uint64_t s = ndim - 1; s > 0; s -= 1){
+            for(int s = ndim - 1; s >= 0; --s){
                 a_tid += (indi % c_shape[s]) * a_strides_d[s];
                 b_tid += (indi % c_shape[s]) * b_strides_d[s];
                 indi /= c_shape[s];
@@ -57,17 +57,17 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
             __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM);
             __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM);
             c_src[i] = a_src[0] + b_src[0];
-            __bang_printf("%ld, %ld, %ld, %ld, %.4f, %.4f, %.4f\n", taskId, i, a_tid, b_tid, static_cast<float>(a[a_tid]), static_cast<float>(b[b_tid]), static_cast<float>(c_src[i]));
+            
         }
+        
         __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
+        
     }
     
 }
 template<typename T>
-void addUnion(cnrtQueue_t queue, void *c, void const *a, void const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){
-    auto a_ = reinterpret_cast<T const *>(a);
-    auto b_ = reinterpret_cast<T const *>(b);
-    auto c_ = reinterpret_cast<T *>(c);
+void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){
+    
     cnrtDim3_t k_dim;
     cnrtFunctionType_t k_type;
 
@@ -75,15 +75,18 @@ void addUnion(cnrtQueue_t queue, void *c, void const *a, void const *b, uint64_t
     k_dim.y = 1;
     k_dim.z = 1;
     k_type = CNRT_FUNC_TYPE_UNION1;
-    addKernel<T><<<k_dim, k_type, queue>>>(c_, a_, b_, c_shape, a_strides_d, b_strides_d, ndim, length);
-    //cnrtQueueSync(queue);
+    addKernel<T><<<k_dim, k_type, queue>>>(c, a, b, c_shape, a_strides_d, b_strides_d, ndim, length);
+    cnrtQueueSync(queue);
 }
 void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const *b,
                         void *stream) {
     auto queue = reinterpret_cast<cnrtQueue_t>(stream);
     uint64_t ndim = desc->ndim;
     uint64_t length = desc->c_data_size;
-    addUnion<half>(queue, c, a, b, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length);
+    auto a_ = reinterpret_cast<half const *>(a);
+    auto b_ = reinterpret_cast<half const *>(b);
+    auto c_ = reinterpret_cast<half *>(c);
+    addUnion<half>(queue, c_, a_, b_, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length);
     
 }
 

From 84b0dc2c8d094717ed42da885f5fedeb79105f11 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Wed, 23 Oct 2024 14:19:50 +0800
Subject: [PATCH 3/5] special shape: add vectorized __bang_add path for non-broadcast inputs

---
 src/ops/add/bang/add_bang.cc  |  8 +++-
 src/ops/add/bang/add_bang.h   |  1 +
 src/ops/add/bang/add_bang.mlu | 90 +++++++++++++++++++++--------------
 3 files changed, 63 insertions(+), 36 deletions(-)

diff --git a/src/ops/add/bang/add_bang.cc b/src/ops/add/bang/add_bang.cc
index 3e4c8168..d5e8414e 100644
--- a/src/ops/add/bang/add_bang.cc
+++ b/src/ops/add/bang/add_bang.cc
@@ -27,6 +27,11 @@ infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle,
         a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim];
         b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim];
     }
+    bool condition = false;
+    for (size_t i = 0; i < ndim; ++i) {
+        condition = (a_strides[i] == 0 || b_strides[i] == 0);
+    }
+
     uint64_t *c_shape,
         *a_strides_d, *b_strides_d;
     cnrtMalloc((void **) &c_shape, ndim * sizeof(uint64_t));
@@ -43,7 +48,8 @@ infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle,
         c_data_size,
         c_shape,
         a_strides_d,
-        b_strides_d};
+        b_strides_d,
+        condition};
 
     return STATUS_SUCCESS;
 }
diff --git a/src/ops/add/bang/add_bang.h b/src/ops/add/bang/add_bang.h
index 25cc56c1..886da5b5 100644
--- a/src/ops/add/bang/add_bang.h
+++ b/src/ops/add/bang/add_bang.h
@@ -14,6 +14,7 @@ struct AddBangDescriptor {
     uint64_t *c_shape;
     uint64_t *a_strides_d;
     uint64_t *b_strides_d;
+    bool condition;
 };
 
 typedef struct AddBangDescriptor *AddBangDescriptor_t;
diff --git a/src/ops/add/bang/add_bang.mlu b/src/ops/add/bang/add_bang.mlu
index eaeac923..26812747 100644
--- a/src/ops/add/bang/add_bang.mlu
+++ b/src/ops/add/bang/add_bang.mlu
@@ -8,7 +8,7 @@
 const uint64_t SRC_MAX_SIZE = 1024 * 64;
 __nram__ char nram_buffer[NRAM_MAX_SIZE];
 template<typename T>
-__mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){
+__mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool condition){
     const uint64_t maxNum = SRC_MAX_SIZE/sizeof(T);
     uint64_t taskSize = taskDim * maxNum;
 
@@ -25,48 +25,68 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
     T *c_src = (T *)nram_buffer;
     T *a_src = c_src + maxNum;
     T *b_src = a_src + maxNum;
-    for(uint64_t r = 0; r < repeat; r++){
-        __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
-        for(uint64_t i = 0; i < maxNum; i++){
-            uint64_t a_tid = 0;
-            uint64_t b_tid = 0;
-            uint64_t indi = i + r * taskSize + taskId * maxNum;
-            for(int s = ndim - 1; s >= 0; s -= 1){
-                a_tid += (indi % c_shape[s]) * a_strides_d[s];
-                b_tid += (indi % c_shape[s]) * b_strides_d[s];
-                indi /= c_shape[s];
+    if(condition){
+        for(uint64_t r = 0; r < repeat; r++){
+            __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+            for(uint64_t i = 0; i < maxNum; i++){
+                uint64_t a_tid = 0;
+                uint64_t b_tid = 0;
+                uint64_t indi = i + r * taskSize + taskId * maxNum;
+                for(int s = ndim - 1; s >= 0; s -= 1){
+                    a_tid += (indi % c_shape[s]) * a_strides_d[s];
+                    b_tid += (indi % c_shape[s]) * b_strides_d[s];
+                    indi /= c_shape[s];
+                }
+                c_src[i] = a[a_tid] + b[b_tid];
             }
-            c_src[i] = a[a_tid] + b[b_tid];
+            __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
         }
-        __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
-    }
-    if(step){
-        
-        __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM);
-        for(uint64_t i = 0; i < step; i++){
-            uint64_t a_tid = 0;
-            uint64_t b_tid = 0;
-            uint64_t indi = i + indStart;
-           
-            for(int s = ndim - 1; s >= 0; --s){
-                a_tid += (indi % c_shape[s]) * a_strides_d[s];
-                b_tid += (indi % c_shape[s]) * b_strides_d[s];
-                indi /= c_shape[s];
+        if(step){
+            
+            __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM);
+            for(uint64_t i = 0; i < step; i++){
+                uint64_t a_tid = 0;
+                uint64_t b_tid = 0;
+                uint64_t indi = i + indStart;
+            
+                for(int s = ndim - 1; s >= 0; --s){
+                    a_tid += (indi % c_shape[s]) * a_strides_d[s];
+                    b_tid += (indi % c_shape[s]) * b_strides_d[s];
+                    indi /= c_shape[s];
+                }
+                
+                __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM);
+                __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM);
+                c_src[i] = a_src[0] + b_src[0];
+                
             }
             
-            __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM);
-            __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM);
-            c_src[i] = a_src[0] + b_src[0];
+            __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
+            
+        }
+    }
+    else{
+        for(uint64_t r = 0; r < repeat; r++){
+            __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+            __memcpy(a_src, a + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+            __memcpy(b_src, b + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+            __bang_add(c_src, a_src, b_src, maxNum);
+            __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
+        }
+        if(step){
+            
+            __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM);
+            __memcpy(a_src, a + indStart, step * sizeof(T), GDRAM2NRAM);
+            __memcpy(b_src, b + indStart, step * sizeof(T), GDRAM2NRAM);
+            __bang_add(c_src, a_src, b_src, step);
+            __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
             
         }
-        
-        __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
-        
     }
     
 }
 template<typename T>
-void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length){
+void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool condition){
     
     cnrtDim3_t k_dim;
     cnrtFunctionType_t k_type;
@@ -75,7 +95,7 @@ void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape
     k_dim.y = 1;
     k_dim.z = 1;
     k_type = CNRT_FUNC_TYPE_UNION1;
-    addKernel<T><<<k_dim, k_type, queue>>>(c, a, b, c_shape, a_strides_d, b_strides_d, ndim, length);
+    addKernel<T><<<k_dim, k_type, queue>>>(c, a, b, c_shape, a_strides_d, b_strides_d, ndim, length, condition);
     cnrtQueueSync(queue);
 }
 void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const *b,
@@ -86,7 +106,7 @@ void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const *
     auto a_ = reinterpret_cast<half const *>(a);
     auto b_ = reinterpret_cast<half const *>(b);
     auto c_ = reinterpret_cast<half *>(c);
-    addUnion<half>(queue, c_, a_, b_, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length);
+    addUnion<half>(queue, c_, a_, b_, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length, desc->condition);
     
 }
 

From 61adc971dc67464ccf769fff72b9984b365a66d2 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Wed, 23 Oct 2024 15:07:09 +0800
Subject: [PATCH 4/5] condition a and b: track broadcasting of a and b separately

---
 src/ops/add/bang/add_bang.cc  |  18 +++--
 src/ops/add/bang/add_bang.h   |   3 +-
 src/ops/add/bang/add_bang.mlu | 123 +++++++++++++++++++++++++++-------
 3 files changed, 114 insertions(+), 30 deletions(-)

diff --git a/src/ops/add/bang/add_bang.cc b/src/ops/add/bang/add_bang.cc
index d5e8414e..9a1d4d96 100644
--- a/src/ops/add/bang/add_bang.cc
+++ b/src/ops/add/bang/add_bang.cc
@@ -27,11 +27,20 @@ infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle,
         a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim];
         b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim];
     }
-    bool condition = false;
+    bool a_condition = false;
+    bool b_condition = false;
     for (size_t i = 0; i < ndim; ++i) {
-        condition = (a_strides[i] == 0 || b_strides[i] == 0);
+        a_condition = (a_strides[i] == 0);
+        if (a_condition) {
+            break;
+        }
+    }
+    for (size_t i = 0; i < ndim; ++i) {
+        b_condition = (b_strides[i] == 0);
+        if (b_condition) {
+            break;
+        }
     }
-
     uint64_t *c_shape,
         *a_strides_d, *b_strides_d;
     cnrtMalloc((void **) &c_shape, ndim * sizeof(uint64_t));
@@ -49,7 +58,8 @@ infiniopStatus_t bangCreateAddDescriptor(BangHandle_t handle,
         c_shape,
         a_strides_d,
         b_strides_d,
-        condition};
+        a_condition,
+        b_condition};
 
     return STATUS_SUCCESS;
 }
diff --git a/src/ops/add/bang/add_bang.h b/src/ops/add/bang/add_bang.h
index 886da5b5..d53bcd36 100644
--- a/src/ops/add/bang/add_bang.h
+++ b/src/ops/add/bang/add_bang.h
@@ -14,7 +14,8 @@ struct AddBangDescriptor {
     uint64_t *c_shape;
     uint64_t *a_strides_d;
     uint64_t *b_strides_d;
-    bool condition;
+    bool a_condition;
+    bool b_condition;
 };
 
 typedef struct AddBangDescriptor *AddBangDescriptor_t;
diff --git a/src/ops/add/bang/add_bang.mlu b/src/ops/add/bang/add_bang.mlu
index 26812747..2b9b9761 100644
--- a/src/ops/add/bang/add_bang.mlu
+++ b/src/ops/add/bang/add_bang.mlu
@@ -8,7 +8,7 @@
 const uint64_t SRC_MAX_SIZE = 1024 * 64;
 __nram__ char nram_buffer[NRAM_MAX_SIZE];
 template<typename T>
-__mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool condition){
+__mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool a_condition, bool b_condition){
     const uint64_t maxNum = SRC_MAX_SIZE/sizeof(T);
     uint64_t taskSize = taskDim * maxNum;
 
@@ -25,14 +25,14 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
     T *c_src = (T *)nram_buffer;
     T *a_src = c_src + maxNum;
     T *b_src = a_src + maxNum;
-    if(condition){
+    if(a_condition && b_condition){
         for(uint64_t r = 0; r < repeat; r++){
             __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
             for(uint64_t i = 0; i < maxNum; i++){
                 uint64_t a_tid = 0;
                 uint64_t b_tid = 0;
                 uint64_t indi = i + r * taskSize + taskId * maxNum;
-                for(int s = ndim - 1; s >= 0; s -= 1){
+                for(int s = ndim - 1; s >= 0; --s){
                     a_tid += (indi % c_shape[s]) * a_strides_d[s];
                     b_tid += (indi % c_shape[s]) * b_strides_d[s];
                     indi /= c_shape[s];
@@ -42,7 +42,6 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
             __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
         }
         if(step){
-            
             __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM);
             for(uint64_t i = 0; i < step; i++){
                 uint64_t a_tid = 0;
@@ -54,39 +53,113 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
                     b_tid += (indi % c_shape[s]) * b_strides_d[s];
                     indi /= c_shape[s];
                 }
-                
                 __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM);
                 __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM);
-                c_src[i] = a_src[0] + b_src[0];
+                c_src[i] = a[a_tid] + b[b_tid];
                 
             }
-            
             __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
-            
         }
     }
     else{
-        for(uint64_t r = 0; r < repeat; r++){
-            __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
-            __memcpy(a_src, a + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
-            __memcpy(b_src, b + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
-            __bang_add(c_src, a_src, b_src, maxNum);
-            __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
+        if(a_condition){
+            for(uint64_t r = 0; r < repeat; r++){
+                __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+                __memcpy(b_src, b + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+                for(uint64_t i = 0; i < maxNum; i++){
+                    uint64_t a_tid = 0;
+                    
+                    uint64_t indi = i + r * taskSize + taskId * maxNum;
+                    for(int s = ndim - 1; s >= 0; --s){
+                        a_tid += (indi % c_shape[s]) * a_strides_d[s];
+                        
+                        indi /= c_shape[s];
+                    }
+                    c_src[i] = a[a_tid] + b_src[i];
+                }
+                __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
+            }
+            if(step){
+                __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM);
+                __memcpy(b_src, b + indStart, step * sizeof(T), GDRAM2NRAM);
+                for(uint64_t i = 0; i < step; i++){
+                    uint64_t a_tid = 0;
+                    
+                    uint64_t indi = i + indStart;
+                
+                    for(int s = ndim - 1; s >= 0; --s){
+                        a_tid += (indi % c_shape[s]) * a_strides_d[s];
+                        
+                        indi /= c_shape[s];
+                    }
+                    
+                    c_src[i] = a[a_tid] + b_src[i];
+                    
+                }
+                __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
+            }
         }
-        if(step){
-            
-            __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM);
-            __memcpy(a_src, a + indStart, step * sizeof(T), GDRAM2NRAM);
-            __memcpy(b_src, b + indStart, step * sizeof(T), GDRAM2NRAM);
-            __bang_add(c_src, a_src, b_src, step);
-            __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
-            
+        else if (b_condition){
+            for(uint64_t r = 0; r < repeat; r++){
+                __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+                __memcpy(a_src, a + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+                for(uint64_t i = 0; i < maxNum; i++){
+                    uint64_t b_tid = 0;
+                    
+                    uint64_t indi = i + r * taskSize + taskId * maxNum;
+                    for(int s = ndim - 1; s >= 0; --s){
+                        b_tid += (indi % c_shape[s]) * b_strides_d[s];
+                        
+                        indi /= c_shape[s];
+                    }
+                    c_src[i] = a_src[i] + b[b_tid];
+                }
+                __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
+            }
+            if(step){
+                __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM);
+                __memcpy(a_src, a + indStart, step * sizeof(T), GDRAM2NRAM); // stage dense operand a; b is read via b_tid below
+                for(uint64_t i = 0; i < step; i++){
+                    uint64_t b_tid = 0;
+                    
+                    uint64_t indi = i + indStart;
+                
+                    for(int s = ndim - 1; s >= 0; --s){
+                        b_tid += (indi % c_shape[s]) * b_strides_d[s];
+                        
+                        indi /= c_shape[s];
+                    }
+                    
+                    c_src[i] = a_src[i] + b[b_tid];
+                    
+                }
+                __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
+            }
+        }
+        else{
+            for(uint64_t r = 0; r < repeat; r++){
+                __memcpy(c_src, c + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+                __memcpy(a_src, a + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+                __memcpy(b_src, b + r * taskSize + taskId * maxNum, maxNum * sizeof(T), GDRAM2NRAM);
+                __bang_add(c_src, a_src, b_src, maxNum);
+                __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
+            }
+            if(step){
+                
+                __memcpy(c_src, c + indStart, step * sizeof(T), GDRAM2NRAM);
+                __memcpy(a_src, a + indStart, step * sizeof(T), GDRAM2NRAM);
+                __memcpy(b_src, b + indStart, step * sizeof(T), GDRAM2NRAM);
+                __bang_add(c_src, a_src, b_src, step);
+                __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
+                
+            }
         }
+        
     }
     
 }
 template<typename T>
-void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool condition){
+void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape, uint64_t *a_strides_d, uint64_t *b_strides_d, uint64_t ndim, uint64_t length, bool a_condition, bool b_condition){
     
     cnrtDim3_t k_dim;
     cnrtFunctionType_t k_type;
@@ -95,7 +168,7 @@ void addUnion(cnrtQueue_t queue, T *c, T const *a, T const *b, uint64_t *c_shape
     k_dim.y = 1;
     k_dim.z = 1;
     k_type = CNRT_FUNC_TYPE_UNION1;
-    addKernel<T><<<k_dim, k_type, queue>>>(c, a, b, c_shape, a_strides_d, b_strides_d, ndim, length, condition);
+    addKernel<T><<<k_dim, k_type, queue>>>(c, a, b, c_shape, a_strides_d, b_strides_d, ndim, length, a_condition, b_condition);
     cnrtQueueSync(queue);
 }
 void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const *b,
@@ -106,7 +179,7 @@ void add_bang_f16(AddBangDescriptor_t desc, void *c, void const *a, void const *
     auto a_ = reinterpret_cast<half const *>(a);
     auto b_ = reinterpret_cast<half const *>(b);
     auto c_ = reinterpret_cast<half *>(c);
-    addUnion<half>(queue, c_, a_, b_, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length, desc->condition);
+    addUnion<half>(queue, c_, a_, b_, desc->c_shape, desc->a_strides_d, desc->b_strides_d, ndim, length, desc->a_condition, desc->b_condition);
     
 }
 

From 6966f89bc4d6b4e7f97990ad882654c2e20c9ddd Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Wed, 23 Oct 2024 16:28:52 +0800
Subject: [PATCH 5/5] bang_add

---
 src/ops/add/bang/add_bang.mlu | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/ops/add/bang/add_bang.mlu b/src/ops/add/bang/add_bang.mlu
index 2b9b9761..ae8622fa 100644
--- a/src/ops/add/bang/add_bang.mlu
+++ b/src/ops/add/bang/add_bang.mlu
@@ -37,8 +37,10 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
                     b_tid += (indi % c_shape[s]) * b_strides_d[s];
                     indi /= c_shape[s];
                 }
-                c_src[i] = a[a_tid] + b[b_tid];
+                a_src[i] = a[a_tid];
+                b_src[i] = b[b_tid];
             }
+            __bang_add(c_src, a_src, b_src, maxNum);
             __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
         }
         if(step){
@@ -53,11 +55,10 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
                     b_tid += (indi % c_shape[s]) * b_strides_d[s];
                     indi /= c_shape[s];
                 }
-                __memcpy(a_src, a + a_tid, sizeof(T), GDRAM2NRAM);
-                __memcpy(b_src, b + b_tid, sizeof(T), GDRAM2NRAM);
-                c_src[i] = a[a_tid] + b[b_tid];
-                
+                a_src[i] = a[a_tid];
+                b_src[i] = b[b_tid];  
             }
+            __bang_add(c_src, a_src, b_src, maxNum);
             __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
         }
     }
@@ -75,8 +76,9 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
                         
                         indi /= c_shape[s];
                     }
-                    c_src[i] = a[a_tid] + b_src[i];
+                    a_src[i] = a[a_tid];
                 }
+                __bang_add(c_src, a_src, b_src, maxNum);
                 __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
             }
             if(step){
@@ -93,9 +95,9 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
                         indi /= c_shape[s];
                     }
                     
-                    c_src[i] = a[a_tid] + b_src[i];
-                    
+                    a_src[i] = a[a_tid];
                 }
+                __bang_add(c_src, a_src, b_src, maxNum);
                 __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
             }
         }
@@ -112,8 +114,9 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
                         
                         indi /= c_shape[s];
                     }
-                    c_src[i] = a_src[i] + b[b_tid];
+                    b_src[i] = b[b_tid];
                 }
+                __bang_add(c_src, a_src, b_src, maxNum);
                 __memcpy(c + r * taskSize + taskId * maxNum, c_src, maxNum * sizeof(T), NRAM2GDRAM);
             }
             if(step){
@@ -130,9 +133,10 @@ __mlu_global__ void addKernel(T *c, T const *a, T const *b, uint64_t *c_shape, u
                         indi /= c_shape[s];
                     }
                     
-                    c_src[i] = a_src[i] + b[b_tid];
+                    b_src[i] = b[b_tid];
                     
                 }
+                __bang_add(c_src, a_src, b_src, maxNum);
                 __memcpy(c + indStart, c_src, step * sizeof(T), NRAM2GDRAM);
             }
         }