From 5eec5fecdf79431b71c523638b56418b1d92dc0e Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Wed, 25 Nov 2015 21:59:33 +0100
Subject: [PATCH] Added strided elewise ops (logistic, tanh).

---
 brainstorm/handlers/pycuda_handler.py       | 157 ++++++++++++++++++++
 brainstorm/tests/test_handler_operations.py | 132 ++++++++++++++++
 2 files changed, 289 insertions(+)

diff --git a/brainstorm/handlers/pycuda_handler.py b/brainstorm/handlers/pycuda_handler.py
index d77f329..2f47227 100644
--- a/brainstorm/handlers/pycuda_handler.py
+++ b/brainstorm/handlers/pycuda_handler.py
@@ -143,6 +143,49 @@ def avgpool2d_backward_batch(self, inputs, window, outputs, padding, in_deltas,
                              block=(NUM_CUDA_THREADS, 1, 1),
                              grid=(get_blocks(inputs.size), 1))
 
+    def slice_copy_strided(self, inputs, outputs, slice_shape_inputs,
+                           slice_shape_outputs):
+        # Each slice descriptor is a [start, length, segments, stride]
+        # quadruple, so the number of slices is len(...) // 4.
+        _slice_copy_impl(inputs, outputs,
+                         slice_shape_inputs, slice_shape_outputs,
+                         np.int32(len(slice_shape_inputs) // 4),
+                         block=(NUM_CUDA_THREADS, 1, 1),
+                         grid=(get_blocks(inputs.size), 1))
+
+    def handle_shape(self, shape):
+        # Pad the shape with leading ones so the kernels always see 5 dims.
+        shape = [np.int32(dim) for dim in shape]
+        while len(shape) < 5:
+            shape = [np.int32(1)] + shape
+        return shape
+
+    def strided_elementwise_inplace(self, inputs, idx, func):
+        shape = self.handle_shape(inputs.shape)
+
+        if func not in strided_inp_funcs:
+            raise ValueError("Strided function not supported. "
+                             "Supported functions are: {0}"
+                             .format(list(strided_inp_funcs.keys())))
+
+        strided_inp_funcs[func](inputs, np.int32(idx),
+                                shape[0], shape[1], shape[2], shape[3],
+                                shape[4],
+                                block=(NUM_CUDA_THREADS, 1, 1),
+                                grid=(get_blocks(inputs.size), 1))
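+
+    # Illustration only: for a padded shape (d1, d2, d3, d4, d5) the kernel
+    # treats the flat buffer as d5 contiguous blocks of d1*d2*d3*d4 elements
+    # and rewrites just block `idx`; in NumPy terms, roughly:
+    #
+    #     block = d1 * d2 * d3 * d4
+    #     flat[idx * block:(idx + 1) * block] = \
+    #         f(flat[idx * block:(idx + 1) * block])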
+
+    def strided_elementwise(self, inputs, outputs, idx, func):
+        shape = self.handle_shape(inputs.shape)
+
+        if func not in strided_funcs:
+            raise ValueError("Strided function not supported. "
+                             "Supported functions are: {0}"
+                             .format(list(strided_funcs.keys())))
+
+        strided_funcs[func](inputs, outputs, np.int32(idx),
+                            shape[0], shape[1], shape[2], shape[3], shape[4],
+                            block=(NUM_CUDA_THREADS, 1, 1),
+                            grid=(get_blocks(inputs.size), 1))
 
     def avgpool2d_forward_batch(self, inputs, window, outputs, padding,
                                 stride):
@@ -877,3 +920,117 @@ def tanh_deriv(self, x, y, dy, dx):
 """
 _mod_avepool_bwd_fp32 = SourceModule(__avepool_bwd_fp32_kernel)
 _avepool_bwd_fp32_impl = _mod_avepool_bwd_fp32.get_function("ave_pool_bwd")
+
+
+__strided_elewise_inp = """
+    __global__ void strided_elementwise_inp_{0}(float *in,
+        int matrixIndex, int dim1, int dim2, int dim3, int dim4, int dim5)
+    {{
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        int idx = 0;
+
+        for(int i = index; i < dim1*dim2*dim3*dim4; i += blockDim.x * gridDim.x)
+        {{
+            idx = (dim1*dim2*dim3*dim4*matrixIndex) + i;
+            in[idx] = {1};
+        }}
+    }}
+    """
+__strided_logistic_inp = __strided_elewise_inp.format(
+    "logistic", "1.f/(1.f + __expf(-in[idx]))")
+_mod_strided_elewise_kernel_logistic_inp = SourceModule(__strided_logistic_inp)
+_strided_elewise_inp_logistic = _mod_strided_elewise_kernel_logistic_inp.get_function(
+    "strided_elementwise_inp_logistic")
+
+__strided_tanh_inp = __strided_elewise_inp.format("tanh", "tanhf(in[idx])")
+_mod_strided_ele_kernel_tanh_inp = SourceModule(__strided_tanh_inp)
+_strided_elewise_inp_tanh = _mod_strided_ele_kernel_tanh_inp.get_function(
+    "strided_elementwise_inp_tanh")
+
+strided_inp_funcs = {}
+strided_inp_funcs['logistic'] = _strided_elewise_inp_logistic
+strided_inp_funcs['tanh'] = _strided_elewise_inp_tanh
+
+
+__strided_elewise = """
+    __global__ void strided_elementwise_{0}(float *in, float *out,
+        int matrixIndex, int dim1, int dim2, int dim3, int dim4, int dim5)
+    {{
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        int idx = 0;
+
+        for(int d5 = 0; d5 < dim5; d5++)
+        {{
+            for(int i = index; i < dim1*dim2*dim3*dim4; i += blockDim.x * gridDim.x)
+            {{
+                idx = (dim1*dim2*dim3*dim4*d5) + i;
+
+                if(d5 == matrixIndex){{ out[idx] = {1}; }}
+                else{{ out[idx] = in[idx]; }}
+            }}
+        }}
+    }}
+    """
+__strided_logistic = __strided_elewise.format(
+    "logistic", "1.f/(1.f + __expf(-in[idx]))")
+_mod_strided_elewise_kernel_logistic = SourceModule(__strided_logistic)
+_strided_elewise_logistic = _mod_strided_elewise_kernel_logistic.get_function(
+    "strided_elementwise_logistic")
+
+__strided_tanh = __strided_elewise.format("tanh", "tanhf(in[idx])")
+_mod_strided_ele_kernel_tanh = SourceModule(__strided_tanh)
+_strided_elewise_tanh = _mod_strided_ele_kernel_tanh.get_function(
+    "strided_elementwise_tanh")
+
+strided_funcs = {}
+strided_funcs['logistic'] = _strided_elewise_logistic
+strided_funcs['tanh'] = _strided_elewise_tanh
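+
+# Illustration only: each slice descriptor consumed by slice_copy below is a
+# [start, length, segments, stride] quadruple stored as float32. E.g. the
+# input descriptor [4, 2, 3, 1] reads 3 segments of 2 elements starting at
+# flat offset 4 with 1 element skipped between segments (offsets 4-5, 7-8,
+# 10-11), and [0, 6, 1, 0] writes them contiguously to the output.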
+
+__slice_copy_kernel = """
+    __global__ void slice_copy(float *in, float *out,
+        float *from_shape, float *to_shape, int shapes)
+    {
+        int in_start = 0;
+        int in_length = 0;   // segment length
+        int in_segments = 0;
+        int in_stride = 0;
+        int in_current_segment = 0;
+        int in_slice_idx = 0;
+
+        int out_start = 0;
+        int out_length = 0;
+        int out_segments = 0;
+        int out_stride = 0;
+        int out_current_segment = 0;
+        int out_slice_idx = 0;
+
+        for(int shape = 0; shape < shapes; shape++)
+        {
+            in_start    = (int)from_shape[shape*4];
+            in_length   = (int)from_shape[shape*4+1];
+            in_segments = (int)from_shape[shape*4+2];
+            in_stride   = (int)from_shape[shape*4+3];
+
+            out_start    = (int)to_shape[shape*4];
+            out_length   = (int)to_shape[shape*4+1];
+            out_segments = (int)to_shape[shape*4+2];
+            out_stride   = (int)to_shape[shape*4+3];
+
+            for(int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+                idx < in_length*in_segments; idx += blockDim.x * gridDim.x)
+            {
+                in_current_segment = idx / in_length;
+                out_current_segment = idx / out_length;
+                in_slice_idx = in_start + idx + (in_current_segment*in_stride);
+                out_slice_idx = out_start + idx + (out_current_segment*out_stride);
+
+                out[out_slice_idx] = in[in_slice_idx];
+            }
+        }
+    }
+    """
+_mod_slice_copy_kernel = SourceModule(__slice_copy_kernel)
+_slice_copy_impl = _mod_slice_copy_kernel.get_function("slice_copy")
diff --git a/brainstorm/tests/test_handler_operations.py b/brainstorm/tests/test_handler_operations.py
index dc050f3..ecaae5b 100644
--- a/brainstorm/tests/test_handler_operations.py
+++ b/brainstorm/tests/test_handler_operations.py
@@ -9,11 +9,12 @@
 from brainstorm.handlers import NumpyHandler
 from brainstorm.optional import has_pycuda
 
 # np.random.seed(1234)
 dtype = np.float32
 NO_CON = set()
 
+
 def _conv2d_forward_batch(inputs, weights, bias, outputs, padding, stride):
     """
     Loop-based implementation of 2D convolution to check against.
@@ -167,3 +170,132 @@ def test_conv2d_forward_batch_pycuda():
     print("Expected:\n", true_outputs)
     print("Obtained:\n", outputs)
     assert passed
+
+
+@pytest.mark.skipif(has_pycuda is False, reason='requires PyCUDA+scikit-cuda')
+def test_strided_elementwise():
+    from brainstorm.handlers import PyCudaHandler
+    _h = PyCudaHandler()
+    rdm = np.random.RandomState(1345)
+
+    def get_rdm_array(shape, dims):
+        if dims == 2:
+            return rdm.randn(shape[0], shape[1])
+        elif dims == 3:
+            return rdm.randn(shape[0], shape[1], shape[2])
+        else:
+            return rdm.randn(shape[0], shape[1], shape[2], shape[3])
+
+    for dims in range(2, 5):
+        for i in range(10):
+            shape = rdm.randint(1, 17, dims)
+            a1 = np.float32(get_rdm_array(shape, dims))
+            a2 = np.float32(get_rdm_array(shape, dims))
+            a3 = np.float32(get_rdm_array(shape, dims))
+            a = np.vstack([a1, a2, a3])
+            original_shape = a.shape
+            a = a.reshape([original_shape[0] // 3] +
+                          list(original_shape[1:]) + [3])
+            b = np.zeros_like(a, dtype=np.float32)
+            A = _h.create_from_numpy(a)
+            B = _h.create_from_numpy(b)
+
+            idx = rdm.randint(0, 2)
+            func = ['logistic', 'tanh'][idx]
+
+            _h.strided_elementwise(A, B, 1, func)
+            outputs = _h.get_numpy_copy(B).reshape(original_shape)
+
+            c1 = a1
+            c2 = 1./(1. + np.exp(-a2)) if idx == 0 else np.tanh(a2)
+            c3 = a3
+            c = np.vstack([c1, c2, c3])
+
+            passed = np.allclose(outputs, c)
+            assert passed
+
+
+@pytest.mark.skipif(has_pycuda is False, reason='requires PyCUDA+scikit-cuda')
+def test_strided_elementwise_inplace():
+    from brainstorm.handlers import PyCudaHandler
+    _h = PyCudaHandler()
+    rdm = np.random.RandomState(1345)
+
+    def get_rdm_array(shape, dims):
+        if dims == 2:
+            return rdm.randn(shape[0], shape[1])
+        elif dims == 3:
+            return rdm.randn(shape[0], shape[1], shape[2])
+        else:
+            return rdm.randn(shape[0], shape[1], shape[2], shape[3])
+
+    for dims in range(2, 5):
+        for i in range(10):
+            shape = rdm.randint(1, 17, dims)
+            a1 = np.float32(get_rdm_array(shape, dims))
+            a2 = np.float32(get_rdm_array(shape, dims))
+            a3 = np.float32(get_rdm_array(shape, dims))
+            a = np.vstack([a1, a2, a3])
+            original_shape = a.shape
+            a = a.reshape([original_shape[0] // 3] +
+                          list(original_shape[1:]) + [3])
+            A = _h.create_from_numpy(a)
+
+            _h.strided_elementwise_inplace(A, 1, 'logistic')
+            _h.strided_elementwise_inplace(A, 0, 'tanh')
+            outputs = _h.get_numpy_copy(A).reshape(original_shape)
+
+            c1 = np.tanh(a1)
+            c2 = 1./(1. + np.exp(-a2))
+            c3 = a3
+            c = np.vstack([c1, c2, c3])
+
+            passed = np.allclose(outputs, c)
+            assert passed
+
+
+'''
+@pytest.mark.skipif(has_pycuda is False, reason='requires PyCUDA+scikit-cuda')
+def test_slice_copy_stride():
+    from brainstorm.handlers import PyCudaHandler
+    _h = PyCudaHandler()
+    # 2 dim test
+    a = np.float32(np.random.rand(10, 10))
+    start = 4
+    length = 2
+    segments = 3
+    stride = 1
+    slices = [start, length, segments, stride]
+    data = []
+    for seg in range(segments):
+        row = np.int32(start / a.shape[1])
+        offset = start - (row * a.shape[0])
+        data += a[row, offset + (length*seg) + (seg*stride):
+                  offset + (length*seg) + (seg*stride) + length].tolist()
+
+    s = np.array(data, dtype=np.float32)
+    A = _h.create_from_numpy(a)
+    S = _h.create_from_numpy(np.zeros_like(s, dtype=np.float32))
+    slices_A = _h.create_from_numpy(np.array(slices, dtype=np.float32))
+    slices_B = _h.create_from_numpy(
+        np.array([0, length*segments, 1, 0], dtype=np.float32))
+    _h.slice_copy_strided(A, S, slices_A, slices_B)
+    outputs = _h.get_numpy_copy(S)
+    passed = np.allclose(outputs, s)
+    assert passed
+    # 3 dim test
+    a = np.float32(np.random.rand(10, 10, 10))
+    start = 50
+    length = 6
+    segments = 4
+    stride = 5
+    slices = [start, length, segments, stride]
+    data = []
+    for seg in range(segments):
+        row = np.int32(start / (a.shape[1]*a.shape[2]))
+        col = np.int32(start / (a.shape[1]))
+        offset = start - (row*(a.shape[1]*a.shape[2])) - (col*(a.shape[1]))
+        data += a[row, col, offset + (length*seg) + (seg*stride):
+                  offset + (length*seg) + (seg*stride) + length].tolist()
+
+    s = np.array(data, dtype=np.float32)
+    A = _h.create_from_numpy(a)
+    S = _h.create_from_numpy(np.zeros_like(s, dtype=np.float32))
+    slices_A = _h.create_from_numpy(np.array(slices, dtype=np.float32))
+    slices_B = _h.create_from_numpy(
+        np.array([0, length*segments, 1, 0], dtype=np.float32))
+    _h.slice_copy_strided(A, S, slices_A, slices_B)
+    outputs = _h.get_numpy_copy(S)
+    passed = np.allclose(outputs, s)
+    assert passed
+'''
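
For reference, a minimal usage sketch of the new strided ops (not part of
the patch; it mirrors the convention the tests above use, where the flat
buffer holds the stacked matrices back to back and the trailing reshape
axis selects a contiguous block):

    import numpy as np
    from brainstorm.handlers import PyCudaHandler

    h = PyCudaHandler()

    # Three (4, 5) matrices stored back to back in one buffer.
    a1, a2, a3 = (np.float32(np.random.randn(4, 5)) for _ in range(3))
    a = np.vstack([a1, a2, a3]).reshape(4, 5, 3)  # same memory, 3 blocks
    A = h.create_from_numpy(a)

    # Apply the logistic function in place to block 1 only (i.e. a2).
    h.strided_elementwise_inplace(A, 1, 'logistic')
    out = h.get_numpy_copy(A).reshape(12, 5)

    expected = np.vstack([a1, 1. / (1. + np.exp(-a2)), a3])
    assert np.allclose(out, expected)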