|
9 | 9 | // NTX must be >=2 so r is filled properly |
10 | 10 | template <typename T, Int NTX, Int NTY> |
11 | 11 | __global__ void AveragePooling_fp(T *input_features, T *output_features, |
12 | | - Int nPlanes, Int input_stride, |
13 | | - Int output_stride, Int *rules, Int nHot, |
14 | | - T alpha) { |
| 12 | + Int nPlanes, Int input_stride, |
| 13 | + Int output_stride, Int *rules, Int nHot, |
| 14 | + T alpha) { |
15 | 15 | __shared__ Int r[NTY * 2]; |
16 | 16 | for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) { |
17 | 17 | { |
18 | 18 | Int i = threadIdx.x + NTX * threadIdx.y; |
19 | 19 | if (i < NTY * 2 and i < 2 * (nHot - n)) |
20 | | - r[i] = rules[2 * n + i]; |
| 20 | + r[i] = rules[2 * n + i]; |
21 | 21 | } |
22 | 22 | __syncthreads(); |
23 | 23 | if (n + threadIdx.y < nHot) { |
24 | 24 | Int i = r[2 * threadIdx.y] * input_stride; |
25 | 25 | Int o = r[2 * threadIdx.y + 1] * output_stride; |
26 | 26 | for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) |
27 | | - output_features[o + plane]+= alpha * input_features[i + plane]; |
28 | | - // atomicAdd(&output_features[o + plane], |
29 | | - // alpha * input_features[i + plane]); |
| 27 | + output_features[o + plane] += alpha * input_features[i + plane]; |
| 28 | + // atomicAdd(&output_features[o + plane], |
| 29 | + // alpha * input_features[i + plane]); |
30 | 30 | } |
31 | 31 | __syncthreads(); |
32 | 32 | } |
33 | 33 | } |
34 | 34 |
|
35 | 35 | template <typename T> |
36 | 36 | void cuda_AveragePooling_ForwardPass(T *input_features, T *output_features, |
37 | | - Int nPlanes, Int input_stride, |
38 | | - Int output_stride, RuleBook _rules, |
39 | | - Int filterVolume) { |
40 | | - RULEBOOKITERATOR((AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>( |
41 | | - input_features, output_features, nPlanes, input_stride, output_stride, |
42 | | - rbB, nHotB, 1.0 / filterVolume)); |
43 | | - , ) |
| 37 | + Int nPlanes, Int input_stride, |
| 38 | + Int output_stride, RuleBook _rules, |
| 39 | + Int filterVolume) { |
| 40 | + auto application = [&](Int *rbB, Int nHotB, cudaStream_t &stream) -> void { |
| 41 | + AveragePooling_fp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>( |
| 42 | + input_features, output_features, nPlanes, input_stride, output_stride, |
| 43 | + rbB, nHotB, 1.0 / filterVolume); |
| 44 | + }; |
| 45 | + |
| 46 | + iterateRuleBook(_rules, application); |
44 | 47 | } |
45 | 48 | template <typename T, Int NTX, Int NTY> |
46 | 49 | __global__ void AveragePooling_bp(T *d_input_features, T *d_output_features, |
47 | | - Int nPlanes, Int input_stride, |
48 | | - Int output_stride, Int *rules, Int nHot, |
49 | | - T alpha) { |
| 50 | + Int nPlanes, Int input_stride, |
| 51 | + Int output_stride, Int *rules, Int nHot, |
| 52 | + T alpha) { |
50 | 53 | __shared__ Int r[NTY * 2]; |
51 | 54 | for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) { |
52 | 55 | { |
53 | 56 | Int i = threadIdx.x + NTX * threadIdx.y; |
54 | 57 | if (i < NTY * 2 and i < 2 * (nHot - n)) |
55 | | - r[i] = rules[2 * n + i]; |
| 58 | + r[i] = rules[2 * n + i]; |
56 | 59 | } |
57 | 60 | __syncthreads(); |
58 | 61 | if (n + threadIdx.y < nHot) { |
59 | 62 | Int i = r[2 * threadIdx.y] * input_stride; |
60 | 63 | Int o = r[2 * threadIdx.y + 1] * output_stride; |
61 | 64 | for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) |
62 | | - d_input_features[i + plane] += alpha * d_output_features[o + plane]; |
| 65 | + d_input_features[i + plane] += alpha * d_output_features[o + plane]; |
63 | 66 | } |
64 | 67 | __syncthreads(); |
65 | 68 | } |
66 | 69 | } |
67 | 70 |
|
68 | 71 | template <typename T> |
69 | 72 | void cuda_AveragePooling_BackwardPass(T *d_input_features, T *d_output_features, |
70 | | - Int nPlanes, Int input_stride, |
71 | | - Int output_stride, RuleBook _rules, |
72 | | - Int filterVolume) { |
73 | | - RULEBOOKITERATOR((AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>( |
74 | | - d_input_features, d_output_features, nPlanes, input_stride, output_stride, |
75 | | - rbB, nHotB, 1.0 / filterVolume)); |
76 | | - , ) |
77 | | -} |
78 | | - |
79 | | - |
80 | | - |
81 | | - |
82 | | - |
83 | | - |
84 | | - |
85 | | - |
86 | | - |
| 73 | + Int nPlanes, Int input_stride, |
| 74 | + Int output_stride, RuleBook _rules, |
| 75 | + Int filterVolume) { |
87 | 76 |
|
| 77 | + auto application = [&](Int *rbB, Int nHotB, cudaStream_t &stream) -> void { |
| 78 | + AveragePooling_bp<T, 32, 32><<<32, dim3(32, 32), 0, stream>>>( |
| 79 | + d_input_features, d_output_features, nPlanes, input_stride, output_stride, |
| 80 | + rbB, nHotB, 1.0 / filterVolume); |
| 81 | + }; |
88 | 82 |
|
| 83 | + iterateRuleBook(_rules, application); |
| 84 | +} |
89 | 85 |
|
90 | 86 | // NTX must be >=2 so r is filled properly |
91 | 87 | template <typename T, Int NTX, Int NTY> |
92 | | -__global__ void CopyFeaturesHelper_fp(T *input_features, T *output_features, Int * rules, |
93 | | - Int nPlanes, Int nHot) { |
| 88 | +__global__ void CopyFeaturesHelper_fp(T *input_features, T *output_features, |
| 89 | + Int *rules, Int nPlanes, Int nHot) { |
94 | 90 | __shared__ Int r[NTY * 2]; |
95 | 91 | for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) { |
96 | 92 | { |
97 | 93 | Int i = threadIdx.x + NTX * threadIdx.y; |
98 | 94 | if (i < NTY * 2 and i < 2 * (nHot - n)) |
99 | | - r[i] = rules[2 * n + i]; |
| 95 | + r[i] = rules[2 * n + i]; |
100 | 96 | } |
101 | 97 | __syncthreads(); |
102 | 98 | if (n + threadIdx.y < nHot) { |
103 | | - Int i = r[2 * threadIdx.y+1] * nPlanes; |
104 | | - Int o = r[2 * threadIdx.y ] * nPlanes; |
| 99 | + Int i = r[2 * threadIdx.y + 1] * nPlanes; |
| 100 | + Int o = r[2 * threadIdx.y] * nPlanes; |
105 | 101 | for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) |
106 | | - output_features[o + plane]= input_features[i + plane]; |
| 102 | + output_features[o + plane] = input_features[i + plane]; |
107 | 103 | } |
108 | 104 | __syncthreads(); |
109 | 105 | } |
110 | 106 | } |
111 | 107 |
|
112 | 108 | template <typename T> |
113 | | -void cuda_CopyFeaturesHelper_ForwardPass(T *input_features, T *output_features, Int* rules, |
114 | | - Int nPlanes, Int nHot) { |
115 | | -CopyFeaturesHelper_fp<T, 32, 32><<<32, dim3(32, 32)>>>( |
116 | | - input_features, output_features, rules, nPlanes, |
117 | | - nHot); |
| 109 | +void cuda_CopyFeaturesHelper_ForwardPass(T *input_features, T *output_features, |
| 110 | + Int *rules, Int nPlanes, Int nHot) { |
| 111 | + CopyFeaturesHelper_fp<T, 32, 32><<<32, dim3(32, 32)>>>( |
| 112 | + input_features, output_features, rules, nPlanes, nHot); |
118 | 113 | } |
119 | 114 | template <typename T, Int NTX, Int NTY> |
120 | | -__global__ void CopyFeaturesHelper_bp(T *d_input_features, T *d_output_features, Int* rules, |
121 | | - Int nPlanes,Int nHot) { |
| 115 | +__global__ void CopyFeaturesHelper_bp(T *d_input_features, T *d_output_features, |
| 116 | + Int *rules, Int nPlanes, Int nHot) { |
122 | 117 | __shared__ Int r[NTY * 2]; |
123 | 118 | for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) { |
124 | 119 | { |
125 | 120 | Int i = threadIdx.x + NTX * threadIdx.y; |
126 | 121 | if (i < NTY * 2 and i < 2 * (nHot - n)) |
127 | | - r[i] = rules[2 * n + i]; |
| 122 | + r[i] = rules[2 * n + i]; |
128 | 123 | } |
129 | 124 | __syncthreads(); |
130 | 125 | if (n + threadIdx.y < nHot) { |
131 | | - Int i = r[2 * threadIdx.y+1] * nPlanes; |
| 126 | + Int i = r[2 * threadIdx.y + 1] * nPlanes; |
132 | 127 | Int o = r[2 * threadIdx.y] * nPlanes; |
133 | 128 | for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) |
134 | | - d_input_features[i + plane] = d_output_features[o + plane]; |
| 129 | + d_input_features[i + plane] = d_output_features[o + plane]; |
135 | 130 | } |
136 | 131 | __syncthreads(); |
137 | 132 | } |
138 | 133 | } |
139 | 134 |
|
140 | 135 | template <typename T> |
141 | | -void cuda_CopyFeaturesHelper_BackwardPass(T *d_input_features, T *d_output_features, |
142 | | - Int* rules, Int nPlanes, Int nHot) { |
143 | | -CopyFeaturesHelper_bp<T, 32, 32><<<32, dim3(32, 32)>>>( |
| 136 | +void cuda_CopyFeaturesHelper_BackwardPass(T *d_input_features, |
| 137 | + T *d_output_features, Int *rules, |
| 138 | + Int nPlanes, Int nHot) { |
| 139 | + CopyFeaturesHelper_bp<T, 32, 32><<<32, dim3(32, 32)>>>( |
144 | 140 | d_input_features, d_output_features, rules, nPlanes, nHot); |
145 | 141 | } |
0 commit comments