Skip to content

Commit 0cd86d0

Browse files
committed
In AVX-512 LDu/STu, handle each pair of singles as one double.
This should slightly improve performance by reducing the number of uops needed to do the gather/scatter.
1 parent 1c2b7b4 commit 0cd86d0

File tree

1 file changed

+7
-19
lines changed

1 file changed

+7
-19
lines changed

simd-support/simd-avx512.h

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -99,31 +99,19 @@ static inline void STA(R *x, V v, INT ovs, const R *aligned_like) {
9999
/*
 * LDu: strided load that builds a V from memory at x using a hardware gather.
 * aligned_like is accepted but not used.
 *
 * This is a diff view: lines tagged "NNN-" are the implementation being
 * removed, lines tagged "NNN+" are its replacement.
 *   - removed: 16 32-bit indices (k*ivs and k*ivs + 1 for k = 0..7) passed to
 *     _mm512_i32gather_ps with scale 4, i.e. one gathered lane per single.
 *   - added:   8 32-bit indices (k*ivs for k = 0..7) passed to
 *     _mm512_i32gather_pd with the same scale 4, so each gathered 64-bit lane
 *     starts at the same byte offset as the old even lane and spans the two
 *     adjacent 32-bit values; the result is then cast back to V.
 *
 * NOTE(review): presumably R is float here (the block is closed by
 * "#else / !FFTW_SINGLE" further down) -- confirm against the full header.
 * NOTE(review): the (V) cast from the _pd result is a vector-type cast that
 * relies on compiler support; _mm512_castpd_ps is the intrinsic spelling if
 * portability matters -- confirm which compilers this header targets.
 * NOTE(review): INT's width is not visible in this hunk; if it is wider than
 * 32 bits the k*ivs products are narrowed by _mm256_set_epi32 -- verify.
 */
static inline V LDu(const R *x, INT ivs, const R *aligned_like)
100100
{
101101
(void)aligned_like; /* UNUSED */
102-
__m512i index = _mm512_set_epi32(7 * ivs + 1, 7 * ivs,
103-
6 * ivs + 1, 6 * ivs,
104-
5 * ivs + 1, 5 * ivs,
105-
4 * ivs + 1, 4 * ivs,
106-
3 * ivs + 1, 3 * ivs,
107-
2 * ivs + 1, 2 * ivs,
108-
1 * ivs + 1, 1 * ivs,
109-
0 * ivs + 1, 0 * ivs);
102+
/* pretend pair of single are a double */
103+
const __m256i index = _mm256_set_epi32(7 * ivs, 6 * ivs, 5 * ivs, 4 * ivs, 3 * ivs, 2 * ivs, 1 * ivs, 0 * ivs);
110104

111-
return _mm512_i32gather_ps(index, x, 4);
105+
return (V)_mm512_i32gather_pd(index, x, 4);
112106
}
113107

114108
/*
 * STu: strided store that writes the contents of v to memory at x using a
 * hardware scatter. aligned_like is accepted but not used.
 *
 * This is a diff view: lines tagged "NNN-" are the implementation being
 * removed, lines tagged "NNN+" are its replacement.
 *   - removed: 16 32-bit indices (k*ovs and k*ovs + 1 for k = 0..7) passed to
 *     _mm512_i32scatter_ps with scale 4, i.e. one scattered lane per single.
 *   - added:   8 32-bit indices (k*ovs for k = 0..7) passed to
 *     _mm512_i32scatter_pd with the same scale 4, with v reinterpreted as
 *     __m512d so each 64-bit lane carries two adjacent 32-bit values.
 *
 * NOTE(review): the (__m512d) cast of v is a vector-type cast that relies on
 * compiler support; _mm512_castps_pd is the intrinsic spelling if portability
 * matters -- confirm which compilers this header targets.
 * NOTE(review): INT's width is not visible in this hunk; if it is wider than
 * 32 bits the k*ovs products are narrowed by _mm256_set_epi32 -- verify.
 */
static inline void STu(R *x, V v, INT ovs, const R *aligned_like)
115109
{
116110
(void)aligned_like; /* UNUSED */
117-
__m512i index = _mm512_set_epi32(7 * ovs + 1, 7 * ovs,
118-
6 * ovs + 1, 6 * ovs,
119-
5 * ovs + 1, 5 * ovs,
120-
4 * ovs + 1, 4 * ovs,
121-
3 * ovs + 1, 3 * ovs,
122-
2 * ovs + 1, 2 * ovs,
123-
1 * ovs + 1, 1 * ovs,
124-
0 * ovs + 1, 0 * ovs);
125-
126-
_mm512_i32scatter_ps(x, index, v, 4);
111+
/* pretend pair of single are a double */
112+
const __m256i index = _mm256_set_epi32(7 * ovs, 6 * ovs, 5 * ovs, 4 * ovs, 3 * ovs, 2 * ovs, 1 * ovs, 0 * ovs);
113+
114+
_mm512_i32scatter_pd(x, index, (__m512d)v, 4);
127115
}
128116

129117
#else /* !FFTW_SINGLE */

0 commit comments

Comments
 (0)