From e75940230bbb2f79e7b9ffe68e7002ee78dcaa70 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Tue, 26 Mar 2024 16:38:27 +0200 Subject: [PATCH 01/12] normal dct --- dct.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 dct.c diff --git a/dct.c b/dct.c new file mode 100644 index 0000000..a2ff442 --- /dev/null +++ b/dct.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +typedef uint16_t dctcoef; +void +dct4x4dc(dctcoef d[16]) { + dctcoef tmp[16]; + + for (int i = 0; i < 4; i++) { + int s01 = d[i * 4 + 0] + d[i * 4 + 1]; + int d01 = d[i * 4 + 0] - d[i * 4 + 1]; + int s23 = d[i * 4 + 2] + d[i * 4 + 3]; + int d23 = d[i * 4 + 2] - d[i * 4 + 3]; + + tmp[0 * 4 + i] = s01 + s23; + tmp[1 * 4 + i] = s01 - s23; + tmp[2 * 4 + i] = d01 - d23; + tmp[3 * 4 + i] = d01 + d23; + } + + for (int i = 0; i < 4; i++) { + int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1]; + int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1]; + int s23 = tmp[i * 4 + 2] + tmp[i * 4 + 3]; + int d23 = tmp[i * 4 + 2] - tmp[i * 4 + 3]; + + d[i * 4 + 0] = (s01 + s23 + 1) >> 1; + d[i * 4 + 1] = (s01 - s23 + 1) >> 1; + d[i * 4 + 2] = (d01 - d23 + 1) >> 1; + d[i * 4 + 3] = (d01 + d23 + 1) >> 1; + } +} + +void +dct4x4dc_sse(dctcoef d[16]) { + __m128i tmp[2]; + tmp[0] = _mm_setzero_si128(); // Set first vector to zero + tmp[1] = _mm_setzero_si128(); // Set second vector to zero + + for (int i = 0; i < 2; i++) { + __m128i row1 = _mm_loadu_si128((__m128i *) &d[i * 8]); + __m128i row2 = _mm_loadu_si128((__m128i *) &d[i * 8 + 4]); + + __m128i s01s23 = _mm_add_epi16(row1, row2); + __m128i d01d23 = _mm_sub_epi16(row1, row2); + + __m128i tmp1 = _mm_unpacklo_epi64(s01s23, d01d23); + __m128i tmp2 = _mm_unpackhi_epi64(s01s23, d01d23); + + tmp[i] = _mm_unpacklo_epi64(tmp1, tmp2); + } + for (int i = 0; i < 2; i++) { + __m128i vec = tmp[i]; + + __m128i s01s23 = _mm_add_epi16(vec, _mm_shuffle_epi32(vec, _MM_SHUFFLE(1, 0, 3, 2))); + __m128i d01d23 = _mm_sub_epi16(vec, _mm_shuffle_epi32(vec, _MM_SHUFFLE(1, 0, 3, 2))); + + __m128i res1 = _mm_add_epi16(s01s23, _mm_set1_epi16(1)); + __m128i res2 = _mm_add_epi16(d01d23, _mm_set1_epi16(1)); + + res1 = _mm_srai_epi16(res1, 1); + res2 = _mm_srai_epi16(res2, 1); + + _mm_storeu_si128((__m128i *) &d[i * 8], res1); + _mm_storeu_si128((__m128i *) &d[i * 8 + 4], res2); + } +} +int +main() { + struct timespec start, mid, end; + srand(time(NULL)); + dctcoef matrix[16]; + dctcoef matrix2[16]; + for (int i = 0; i < 16; i++) { + matrix[i] = rand() % 65536; + matrix2[i] = matrix[i]; + } + printf("Original matrix:\n"); + for (int i = 0; i < 16; i += 4) { + printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); + } + clock_gettime(CLOCK_MONOTONIC, &start); + dct4x4dc(matrix); + clock_gettime(CLOCK_MONOTONIC, &mid); + dct4x4dc_sse(matrix2); + clock_gettime(CLOCK_MONOTONIC, &end); + + printf("\nMatrix after dct4x4dc:\n"); + for (int i = 0; i < 16; i += 4) { + printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); + } + + printf("\nMatrix2 after dct4x4dc_sse:\n"); + for (int i = 0; i < 16; i += 4) { + printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]); + } + + long seconds1 = mid.tv_sec - start.tv_sec; + long nanoseconds1 = mid.tv_nsec - start.tv_nsec; + if (nanoseconds1 < 0) { + seconds1--; + nanoseconds1 += 1000000000; + } + long seconds2 = end.tv_sec - mid.tv_sec; + long nanoseconds2 = end.tv_nsec - mid.tv_nsec; + if (nanoseconds2 < 0) { + seconds2--; + nanoseconds2 += 1000000000; + } + printf("scalar: %ld.%09ld seconds\n", seconds1, nanoseconds1); + printf("SSE : %ld.%09ld seconds\n", seconds2, nanoseconds2); + return 0; +} \ No newline at end of file From e8e76638719a481f7a5692bd20aabe84927c2dc7 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Wed, 27 Mar 2024 16:18:25 +0200 Subject: [PATCH 02/12] phase1 --- dct.c | 110 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 73 insertions(+), 37 deletions(-) diff --git a/dct.c b/dct.c index a2ff442..7664fbb 100644 --- a/dct.c +++ b/dct.c @@ -3,7 +3,19 @@ #include #include #include + +#define N 1000000000 typedef uint16_t dctcoef; +void +print_vector(__m128i v) { + uint16_t buf[8]; + _mm_storeu_si128((__m128i *) buf, v); + for (int i = 0; i < 8; i++) { + printf("%02x ", buf[i]); + } + printf("\n"); +} + void dct4x4dc(dctcoef d[16]) { dctcoef tmp[16]; @@ -19,7 +31,9 @@ dct4x4dc(dctcoef d[16]) { tmp[2 * 4 + i] = d01 - d23; tmp[3 * 4 + i] = d01 + d23; } - + for (int i = 0; i < 16; i = i + 4) { + printf("in dct_c %02x %02x %02x %02x\n", tmp[0 + i], tmp[1 + i], tmp[2 + i], tmp[3 + i]); + } for (int i = 0; i < 4; i++) { int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1]; int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1]; @@ -35,37 +49,59 @@ dct4x4dc(dctcoef d[16]) { void dct4x4dc_sse(dctcoef d[16]) { - __m128i tmp[2]; - tmp[0] = _mm_setzero_si128(); // Set first vector to zero - tmp[1] = _mm_setzero_si128(); // Set second vector to zero - - for (int i = 0; i < 2; i++) { - __m128i row1 = _mm_loadu_si128((__m128i *) &d[i * 8]); - __m128i row2 = _mm_loadu_si128((__m128i *) &d[i * 8 + 4]); - - __m128i s01s23 = _mm_add_epi16(row1, row2); - __m128i d01d23 = _mm_sub_epi16(row1, row2); - - __m128i tmp1 = _mm_unpacklo_epi64(s01s23, d01d23); - __m128i tmp2 = _mm_unpackhi_epi64(s01s23, d01d23); - - tmp[i] = _mm_unpacklo_epi64(tmp1, tmp2); - } - for (int i = 0; i < 2; i++) { - __m128i vec = tmp[i]; - - __m128i s01s23 = _mm_add_epi16(vec, _mm_shuffle_epi32(vec, _MM_SHUFFLE(1, 0, 3, 2))); - __m128i d01d23 = _mm_sub_epi16(vec, _mm_shuffle_epi32(vec, _MM_SHUFFLE(1, 0, 3, 2))); - - __m128i res1 = _mm_add_epi16(s01s23, _mm_set1_epi16(1)); - __m128i res2 = _mm_add_epi16(d01d23, _mm_set1_epi16(1)); - - res1 = _mm_srai_epi16(res1, 1); - res2 = _mm_srai_epi16(res2, 1); - - _mm_storeu_si128((__m128i *) &d[i * 8], res1); - _mm_storeu_si128((__m128i *) &d[i * 8 + 4], res2); - } + dctcoef dT[4][4]; // This will hold the transposed matrix + + // Load the rows of d into 128-bit vectors + __m128i row1row2 = _mm_set_epi16(d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]); + __m128i row3row4 = _mm_set_epi16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8]); + + __m128i tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); + __m128i tmp3 = _mm_unpackhi_epi16(row1row2, row3row4); + __m128i tmp1_new = _mm_unpacklo_epi16(tmp1, tmp3); + __m128i tmp3_new = _mm_unpackhi_epi16(tmp1, tmp3); + __m128i upper_half_tmp1 = _mm_srli_si128(tmp1_new, 8); + _mm_storel_epi64((__m128i *) &dT[1][0], upper_half_tmp1); + _mm_storel_epi64((__m128i *) &dT[0][0], _mm_move_epi64(tmp1_new)); + _mm_storel_epi64((__m128i *) &dT[3][0], _mm_srli_si128(tmp3_new, 8)); + _mm_storel_epi64((__m128i *) &dT[2][0], _mm_move_epi64(tmp3_new)); + // load rows from transposed d + row1row2 = _mm_loadu_si128((__m128i *) dT[0]); + row3row4 = _mm_loadu_si128((__m128i *) dT[2]); + // 1st + 2nd +3rd +4rth + __m128i totalSum = _mm_add_epi16(row1row2, row3row4); + __m128i shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2)); + totalSum = _mm_add_epi16(totalSum, shuffled); + _mm_storel_epi64((__m128i *) dT[0], totalSum); + // 1st + 2nd -3rd -4rth + totalSum = _mm_sub_epi16(row1row2, row3row4); + shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2)); + totalSum = _mm_add_epi16(totalSum, shuffled); + _mm_storel_epi64((__m128i *) dT[1], totalSum); + // 1st - 2nd -3rd +4rth + __m128i mask1 = _mm_setr_epi16(0, 0, 0, 0, -1, -1, -1, -1); + __m128i mask2 = _mm_setr_epi16(-1, -1, -1, -1, 0, 0, 0, 0); + __m128i zero = _mm_setzero_si128(); + __m128i signMask1 = _mm_cmplt_epi16(mask1, zero); + __m128i signMask2 = _mm_cmplt_epi16(mask2, zero); + row1row2 = _mm_sub_epi16(_mm_andnot_si128(signMask1, row1row2), _mm_and_si128(signMask1, row1row2)); + row3row4 = _mm_sub_epi16(_mm_andnot_si128(signMask2, row3row4), _mm_and_si128(signMask2, row3row4)); + totalSum = _mm_add_epi16(row1row2, row3row4); + shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2)); + totalSum = _mm_add_epi16(totalSum, shuffled); + _mm_storel_epi64((__m128i *) dT[2], totalSum); + // 1st - 2nd +3rd -4rth + totalSum = _mm_sub_epi16(row1row2, row3row4); + shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2)); + totalSum = _mm_add_epi16(totalSum, shuffled); + _mm_storel_epi64((__m128i *) dT[3], totalSum); + + /* + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + printf("%02x ", (uint16_t) dT[i][j]); + } + printf("\n"); + }*/ } int main() { @@ -74,7 +110,7 @@ main() { dctcoef matrix[16]; dctcoef matrix2[16]; for (int i = 0; i < 16; i++) { - matrix[i] = rand() % 65536; + matrix[i] = rand() & 0xFF; // 8 bit unsigned matrix2[i] = matrix[i]; } printf("Original matrix:\n"); @@ -87,14 +123,14 @@ main() { dct4x4dc_sse(matrix2); clock_gettime(CLOCK_MONOTONIC, &end); - printf("\nMatrix after dct4x4dc:\n"); + // printf("\nMatrix after dct4x4dc:\n"); for (int i = 0; i < 16; i += 4) { - printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); + // printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); } - printf("\nMatrix2 after dct4x4dc_sse:\n"); + // printf("\nMatrix2 after dct4x4dc_sse:\n"); for (int i = 0; i < 16; i += 4) { - printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]); + // printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]); } long seconds1 = mid.tv_sec - start.tv_sec; From cf0e7e2bf08411136db8b817dcdb41f7a3371ca9 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Thu, 28 Mar 2024 12:37:29 +0200 Subject: [PATCH 03/12] phase1 refactored --- dct.c | 138 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 92 insertions(+), 46 deletions(-) diff --git a/dct.c b/dct.c index 7664fbb..1d3eafe 100644 --- a/dct.c +++ b/dct.c @@ -11,7 +11,7 @@ print_vector(__m128i v) { uint16_t buf[8]; _mm_storeu_si128((__m128i *) buf, v); for (int i = 0; i < 8; i++) { - printf("%02x ", buf[i]); + printf("%x ", buf[i]); } printf("\n"); } @@ -31,9 +31,12 @@ dct4x4dc(dctcoef d[16]) { tmp[2 * 4 + i] = d01 - d23; tmp[3 * 4 + i] = d01 + d23; } - for (int i = 0; i < 16; i = i + 4) { - printf("in dct_c %02x %02x %02x %02x\n", tmp[0 + i], tmp[1 + i], tmp[2 + i], tmp[3 + i]); + + printf("\n"); + for (int i = 0; i < 16; i += 4) { + printf("%02x %02x %02x %02x\n", tmp[i], tmp[i + 1], tmp[i + 2], tmp[i + 3]); } + for (int i = 0; i < 4; i++) { int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1]; int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1]; @@ -52,56 +55,99 @@ dct4x4dc_sse(dctcoef d[16]) { dctcoef dT[4][4]; // This will hold the transposed matrix // Load the rows of d into 128-bit vectors - __m128i row1row2 = _mm_set_epi16(d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]); - __m128i row3row4 = _mm_set_epi16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8]); - + __m128i row1row2 = _mm_loadu_si128((__m128i *) &d[0]); + __m128i row3row4 = _mm_loadu_si128((__m128i *) &d[8]); // load instead of set __m128i tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); __m128i tmp3 = _mm_unpackhi_epi16(row1row2, row3row4); - __m128i tmp1_new = _mm_unpacklo_epi16(tmp1, tmp3); - __m128i tmp3_new = _mm_unpackhi_epi16(tmp1, tmp3); - __m128i upper_half_tmp1 = _mm_srli_si128(tmp1_new, 8); - _mm_storel_epi64((__m128i *) &dT[1][0], upper_half_tmp1); - _mm_storel_epi64((__m128i *) &dT[0][0], _mm_move_epi64(tmp1_new)); - _mm_storel_epi64((__m128i *) &dT[3][0], _mm_srli_si128(tmp3_new, 8)); - _mm_storel_epi64((__m128i *) &dT[2][0], _mm_move_epi64(tmp3_new)); - // load rows from transposed d - row1row2 = _mm_loadu_si128((__m128i *) dT[0]); - row3row4 = _mm_loadu_si128((__m128i *) dT[2]); + row1row2 = _mm_unpacklo_epi16(tmp1, tmp3); + row3row4 = _mm_unpackhi_epi16(tmp1, tmp3); + + __m128i totalSum1 = _mm_add_epi16(row1row2, row3row4); + __m128i totalSum2 = _mm_sub_epi16(row1row2, row3row4); + __m128i shuffled1 = _mm_shuffle_epi32(totalSum1, _MM_SHUFFLE(2, 3, 3, 2)); + __m128i shuffled2 = _mm_shuffle_epi32(totalSum2, _MM_SHUFFLE(2, 3, 3, 2)); + totalSum1 = _mm_add_epi16(totalSum1, shuffled1); + totalSum2 = _mm_add_epi16(totalSum2, shuffled2); // 1st + 2nd +3rd +4rth - __m128i totalSum = _mm_add_epi16(row1row2, row3row4); - __m128i shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2)); - totalSum = _mm_add_epi16(totalSum, shuffled); - _mm_storel_epi64((__m128i *) dT[0], totalSum); // 1st + 2nd -3rd -4rth - totalSum = _mm_sub_epi16(row1row2, row3row4); - shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2)); - totalSum = _mm_add_epi16(totalSum, shuffled); - _mm_storel_epi64((__m128i *) dT[1], totalSum); // 1st - 2nd -3rd +4rth - __m128i mask1 = _mm_setr_epi16(0, 0, 0, 0, -1, -1, -1, -1); - __m128i mask2 = _mm_setr_epi16(-1, -1, -1, -1, 0, 0, 0, 0); + __m128i maskFF = _mm_set1_epi16(0xFF); __m128i zero = _mm_setzero_si128(); - __m128i signMask1 = _mm_cmplt_epi16(mask1, zero); - __m128i signMask2 = _mm_cmplt_epi16(mask2, zero); - row1row2 = _mm_sub_epi16(_mm_andnot_si128(signMask1, row1row2), _mm_and_si128(signMask1, row1row2)); - row3row4 = _mm_sub_epi16(_mm_andnot_si128(signMask2, row3row4), _mm_and_si128(signMask2, row3row4)); - totalSum = _mm_add_epi16(row1row2, row3row4); - shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2)); - totalSum = _mm_add_epi16(totalSum, shuffled); - _mm_storel_epi64((__m128i *) dT[2], totalSum); + __m128i mask1 = _mm_slli_si128(maskFF, 8); + __m128i mask2 = _mm_srli_si128(maskFF, 8); + __m128i masked_part = _mm_and_si128(mask1, row1row2); + __m128i neg_masked_part = _mm_sub_epi16(zero, masked_part); + row1row2 = _mm_or_si128(_mm_andnot_si128(mask1, row1row2), neg_masked_part); + masked_part = _mm_and_si128(mask2, row3row4); + neg_masked_part = _mm_sub_epi16(zero, masked_part); + row3row4 = _mm_or_si128(_mm_andnot_si128(mask2, row3row4), neg_masked_part); + + __m128i totalSum3 = _mm_add_epi16(row1row2, row3row4); + __m128i shuffled3 = _mm_shuffle_epi32(totalSum3, _MM_SHUFFLE(2, 3, 3, 2)); + totalSum3 = _mm_add_epi16(totalSum3, shuffled3); // 1st - 2nd +3rd -4rth - totalSum = _mm_sub_epi16(row1row2, row3row4); + __m128i totalSum4 = _mm_sub_epi16(row1row2, row3row4); + __m128i shuffled4 = _mm_shuffle_epi32(totalSum4, _MM_SHUFFLE(2, 3, 3, 2)); + totalSum4 = _mm_add_epi16(totalSum4, shuffled4); + + _mm_storel_epi64((__m128i *) dT[0], totalSum1); + _mm_storel_epi64((__m128i *) dT[1], totalSum2); + _mm_storel_epi64((__m128i *) dT[2], totalSum3); + _mm_storel_epi64((__m128i *) dT[3], totalSum4); + printf("\n"); + /* + // PHASE 2 + row1row2 = _mm_loadu_si128((__m128i *) &dT[0][0]); + row3row4 = _mm_loadu_si128((__m128i *) &dT[2][0]); // load instead of set + + // transpose dT back + __m128i ones = _mm_set1_epi32(1); // to divide + tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); + tmp3 = _mm_unpackhi_epi16(row1row2, row3row4); + row1row2 = _mm_unpacklo_epi16(tmp1, tmp3); + row3row4 = _mm_unpackhi_epi16(tmp1, tmp3); + + printf("\n\n"); + print_vector(row1row2); + print_vector(row3row4); + /* + __m128i row1 = _mm_loadu_si128((__m128i *) &d[0]); + __m128i extended_row1 = _mm_unpacklo_epi16(row1, zero); + __m128i row2 = _mm_loadu_si128((__m128i *) &d[4]); + __m128i extended_row2 = _mm_unpacklo_epi16(row2, zero); + __m128i row3 = _mm_loadu_si128((__m128i *) &d[8]); + __m128i extended_row3 = _mm_unpacklo_epi16(row3, zero); + __m128i row4 = _mm_loadu_si128((__m128i *) &d[12]); + __m128i extended_row4 = _mm_unpacklo_epi16(row4, zero); + // 1st + 2nd +3rd +4rth + + __m128i tempSum = _mm_add_epi32(extended_row1, extended_row2); + totalSum = _mm_add_epi32(extended_row3, extended_row4); + totalSum = _mm_add_epi32(totalSum, tempSum); + totalSum = _mm_add_epi32(totalSum, ones); + totalSum = _mm_srli_epi32(totalSum, 1); + + print_vector(totalSum); + _mm_storeu_si64((__m128i *) &d[0], _mm_packs_epi32(totalSum, totalSum)); + + printf("\n"); + for (int i = 0; i < 16; i += 4) { + printf("%02x %02x %02x %02x\n", d[i], d[i + 1], d[i + 2], d[i + 3]); + } + // 1st + 2nd +3rd +4rth + totalSum = _mm_add_epi32(row1, row3row4); shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2)); totalSum = _mm_add_epi16(totalSum, shuffled); - _mm_storel_epi64((__m128i *) dT[3], totalSum); + printf("\n"); + totalSum = _mm_add_epi16(totalSum, ones); + _mm_storel_epi64((__m128i *) &d[0], _mm_srli_epi16(totalSum, 1)); + * / /* - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - printf("%02x ", (uint16_t) dT[i][j]); - } - printf("\n"); - }*/ + // 1st + 2nd -3rd -4rth + // 1st - 2nd -3rd +4rth + // 1st - 2nd +3rd -4rth + */ } int main() { @@ -123,14 +169,14 @@ main() { dct4x4dc_sse(matrix2); clock_gettime(CLOCK_MONOTONIC, &end); - // printf("\nMatrix after dct4x4dc:\n"); + printf("\nMatrix after dct4x4dc:\n"); for (int i = 0; i < 16; i += 4) { - // printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); + printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); } - // printf("\nMatrix2 after dct4x4dc_sse:\n"); + printf("\nMatrix2 after dct4x4dc_sse:\n"); for (int i = 0; i < 16; i += 4) { - // printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]); + printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]); } long seconds1 = mid.tv_sec - start.tv_sec; From 35d1da076ffda7dc52e8e9bca777be0f19150193 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Thu, 28 Mar 2024 16:05:28 +0200 Subject: [PATCH 04/12] correct results --- dct.c | 101 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 45 deletions(-) diff --git a/dct.c b/dct.c index 1d3eafe..543b256 100644 --- a/dct.c +++ b/dct.c @@ -1,10 +1,10 @@ -#include +#include #include #include #include #include -#define N 1000000000 +#define N 100 typedef uint16_t dctcoef; void print_vector(__m128i v) { @@ -31,11 +31,11 @@ dct4x4dc(dctcoef d[16]) { tmp[2 * 4 + i] = d01 - d23; tmp[3 * 4 + i] = d01 + d23; } - + /* printf("\n"); for (int i = 0; i < 16; i += 4) { printf("%02x %02x %02x %02x\n", tmp[i], tmp[i + 1], tmp[i + 2], tmp[i + 3]); - } + }*/ for (int i = 0; i < 4; i++) { int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1]; @@ -94,8 +94,6 @@ dct4x4dc_sse(dctcoef d[16]) { _mm_storel_epi64((__m128i *) dT[1], totalSum2); _mm_storel_epi64((__m128i *) dT[2], totalSum3); _mm_storel_epi64((__m128i *) dT[3], totalSum4); - printf("\n"); - /* // PHASE 2 row1row2 = _mm_loadu_si128((__m128i *) &dT[0][0]); row3row4 = _mm_loadu_si128((__m128i *) &dT[2][0]); // load instead of set @@ -107,47 +105,60 @@ dct4x4dc_sse(dctcoef d[16]) { row1row2 = _mm_unpacklo_epi16(tmp1, tmp3); row3row4 = _mm_unpackhi_epi16(tmp1, tmp3); - printf("\n\n"); - print_vector(row1row2); - print_vector(row3row4); - /* - __m128i row1 = _mm_loadu_si128((__m128i *) &d[0]); - __m128i extended_row1 = _mm_unpacklo_epi16(row1, zero); - __m128i row2 = _mm_loadu_si128((__m128i *) &d[4]); - __m128i extended_row2 = _mm_unpacklo_epi16(row2, zero); - __m128i row3 = _mm_loadu_si128((__m128i *) &d[8]); - __m128i extended_row3 = _mm_unpacklo_epi16(row3, zero); - __m128i row4 = _mm_loadu_si128((__m128i *) &d[12]); - __m128i extended_row4 = _mm_unpacklo_epi16(row4, zero); + __m128i row1_32 = _mm_unpacklo_epi16(row1row2, zero); + __m128i row2_32 = _mm_unpackhi_epi16(row1row2, zero); + __m128i row3_32 = _mm_unpacklo_epi16(row3row4, zero); + __m128i row4_32 = _mm_unpackhi_epi16(row3row4, zero); // 1st + 2nd +3rd +4rth + __m128i totalSum11 = _mm_add_epi32(row1_32, row2_32); + __m128i totalSum12 = _mm_add_epi32(row3_32, row4_32); + totalSum11 = _mm_add_epi32(totalSum11, totalSum12); + totalSum11 = _mm_add_epi32(totalSum11, ones); + totalSum11 = _mm_srli_epi32(totalSum11, 1); + __m128i mask = _mm_set1_epi32(0x0000FFFF); - __m128i tempSum = _mm_add_epi32(extended_row1, extended_row2); - totalSum = _mm_add_epi32(extended_row3, extended_row4); - totalSum = _mm_add_epi32(totalSum, tempSum); - totalSum = _mm_add_epi32(totalSum, ones); - totalSum = _mm_srli_epi32(totalSum, 1); - - print_vector(totalSum); - _mm_storeu_si64((__m128i *) &d[0], _mm_packs_epi32(totalSum, totalSum)); - - printf("\n"); - for (int i = 0; i < 16; i += 4) { - printf("%02x %02x %02x %02x\n", d[i], d[i + 1], d[i + 2], d[i + 3]); - } - // 1st + 2nd +3rd +4rth - totalSum = _mm_add_epi32(row1, row3row4); - shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2)); - totalSum = _mm_add_epi16(totalSum, shuffled); - - printf("\n"); - totalSum = _mm_add_epi16(totalSum, ones); - _mm_storel_epi64((__m128i *) &d[0], _mm_srli_epi16(totalSum, 1)); - * / - /* - // 1st + 2nd -3rd -4rth - // 1st - 2nd -3rd +4rth - // 1st - 2nd +3rd -4rth - */ + // 1st + 2nd -3rd -4rth + // Perform bitwise AND operation to keep only the lower 16 bits of each 32-bit integer + totalSum11 = _mm_and_si128(totalSum11, mask); + totalSum11 = _mm_packus_epi32(totalSum11, zero); + + __m128i totalSum21 = _mm_add_epi32(row1_32, row2_32); + __m128i totalSum22 = _mm_add_epi32(row3_32, row4_32); + totalSum21 = _mm_sub_epi32(totalSum21, totalSum22); + totalSum21 = _mm_add_epi32(totalSum21, ones); + totalSum21 = _mm_srli_epi32(totalSum21, 1); + totalSum21 = _mm_and_si128(totalSum21, mask); // keep only the lower 16 bits of each 32-bit integer + totalSum21 = _mm_packus_epi32(totalSum21, zero); + + __m128i totalSum31 = _mm_sub_epi32(row1_32, row2_32); + __m128i totalSum32 = _mm_sub_epi32(row4_32, row3_32); + totalSum31 = _mm_add_epi32(totalSum31, totalSum32); + totalSum31 = _mm_add_epi32(totalSum31, ones); + totalSum31 = _mm_srli_epi32(totalSum31, 1); + totalSum31 = _mm_and_si128(totalSum31, mask); // keep only the lower 16 bits of each 32-bit integer + totalSum31 = _mm_packus_epi32(totalSum31, zero); + // 1st - 2nd -3rd +4rth + __m128i totalSum41 = _mm_sub_epi32(row1_32, row2_32); + __m128i totalSum42 = _mm_sub_epi32(row3_32, row4_32); + totalSum41 = _mm_add_epi32(totalSum41, totalSum42); + totalSum41 = _mm_add_epi32(totalSum41, ones); + totalSum41 = _mm_srli_epi32(totalSum41, 1); + totalSum41 = _mm_and_si128(totalSum41, mask); // keep only the lower 16 bits of each 32-bit integer + totalSum41 = _mm_packus_epi32(totalSum41, zero); + + _mm_storel_epi64((__m128i *) &d[0], totalSum11); + _mm_storel_epi64((__m128i *) &d[4], totalSum21); + _mm_storel_epi64((__m128i *) &d[8], totalSum31); + _mm_storel_epi64((__m128i *) &d[12], totalSum41); + + row1row2 = _mm_loadu_si128((__m128i *) &d[0]); + row3row4 = _mm_loadu_si128((__m128i *) &d[8]); // load instead of set + tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); + tmp3 = _mm_unpackhi_epi16(row1row2, row3row4); + _mm_storel_epi64((__m128i *) &d[0], _mm_move_epi64(_mm_unpacklo_epi16(tmp1, tmp3))); + _mm_storel_epi64((__m128i *) &d[4], _mm_srli_si128(_mm_unpacklo_epi16(tmp1, tmp3), 8)); + _mm_storel_epi64((__m128i *) &d[8], _mm_move_epi64(_mm_unpackhi_epi16(tmp1, tmp3))); + _mm_storel_epi64((__m128i *) &d[12], _mm_srli_si128(_mm_unpackhi_epi16(tmp1, tmp3), 8)); } int main() { From f7804bf17d47f07ebb6da3e36e948db0523deb97 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Thu, 28 Mar 2024 16:14:21 +0200 Subject: [PATCH 05/12] add dct --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3eb1080..ee2c2ef 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,8 @@ ALL= average_avx512\ average_sse\ scalarxmat44\ mat44xmat44\ - vect4xmat44 + vect4xmat44\ + dct all: $(ALL) average_sse: average_sse.c @@ -27,6 +28,8 @@ mat44xmat44: mat44xmat44.c $(CC) $(CFLAGS) -mavx2 -mavx512f mat44xmat44.c -o mat44xmat44 vect4xmat44: vect4xmat44.c $(CC) $(CFLAGS) -mavx2 -mavx512f -mfma vect4xmat44.c -o vect4xmat44 +dct: dct.c + $(CC) $(CFLAGS) -msse4.1 dct.c -o dct else ifeq ($(filter $(ARCH),arm64 aarch64),$(ARCH)) CFLAGS += -march=native From a85e67f3a62c2381525ad5652d4e56767653dc0a Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Thu, 28 Mar 2024 16:29:37 +0200 Subject: [PATCH 06/12] 2tranposes --- dct.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dct.c b/dct.c index 543b256..487a200 100644 --- a/dct.c +++ b/dct.c @@ -90,14 +90,14 @@ dct4x4dc_sse(dctcoef d[16]) { __m128i shuffled4 = _mm_shuffle_epi32(totalSum4, _MM_SHUFFLE(2, 3, 3, 2)); totalSum4 = _mm_add_epi16(totalSum4, shuffled4); - _mm_storel_epi64((__m128i *) dT[0], totalSum1); - _mm_storel_epi64((__m128i *) dT[1], totalSum2); - _mm_storel_epi64((__m128i *) dT[2], totalSum3); - _mm_storel_epi64((__m128i *) dT[3], totalSum4); - // PHASE 2 - row1row2 = _mm_loadu_si128((__m128i *) &dT[0][0]); - row3row4 = _mm_loadu_si128((__m128i *) &dT[2][0]); // load instead of set + _mm_storel_epi64((__m128i *) &d[0], totalSum1); + _mm_storel_epi64((__m128i *) &d[4], totalSum2); + _mm_storel_epi64((__m128i *) &d[8], totalSum3); + _mm_storel_epi64((__m128i *) &d[12], totalSum4); + // PHASE 2 + row1row2 = _mm_loadu_si128((__m128i *) &d[0]); + row3row4 = _mm_loadu_si128((__m128i *) &d[8]); // transpose dT back __m128i ones = _mm_set1_epi32(1); // to divide tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); From 53906296d98cacd74bd7eeabc66154c6ee7a8359 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Fri, 29 Mar 2024 09:30:30 +0200 Subject: [PATCH 07/12] average timing, some comments --- dct.c | 128 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 76 insertions(+), 52 deletions(-) diff --git a/dct.c b/dct.c index 487a200..d6d5b1e 100644 --- a/dct.c +++ b/dct.c @@ -4,7 +4,8 @@ #include #include -#define N 100 +#define N 100 +#define ITER 1 typedef uint16_t dctcoef; void print_vector(__m128i v) { @@ -52,9 +53,9 @@ dct4x4dc(dctcoef d[16]) { void dct4x4dc_sse(dctcoef d[16]) { - dctcoef dT[4][4]; // This will hold the transposed matrix - // Load the rows of d into 128-bit vectors + // transpose d and keep them in row1row2, row3row4 + // transpose 1 time __m128i row1row2 = _mm_loadu_si128((__m128i *) &d[0]); __m128i row3row4 = _mm_loadu_si128((__m128i *) &d[8]); // load instead of set __m128i tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); @@ -62,15 +63,17 @@ dct4x4dc_sse(dctcoef d[16]) { row1row2 = _mm_unpacklo_epi16(tmp1, tmp3); row3row4 = _mm_unpackhi_epi16(tmp1, tmp3); + // 1st(d0 d4 d8 d12) + 2nd(d1 d5 d9 d13) + + // 3rd(d2 d6 d10 d14) +4rth(d3 d7 d11 d15) + + // 1st + 2nd -3rd -4rth (same logic) __m128i totalSum1 = _mm_add_epi16(row1row2, row3row4); __m128i totalSum2 = _mm_sub_epi16(row1row2, row3row4); __m128i shuffled1 = _mm_shuffle_epi32(totalSum1, _MM_SHUFFLE(2, 3, 3, 2)); __m128i shuffled2 = _mm_shuffle_epi32(totalSum2, _MM_SHUFFLE(2, 3, 3, 2)); totalSum1 = _mm_add_epi16(totalSum1, shuffled1); totalSum2 = _mm_add_epi16(totalSum2, shuffled2); - // 1st + 2nd +3rd +4rth - // 1st + 2nd -3rd -4rth - // 1st - 2nd -3rd +4rth + // 1st - 2nd -3rd +4rth (same logic) __m128i maskFF = _mm_set1_epi16(0xFF); __m128i zero = _mm_setzero_si128(); __m128i mask1 = _mm_slli_si128(maskFF, 8); @@ -81,15 +84,15 @@ dct4x4dc_sse(dctcoef d[16]) { masked_part = _mm_and_si128(mask2, row3row4); neg_masked_part = _mm_sub_epi16(zero, masked_part); row3row4 = _mm_or_si128(_mm_andnot_si128(mask2, row3row4), neg_masked_part); - __m128i totalSum3 = _mm_add_epi16(row1row2, row3row4); __m128i shuffled3 = _mm_shuffle_epi32(totalSum3, _MM_SHUFFLE(2, 3, 3, 2)); totalSum3 = _mm_add_epi16(totalSum3, shuffled3); - // 1st - 2nd +3rd -4rth + // 1st - 2nd +3rd -4rth (same logic) __m128i totalSum4 = _mm_sub_epi16(row1row2, row3row4); __m128i shuffled4 = _mm_shuffle_epi32(totalSum4, _MM_SHUFFLE(2, 3, 3, 2)); totalSum4 = _mm_add_epi16(totalSum4, shuffled4); + // store back to d to keep the intermediate results _mm_storel_epi64((__m128i *) &d[0], totalSum1); _mm_storel_epi64((__m128i *) &d[4], totalSum2); _mm_storel_epi64((__m128i *) &d[8], totalSum3); @@ -98,18 +101,17 @@ dct4x4dc_sse(dctcoef d[16]) { // PHASE 2 row1row2 = _mm_loadu_si128((__m128i *) &d[0]); row3row4 = _mm_loadu_si128((__m128i *) &d[8]); - // transpose dT back + __m128i ones = _mm_set1_epi32(1); // to divide tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); tmp3 = _mm_unpackhi_epi16(row1row2, row3row4); row1row2 = _mm_unpacklo_epi16(tmp1, tmp3); row3row4 = _mm_unpackhi_epi16(tmp1, tmp3); - __m128i row1_32 = _mm_unpacklo_epi16(row1row2, zero); __m128i row2_32 = _mm_unpackhi_epi16(row1row2, zero); __m128i row3_32 = _mm_unpacklo_epi16(row3row4, zero); __m128i row4_32 = _mm_unpackhi_epi16(row3row4, zero); - // 1st + 2nd +3rd +4rth + // 1st + 2nd +3rd +4rth (same logic) __m128i totalSum11 = _mm_add_epi32(row1_32, row2_32); __m128i totalSum12 = _mm_add_epi32(row3_32, row4_32); totalSum11 = _mm_add_epi32(totalSum11, totalSum12); @@ -117,7 +119,7 @@ dct4x4dc_sse(dctcoef d[16]) { totalSum11 = _mm_srli_epi32(totalSum11, 1); __m128i mask = _mm_set1_epi32(0x0000FFFF); - // 1st + 2nd -3rd -4rth + // 1st + 2nd -3rd -4rth (same logic) // Perform bitwise AND operation to keep only the lower 16 bits of each 32-bit integer totalSum11 = _mm_and_si128(totalSum11, mask); totalSum11 = _mm_packus_epi32(totalSum11, zero); @@ -129,7 +131,7 @@ dct4x4dc_sse(dctcoef d[16]) { totalSum21 = _mm_srli_epi32(totalSum21, 1); totalSum21 = _mm_and_si128(totalSum21, mask); // keep only the lower 16 bits of each 32-bit integer totalSum21 = _mm_packus_epi32(totalSum21, zero); - + // 1st - 2nd -3rd +4rth (same logic) __m128i totalSum31 = _mm_sub_epi32(row1_32, row2_32); __m128i totalSum32 = _mm_sub_epi32(row4_32, row3_32); totalSum31 = _mm_add_epi32(totalSum31, totalSum32); @@ -137,7 +139,7 @@ dct4x4dc_sse(dctcoef d[16]) { totalSum31 = _mm_srli_epi32(totalSum31, 1); totalSum31 = _mm_and_si128(totalSum31, mask); // keep only the lower 16 bits of each 32-bit integer totalSum31 = _mm_packus_epi32(totalSum31, zero); - // 1st - 2nd -3rd +4rth + // 1st - 2nd +3rd -4rth (same logic) __m128i totalSum41 = _mm_sub_epi32(row1_32, row2_32); __m128i totalSum42 = _mm_sub_epi32(row3_32, row4_32); totalSum41 = _mm_add_epi32(totalSum41, totalSum42); @@ -163,46 +165,68 @@ dct4x4dc_sse(dctcoef d[16]) { int main() { struct timespec start, mid, end; - srand(time(NULL)); - dctcoef matrix[16]; - dctcoef matrix2[16]; - for (int i = 0; i < 16; i++) { - matrix[i] = rand() & 0xFF; // 8 bit unsigned - matrix2[i] = matrix[i]; - } - printf("Original matrix:\n"); - for (int i = 0; i < 16; i += 4) { - printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); - } - clock_gettime(CLOCK_MONOTONIC, &start); - dct4x4dc(matrix); - clock_gettime(CLOCK_MONOTONIC, &mid); - dct4x4dc_sse(matrix2); - clock_gettime(CLOCK_MONOTONIC, &end); - - printf("\nMatrix after dct4x4dc:\n"); - for (int i = 0; i < 16; i += 4) { - printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); - } - - printf("\nMatrix2 after dct4x4dc_sse:\n"); - for (int i = 0; i < 16; i += 4) { - printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]); - } - - long seconds1 = mid.tv_sec - start.tv_sec; - long nanoseconds1 = mid.tv_nsec - start.tv_nsec; - if (nanoseconds1 < 0) { - seconds1--; - nanoseconds1 += 1000000000; - } - long seconds2 = end.tv_sec - mid.tv_sec; - long nanoseconds2 = end.tv_nsec - mid.tv_nsec; - if (nanoseconds2 < 0) { - seconds2--; - nanoseconds2 += 1000000000; + long s1sum = 0, s2sum = 0, n1sum = 0, n2sum = 0; + int z = ITER; + while (z--) { + srand(time(NULL)); + dctcoef matrix[16]; + dctcoef matrix2[16]; + + for (int i = 0; i < 16; i++) { + matrix[i] = rand() & 0xFF; // 8 bit unsigned + matrix2[i] = matrix[i]; + } + printf("Original matrix:\n"); + for (int i = 0; i < 16; i += 4) { + printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); + } + + clock_gettime(CLOCK_MONOTONIC, &start); + dct4x4dc(matrix); + clock_gettime(CLOCK_MONOTONIC, &mid); + dct4x4dc_sse(matrix2); + clock_gettime(CLOCK_MONOTONIC, &end); + + printf("\nMatrix after dct4x4dc:\n"); + for (int i = 0; i < 16; i += 4) { + printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); + } + + printf("\nMatrix2 after dct4x4dc_sse:\n"); + for (int i = 0; i < 16; i += 4) { + printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]); + } + + long seconds1 = mid.tv_sec - start.tv_sec; + long nanoseconds1 = mid.tv_nsec - start.tv_nsec; + if (nanoseconds1 < 0) { + seconds1--; + nanoseconds1 += 1000000000; + } + long seconds2 = end.tv_sec - mid.tv_sec; + long nanoseconds2 = end.tv_nsec - mid.tv_nsec; + if (nanoseconds2 < 0) { + seconds2--; + nanoseconds2 += 1000000000; + } + s1sum += seconds1; + s2sum += seconds2; + n1sum += nanoseconds1; + n2sum += nanoseconds2; + if (n1sum > 1000000000) { + s1sum++; + n1sum -= 1000000000; + } + if (n2sum > 1000000000) { + s2sum++; + n2sum -= 1000000000; + } } + /* printf("scalar: %ld.%09ld seconds\n", seconds1, nanoseconds1); printf("SSE : %ld.%09ld seconds\n", seconds2, nanoseconds2); + */ + printf("scalar: %ld.%09ld seconds\n", s1sum, n1sum); + printf("SSE : %ld.%09ld seconds\n", s2sum, n2sum); return 0; } \ No newline at end of file From 9f83cf48d4b72e5dcf8e489e4a1c4e97acca8e36 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Fri, 29 Mar 2024 09:47:23 +0200 Subject: [PATCH 08/12] remove intermediate store --- dct.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/dct.c b/dct.c index d6d5b1e..258c054 100644 --- a/dct.c +++ b/dct.c @@ -92,15 +92,11 @@ dct4x4dc_sse(dctcoef d[16]) { __m128i shuffled4 = _mm_shuffle_epi32(totalSum4, _MM_SHUFFLE(2, 3, 3, 2)); totalSum4 = _mm_add_epi16(totalSum4, shuffled4); - // store back to d to keep the intermediate results - _mm_storel_epi64((__m128i *) &d[0], totalSum1); - _mm_storel_epi64((__m128i *) &d[4], totalSum2); - _mm_storel_epi64((__m128i *) &d[8], totalSum3); - _mm_storel_epi64((__m128i *) &d[12], totalSum4); - + // instead of storing back to d ,keep the intermediate results // PHASE 2 - row1row2 = _mm_loadu_si128((__m128i *) &d[0]); - row3row4 = _mm_loadu_si128((__m128i *) &d[8]); + // transpose again + row1row2 = _mm_unpacklo_epi64(totalSum1, totalSum2); + row3row4 = _mm_unpacklo_epi64(totalSum3, totalSum4); __m128i ones = _mm_set1_epi32(1); // to divide tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); @@ -153,6 +149,7 @@ dct4x4dc_sse(dctcoef d[16]) { _mm_storel_epi64((__m128i *) &d[8], totalSum31); _mm_storel_epi64((__m128i *) &d[12], totalSum41); + // transpose again row1row2 = _mm_loadu_si128((__m128i *) &d[0]); row3row4 = _mm_loadu_si128((__m128i *) &d[8]); // load instead of set tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); From 759705d8d444ad966b03d6c38c7541bc653fa5d2 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Fri, 29 Mar 2024 09:52:28 +0200 Subject: [PATCH 09/12] transpose in vectors(less stores-loads)- comments --- dct.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/dct.c b/dct.c index 258c054..8098ae4 100644 --- a/dct.c +++ b/dct.c @@ -94,7 +94,7 @@ dct4x4dc_sse(dctcoef d[16]) { // instead of storing back to d ,keep the intermediate results // PHASE 2 - // transpose again + // transpose in vectors again(no stores) row1row2 = _mm_unpacklo_epi64(totalSum1, totalSum2); row3row4 = _mm_unpacklo_epi64(totalSum3, totalSum4); @@ -144,14 +144,9 @@ dct4x4dc_sse(dctcoef d[16]) { totalSum41 = _mm_and_si128(totalSum41, mask); // keep only the lower 16 bits of each 32-bit integer totalSum41 = _mm_packus_epi32(totalSum41, zero); - _mm_storel_epi64((__m128i *) &d[0], totalSum11); - _mm_storel_epi64((__m128i *) &d[4], totalSum21); - _mm_storel_epi64((__m128i *) &d[8], totalSum31); - _mm_storel_epi64((__m128i *) &d[12], totalSum41); - - // transpose again - row1row2 = _mm_loadu_si128((__m128i *) &d[0]); - row3row4 = _mm_loadu_si128((__m128i *) &d[8]); // load instead of set + // transpose in vectors again(no stores) + row1row2 = _mm_unpacklo_epi64(totalSum11, totalSum21); + row3row4 = _mm_unpacklo_epi64(totalSum31, totalSum41); tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); tmp3 = _mm_unpackhi_epi16(row1row2, row3row4); _mm_storel_epi64((__m128i *) &d[0], _mm_move_epi64(_mm_unpacklo_epi16(tmp1, tmp3))); From 62513e5d57b86bd20950979d75026ae8043c27dc Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Fri, 29 Mar 2024 09:56:02 +0200 Subject: [PATCH 10/12] comment out prints of matrixes-more iters --- dct.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dct.c b/dct.c index 8098ae4..8800841 100644 --- a/dct.c +++ b/dct.c @@ -5,7 +5,7 @@ #include #define N 100 -#define ITER 1 +#define ITER 100000 typedef uint16_t dctcoef; void print_vector(__m128i v) { @@ -168,17 +168,18 @@ main() { matrix[i] = rand() & 0xFF; // 8 bit unsigned matrix2[i] = matrix[i]; } + /* printf("Original matrix:\n"); for (int i = 0; i < 16; i += 4) { printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); } - + */ clock_gettime(CLOCK_MONOTONIC, &start); dct4x4dc(matrix); clock_gettime(CLOCK_MONOTONIC, &mid); dct4x4dc_sse(matrix2); clock_gettime(CLOCK_MONOTONIC, &end); - + /* printf("\nMatrix after dct4x4dc:\n"); for (int i = 0; i < 16; i += 4) { printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); @@ -188,7 +189,7 @@ main() { for (int i = 0; i < 16; i += 4) { printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]); } - + */ long seconds1 = mid.tv_sec - start.tv_sec; long nanoseconds1 = mid.tv_nsec - start.tv_nsec; if (nanoseconds1 < 0) { From 52e7db164e8f94d33743e0305f33ac54e6053ecc Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Fri, 29 Mar 2024 13:20:56 +0200 Subject: [PATCH 11/12] change last stores --- dct.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dct.c b/dct.c index 8800841..34c3eff 100644 --- a/dct.c +++ b/dct.c @@ -5,7 +5,7 @@ #include #define N 100 -#define ITER 100000 +#define ITER 1 typedef uint16_t dctcoef; void print_vector(__m128i v) { @@ -145,14 +145,14 @@ dct4x4dc_sse(dctcoef d[16]) { totalSum41 = _mm_packus_epi32(totalSum41, zero); // transpose in vectors again(no stores) - row1row2 = _mm_unpacklo_epi64(totalSum11, totalSum21); + row1row2 = _mm_unpacklo_epi64(totalSum11, totalSum21); // interleave 64bits row3row4 = _mm_unpacklo_epi64(totalSum31, totalSum41); - tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); + tmp1 = _mm_unpacklo_epi16(row1row2, row3row4); // interleave the lower 8 16bit integers tmp3 = _mm_unpackhi_epi16(row1row2, row3row4); - _mm_storel_epi64((__m128i *) &d[0], _mm_move_epi64(_mm_unpacklo_epi16(tmp1, tmp3))); - _mm_storel_epi64((__m128i *) &d[4], _mm_srli_si128(_mm_unpacklo_epi16(tmp1, tmp3), 8)); - _mm_storel_epi64((__m128i *) &d[8], _mm_move_epi64(_mm_unpackhi_epi16(tmp1, tmp3))); - _mm_storel_epi64((__m128i *) &d[12], _mm_srli_si128(_mm_unpackhi_epi16(tmp1, tmp3), 8)); + row1row2 = _mm_unpacklo_epi16(tmp1, tmp3); + row3row4 = _mm_unpackhi_epi16(tmp1, tmp3); + _mm_storeu_si128((__m128i *) d, row1row2); // store 128bit in one go + _mm_storeu_si128((__m128i *) &d[8], row3row4); } int main() { @@ -179,7 +179,7 @@ main() { clock_gettime(CLOCK_MONOTONIC, &mid); dct4x4dc_sse(matrix2); clock_gettime(CLOCK_MONOTONIC, &end); - /* + printf("\nMatrix after dct4x4dc:\n"); for (int i = 0; i < 16; i += 4) { printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); @@ -189,7 +189,7 @@ main() { for (int i = 0; i < 16; i += 4) { printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]); } - */ + long seconds1 = mid.tv_sec - start.tv_sec; long nanoseconds1 = mid.tv_nsec - start.tv_nsec; if (nanoseconds1 < 0) { From a06bc3f0e394ccd2e954baf2e86c885d8dbdff43 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Fri, 29 Mar 2024 13:35:41 +0200 Subject: [PATCH 12/12] comment out prints --- dct.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dct.c b/dct.c index 34c3eff..7d0df59 100644 --- a/dct.c +++ b/dct.c @@ -5,7 +5,7 @@ #include #define N 100 -#define ITER 1 +#define ITER 100000 typedef uint16_t dctcoef; void print_vector(__m128i v) { @@ -179,7 +179,7 @@ main() { clock_gettime(CLOCK_MONOTONIC, &mid); dct4x4dc_sse(matrix2); clock_gettime(CLOCK_MONOTONIC, &end); - + /* printf("\nMatrix after dct4x4dc:\n"); for (int i = 0; i < 16; i += 4) { printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]); @@ -189,7 +189,7 @@ main() { for (int i = 0; i < 16; i += 4) { printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]); } - + */ long seconds1 = mid.tv_sec - start.tv_sec; long nanoseconds1 = mid.tv_nsec - start.tv_nsec; if (nanoseconds1 < 0) {