From e75940230bbb2f79e7b9ffe68e7002ee78dcaa70 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Tue, 26 Mar 2024 16:38:27 +0200
Subject: [PATCH 01/12] normal dct

---
 dct.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 dct.c

diff --git a/dct.c b/dct.c
new file mode 100644
index 0000000..a2ff442
--- /dev/null
+++ b/dct.c
@@ -0,0 +1,115 @@
+#include <emmintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+typedef uint16_t dctcoef;
+void
+dct4x4dc(dctcoef d[16]) {
+    dctcoef tmp[16];
+
+    for (int i = 0; i < 4; i++) {
+        int s01 = d[i * 4 + 0] + d[i * 4 + 1];
+        int d01 = d[i * 4 + 0] - d[i * 4 + 1];
+        int s23 = d[i * 4 + 2] + d[i * 4 + 3];
+        int d23 = d[i * 4 + 2] - d[i * 4 + 3];
+
+        tmp[0 * 4 + i] = s01 + s23;
+        tmp[1 * 4 + i] = s01 - s23;
+        tmp[2 * 4 + i] = d01 - d23;
+        tmp[3 * 4 + i] = d01 + d23;
+    }
+
+    for (int i = 0; i < 4; i++) {
+        int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1];
+        int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1];
+        int s23 = tmp[i * 4 + 2] + tmp[i * 4 + 3];
+        int d23 = tmp[i * 4 + 2] - tmp[i * 4 + 3];
+
+        d[i * 4 + 0] = (s01 + s23 + 1) >> 1;
+        d[i * 4 + 1] = (s01 - s23 + 1) >> 1;
+        d[i * 4 + 2] = (d01 - d23 + 1) >> 1;
+        d[i * 4 + 3] = (d01 + d23 + 1) >> 1;
+    }
+}
+
+void
+dct4x4dc_sse(dctcoef d[16]) {
+    __m128i tmp[2];
+    tmp[0] = _mm_setzero_si128();   // Set first vector to zero
+    tmp[1] = _mm_setzero_si128();   // Set second vector to zero
+
+    for (int i = 0; i < 2; i++) {
+        __m128i row1 = _mm_loadu_si128((__m128i *) &d[i * 8]);
+        __m128i row2 = _mm_loadu_si128((__m128i *) &d[i * 8 + 4]);
+
+        __m128i s01s23 = _mm_add_epi16(row1, row2);
+        __m128i d01d23 = _mm_sub_epi16(row1, row2);
+
+        __m128i tmp1 = _mm_unpacklo_epi64(s01s23, d01d23);
+        __m128i tmp2 = _mm_unpackhi_epi64(s01s23, d01d23);
+
+        tmp[i] = _mm_unpacklo_epi64(tmp1, tmp2);
+    }
+    for (int i = 0; i < 2; i++) {
+        __m128i vec = tmp[i];
+
+        __m128i s01s23 = _mm_add_epi16(vec, _mm_shuffle_epi32(vec, _MM_SHUFFLE(1, 0, 3, 2)));
+        __m128i d01d23 = _mm_sub_epi16(vec, _mm_shuffle_epi32(vec, _MM_SHUFFLE(1, 0, 3, 2)));
+
+        __m128i res1 = _mm_add_epi16(s01s23, _mm_set1_epi16(1));
+        __m128i res2 = _mm_add_epi16(d01d23, _mm_set1_epi16(1));
+
+        res1 = _mm_srai_epi16(res1, 1);
+        res2 = _mm_srai_epi16(res2, 1);
+
+        _mm_storeu_si128((__m128i *) &d[i * 8], res1);
+        _mm_storeu_si128((__m128i *) &d[i * 8 + 4], res2);
+    }
+}
+int
+main() {
+    struct timespec start, mid, end;
+    srand(time(NULL));
+    dctcoef matrix[16];
+    dctcoef matrix2[16];
+    for (int i = 0; i < 16; i++) {
+        matrix[i] = rand() % 65536;
+        matrix2[i] = matrix[i];
+    }
+    printf("Original matrix:\n");
+    for (int i = 0; i < 16; i += 4) {
+        printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
+    }
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    dct4x4dc(matrix);
+    clock_gettime(CLOCK_MONOTONIC, &mid);
+    dct4x4dc_sse(matrix2);
+    clock_gettime(CLOCK_MONOTONIC, &end);
+
+    printf("\nMatrix after dct4x4dc:\n");
+    for (int i = 0; i < 16; i += 4) {
+        printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
+    }
+
+    printf("\nMatrix2 after dct4x4dc_sse:\n");
+    for (int i = 0; i < 16; i += 4) {
+        printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]);
+    }
+
+    long seconds1 = mid.tv_sec - start.tv_sec;
+    long nanoseconds1 = mid.tv_nsec - start.tv_nsec;
+    if (nanoseconds1 < 0) {
+        seconds1--;
+        nanoseconds1 += 1000000000;
+    }
+    long seconds2 = end.tv_sec - mid.tv_sec;
+    long nanoseconds2 = end.tv_nsec - mid.tv_nsec;
+    if (nanoseconds2 < 0) {
+        seconds2--;
+        nanoseconds2 += 1000000000;
+    }
+    printf("scalar: %ld.%09ld seconds\n", seconds1, nanoseconds1);
+    printf("SSE   : %ld.%09ld seconds\n", seconds2, nanoseconds2);
+    return 0;
+}
\ No newline at end of file

From e8e76638719a481f7a5692bd20aabe84927c2dc7 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Wed, 27 Mar 2024 16:18:25 +0200
Subject: [PATCH 02/12] phase1

---
 dct.c | 110 ++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 73 insertions(+), 37 deletions(-)

diff --git a/dct.c b/dct.c
index a2ff442..7664fbb 100644
--- a/dct.c
+++ b/dct.c
@@ -3,7 +3,19 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
+
+#define N 1000000000
 typedef uint16_t dctcoef;
+void
+print_vector(__m128i v) {
+    uint16_t buf[8];
+    _mm_storeu_si128((__m128i *) buf, v);
+    for (int i = 0; i < 8; i++) {
+        printf("%02x ", buf[i]);
+    }
+    printf("\n");
+}
+
 void
 dct4x4dc(dctcoef d[16]) {
     dctcoef tmp[16];
@@ -19,7 +31,9 @@ dct4x4dc(dctcoef d[16]) {
         tmp[2 * 4 + i] = d01 - d23;
         tmp[3 * 4 + i] = d01 + d23;
     }
-
+    for (int i = 0; i < 16; i = i + 4) {
+        printf("in dct_c %02x %02x %02x %02x\n", tmp[0 + i], tmp[1 + i], tmp[2 + i], tmp[3 + i]);
+    }
     for (int i = 0; i < 4; i++) {
         int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1];
         int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1];
@@ -35,37 +49,59 @@ dct4x4dc(dctcoef d[16]) {
 
 void
 dct4x4dc_sse(dctcoef d[16]) {
-    __m128i tmp[2];
-    tmp[0] = _mm_setzero_si128();   // Set first vector to zero
-    tmp[1] = _mm_setzero_si128();   // Set second vector to zero
-
-    for (int i = 0; i < 2; i++) {
-        __m128i row1 = _mm_loadu_si128((__m128i *) &d[i * 8]);
-        __m128i row2 = _mm_loadu_si128((__m128i *) &d[i * 8 + 4]);
-
-        __m128i s01s23 = _mm_add_epi16(row1, row2);
-        __m128i d01d23 = _mm_sub_epi16(row1, row2);
-
-        __m128i tmp1 = _mm_unpacklo_epi64(s01s23, d01d23);
-        __m128i tmp2 = _mm_unpackhi_epi64(s01s23, d01d23);
-
-        tmp[i] = _mm_unpacklo_epi64(tmp1, tmp2);
-    }
-    for (int i = 0; i < 2; i++) {
-        __m128i vec = tmp[i];
-
-        __m128i s01s23 = _mm_add_epi16(vec, _mm_shuffle_epi32(vec, _MM_SHUFFLE(1, 0, 3, 2)));
-        __m128i d01d23 = _mm_sub_epi16(vec, _mm_shuffle_epi32(vec, _MM_SHUFFLE(1, 0, 3, 2)));
-
-        __m128i res1 = _mm_add_epi16(s01s23, _mm_set1_epi16(1));
-        __m128i res2 = _mm_add_epi16(d01d23, _mm_set1_epi16(1));
-
-        res1 = _mm_srai_epi16(res1, 1);
-        res2 = _mm_srai_epi16(res2, 1);
-
-        _mm_storeu_si128((__m128i *) &d[i * 8], res1);
-        _mm_storeu_si128((__m128i *) &d[i * 8 + 4], res2);
-    }
+    dctcoef dT[4][4];   // This will hold the transposed matrix
+
+    // Load the rows of d into 128-bit vectors
+    __m128i row1row2 = _mm_set_epi16(d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
+    __m128i row3row4 = _mm_set_epi16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8]);
+
+    __m128i tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);
+    __m128i tmp3 = _mm_unpackhi_epi16(row1row2, row3row4);
+    __m128i tmp1_new = _mm_unpacklo_epi16(tmp1, tmp3);
+    __m128i tmp3_new = _mm_unpackhi_epi16(tmp1, tmp3);
+    __m128i upper_half_tmp1 = _mm_srli_si128(tmp1_new, 8);
+    _mm_storel_epi64((__m128i *) &dT[1][0], upper_half_tmp1);
+    _mm_storel_epi64((__m128i *) &dT[0][0], _mm_move_epi64(tmp1_new));
+    _mm_storel_epi64((__m128i *) &dT[3][0], _mm_srli_si128(tmp3_new, 8));
+    _mm_storel_epi64((__m128i *) &dT[2][0], _mm_move_epi64(tmp3_new));
+    // load rows from transposed d
+    row1row2 = _mm_loadu_si128((__m128i *) dT[0]);
+    row3row4 = _mm_loadu_si128((__m128i *) dT[2]);
+    // 1st + 2nd +3rd +4rth
+    __m128i totalSum = _mm_add_epi16(row1row2, row3row4);
+    __m128i shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2));
+    totalSum = _mm_add_epi16(totalSum, shuffled);
+    _mm_storel_epi64((__m128i *) dT[0], totalSum);
+    // 1st + 2nd -3rd -4rth
+    totalSum = _mm_sub_epi16(row1row2, row3row4);
+    shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2));
+    totalSum = _mm_add_epi16(totalSum, shuffled);
+    _mm_storel_epi64((__m128i *) dT[1], totalSum);
+    // 1st - 2nd -3rd +4rth
+    __m128i mask1 = _mm_setr_epi16(0, 0, 0, 0, -1, -1, -1, -1);
+    __m128i mask2 = _mm_setr_epi16(-1, -1, -1, -1, 0, 0, 0, 0);
+    __m128i zero = _mm_setzero_si128();
+    __m128i signMask1 = _mm_cmplt_epi16(mask1, zero);
+    __m128i signMask2 = _mm_cmplt_epi16(mask2, zero);
+    row1row2 = _mm_sub_epi16(_mm_andnot_si128(signMask1, row1row2), _mm_and_si128(signMask1, row1row2));
+    row3row4 = _mm_sub_epi16(_mm_andnot_si128(signMask2, row3row4), _mm_and_si128(signMask2, row3row4));
+    totalSum = _mm_add_epi16(row1row2, row3row4);
+    shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2));
+    totalSum = _mm_add_epi16(totalSum, shuffled);
+    _mm_storel_epi64((__m128i *) dT[2], totalSum);
+    // 1st - 2nd +3rd -4rth
+    totalSum = _mm_sub_epi16(row1row2, row3row4);
+    shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2));
+    totalSum = _mm_add_epi16(totalSum, shuffled);
+    _mm_storel_epi64((__m128i *) dT[3], totalSum);
+
+    /*
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            printf("%02x ", (uint16_t) dT[i][j]);
+        }
+        printf("\n");
+    }*/
 }
 int
 main() {
@@ -74,7 +110,7 @@ main() {
     dctcoef matrix[16];
     dctcoef matrix2[16];
     for (int i = 0; i < 16; i++) {
-        matrix[i] = rand() % 65536;
+        matrix[i] = rand() & 0xFF;   // 8 bit unsigned
         matrix2[i] = matrix[i];
     }
     printf("Original matrix:\n");
@@ -87,14 +123,14 @@ main() {
     dct4x4dc_sse(matrix2);
     clock_gettime(CLOCK_MONOTONIC, &end);
 
-    printf("\nMatrix after dct4x4dc:\n");
+    // printf("\nMatrix after dct4x4dc:\n");
     for (int i = 0; i < 16; i += 4) {
-        printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
+        // printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
     }
 
-    printf("\nMatrix2 after dct4x4dc_sse:\n");
+    // printf("\nMatrix2 after dct4x4dc_sse:\n");
     for (int i = 0; i < 16; i += 4) {
-        printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]);
+        // printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]);
     }
 
     long seconds1 = mid.tv_sec - start.tv_sec;

From cf0e7e2bf08411136db8b817dcdb41f7a3371ca9 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Thu, 28 Mar 2024 12:37:29 +0200
Subject: [PATCH 03/12] phase1 refactored

---
 dct.c | 138 ++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 92 insertions(+), 46 deletions(-)

diff --git a/dct.c b/dct.c
index 7664fbb..1d3eafe 100644
--- a/dct.c
+++ b/dct.c
@@ -11,7 +11,7 @@ print_vector(__m128i v) {
     uint16_t buf[8];
     _mm_storeu_si128((__m128i *) buf, v);
     for (int i = 0; i < 8; i++) {
-        printf("%02x ", buf[i]);
+        printf("%x ", buf[i]);
     }
     printf("\n");
 }
@@ -31,9 +31,12 @@ dct4x4dc(dctcoef d[16]) {
         tmp[2 * 4 + i] = d01 - d23;
         tmp[3 * 4 + i] = d01 + d23;
     }
-    for (int i = 0; i < 16; i = i + 4) {
-        printf("in dct_c %02x %02x %02x %02x\n", tmp[0 + i], tmp[1 + i], tmp[2 + i], tmp[3 + i]);
+
+    printf("\n");
+    for (int i = 0; i < 16; i += 4) {
+        printf("%02x %02x %02x %02x\n", tmp[i], tmp[i + 1], tmp[i + 2], tmp[i + 3]);
     }
+
     for (int i = 0; i < 4; i++) {
         int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1];
         int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1];
@@ -52,56 +55,99 @@ dct4x4dc_sse(dctcoef d[16]) {
     dctcoef dT[4][4];   // This will hold the transposed matrix
 
     // Load the rows of d into 128-bit vectors
-    __m128i row1row2 = _mm_set_epi16(d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
-    __m128i row3row4 = _mm_set_epi16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8]);
-
+    __m128i row1row2 = _mm_loadu_si128((__m128i *) &d[0]);
+    __m128i row3row4 = _mm_loadu_si128((__m128i *) &d[8]);   // load instead of set
     __m128i tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);
     __m128i tmp3 = _mm_unpackhi_epi16(row1row2, row3row4);
-    __m128i tmp1_new = _mm_unpacklo_epi16(tmp1, tmp3);
-    __m128i tmp3_new = _mm_unpackhi_epi16(tmp1, tmp3);
-    __m128i upper_half_tmp1 = _mm_srli_si128(tmp1_new, 8);
-    _mm_storel_epi64((__m128i *) &dT[1][0], upper_half_tmp1);
-    _mm_storel_epi64((__m128i *) &dT[0][0], _mm_move_epi64(tmp1_new));
-    _mm_storel_epi64((__m128i *) &dT[3][0], _mm_srli_si128(tmp3_new, 8));
-    _mm_storel_epi64((__m128i *) &dT[2][0], _mm_move_epi64(tmp3_new));
-    // load rows from transposed d
-    row1row2 = _mm_loadu_si128((__m128i *) dT[0]);
-    row3row4 = _mm_loadu_si128((__m128i *) dT[2]);
+    row1row2 = _mm_unpacklo_epi16(tmp1, tmp3);
+    row3row4 = _mm_unpackhi_epi16(tmp1, tmp3);
+
+    __m128i totalSum1 = _mm_add_epi16(row1row2, row3row4);
+    __m128i totalSum2 = _mm_sub_epi16(row1row2, row3row4);
+    __m128i shuffled1 = _mm_shuffle_epi32(totalSum1, _MM_SHUFFLE(2, 3, 3, 2));
+    __m128i shuffled2 = _mm_shuffle_epi32(totalSum2, _MM_SHUFFLE(2, 3, 3, 2));
+    totalSum1 = _mm_add_epi16(totalSum1, shuffled1);
+    totalSum2 = _mm_add_epi16(totalSum2, shuffled2);
     // 1st + 2nd +3rd +4rth
-    __m128i totalSum = _mm_add_epi16(row1row2, row3row4);
-    __m128i shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2));
-    totalSum = _mm_add_epi16(totalSum, shuffled);
-    _mm_storel_epi64((__m128i *) dT[0], totalSum);
     // 1st + 2nd -3rd -4rth
-    totalSum = _mm_sub_epi16(row1row2, row3row4);
-    shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2));
-    totalSum = _mm_add_epi16(totalSum, shuffled);
-    _mm_storel_epi64((__m128i *) dT[1], totalSum);
     // 1st - 2nd -3rd +4rth
-    __m128i mask1 = _mm_setr_epi16(0, 0, 0, 0, -1, -1, -1, -1);
-    __m128i mask2 = _mm_setr_epi16(-1, -1, -1, -1, 0, 0, 0, 0);
+    __m128i maskFF = _mm_set1_epi16(0xFF);
     __m128i zero = _mm_setzero_si128();
-    __m128i signMask1 = _mm_cmplt_epi16(mask1, zero);
-    __m128i signMask2 = _mm_cmplt_epi16(mask2, zero);
-    row1row2 = _mm_sub_epi16(_mm_andnot_si128(signMask1, row1row2), _mm_and_si128(signMask1, row1row2));
-    row3row4 = _mm_sub_epi16(_mm_andnot_si128(signMask2, row3row4), _mm_and_si128(signMask2, row3row4));
-    totalSum = _mm_add_epi16(row1row2, row3row4);
-    shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2));
-    totalSum = _mm_add_epi16(totalSum, shuffled);
-    _mm_storel_epi64((__m128i *) dT[2], totalSum);
+    __m128i mask1 = _mm_slli_si128(maskFF, 8);
+    __m128i mask2 = _mm_srli_si128(maskFF, 8);
+    __m128i masked_part = _mm_and_si128(mask1, row1row2);
+    __m128i neg_masked_part = _mm_sub_epi16(zero, masked_part);
+    row1row2 = _mm_or_si128(_mm_andnot_si128(mask1, row1row2), neg_masked_part);
+    masked_part = _mm_and_si128(mask2, row3row4);
+    neg_masked_part = _mm_sub_epi16(zero, masked_part);
+    row3row4 = _mm_or_si128(_mm_andnot_si128(mask2, row3row4), neg_masked_part);
+
+    __m128i totalSum3 = _mm_add_epi16(row1row2, row3row4);
+    __m128i shuffled3 = _mm_shuffle_epi32(totalSum3, _MM_SHUFFLE(2, 3, 3, 2));
+    totalSum3 = _mm_add_epi16(totalSum3, shuffled3);
     // 1st - 2nd +3rd -4rth
-    totalSum = _mm_sub_epi16(row1row2, row3row4);
+    __m128i totalSum4 = _mm_sub_epi16(row1row2, row3row4);
+    __m128i shuffled4 = _mm_shuffle_epi32(totalSum4, _MM_SHUFFLE(2, 3, 3, 2));
+    totalSum4 = _mm_add_epi16(totalSum4, shuffled4);
+
+    _mm_storel_epi64((__m128i *) dT[0], totalSum1);
+    _mm_storel_epi64((__m128i *) dT[1], totalSum2);
+    _mm_storel_epi64((__m128i *) dT[2], totalSum3);
+    _mm_storel_epi64((__m128i *) dT[3], totalSum4);
+    printf("\n");
+    /*
+    // PHASE 2
+    row1row2 = _mm_loadu_si128((__m128i *) &dT[0][0]);
+    row3row4 = _mm_loadu_si128((__m128i *) &dT[2][0]);   // load instead of set
+
+    // transpose dT back
+    __m128i ones = _mm_set1_epi32(1);   // to divide
+    tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);
+    tmp3 = _mm_unpackhi_epi16(row1row2, row3row4);
+    row1row2 = _mm_unpacklo_epi16(tmp1, tmp3);
+    row3row4 = _mm_unpackhi_epi16(tmp1, tmp3);
+
+    printf("\n\n");
+    print_vector(row1row2);
+    print_vector(row3row4);
+    /*
+    __m128i row1 = _mm_loadu_si128((__m128i *) &d[0]);
+    __m128i extended_row1 = _mm_unpacklo_epi16(row1, zero);
+    __m128i row2 = _mm_loadu_si128((__m128i *) &d[4]);
+    __m128i extended_row2 = _mm_unpacklo_epi16(row2, zero);
+    __m128i row3 = _mm_loadu_si128((__m128i *) &d[8]);
+    __m128i extended_row3 = _mm_unpacklo_epi16(row3, zero);
+    __m128i row4 = _mm_loadu_si128((__m128i *) &d[12]);
+    __m128i extended_row4 = _mm_unpacklo_epi16(row4, zero);
+    // 1st + 2nd +3rd +4rth
+
+    __m128i tempSum = _mm_add_epi32(extended_row1, extended_row2);
+    totalSum = _mm_add_epi32(extended_row3, extended_row4);
+    totalSum = _mm_add_epi32(totalSum, tempSum);
+    totalSum = _mm_add_epi32(totalSum, ones);
+    totalSum = _mm_srli_epi32(totalSum, 1);
+
+    print_vector(totalSum);
+    _mm_storeu_si64((__m128i *) &d[0], _mm_packs_epi32(totalSum, totalSum));
+
+    printf("\n");
+    for (int i = 0; i < 16; i += 4) {
+        printf("%02x %02x %02x %02x\n", d[i], d[i + 1], d[i + 2], d[i + 3]);
+    }
+    // 1st + 2nd +3rd +4rth
+    totalSum = _mm_add_epi32(row1, row3row4);
     shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2));
     totalSum = _mm_add_epi16(totalSum, shuffled);
-    _mm_storel_epi64((__m128i *) dT[3], totalSum);
 
+    printf("\n");
+    totalSum = _mm_add_epi16(totalSum, ones);
+    _mm_storel_epi64((__m128i *) &d[0], _mm_srli_epi16(totalSum, 1));
+    * /
     /*
-    for (int i = 0; i < 4; i++) {
-        for (int j = 0; j < 4; j++) {
-            printf("%02x ", (uint16_t) dT[i][j]);
-        }
-        printf("\n");
-    }*/
+       // 1st + 2nd -3rd -4rth
+       // 1st - 2nd -3rd +4rth
+       // 1st - 2nd +3rd -4rth
+       */
 }
 int
 main() {
@@ -123,14 +169,14 @@ main() {
     dct4x4dc_sse(matrix2);
     clock_gettime(CLOCK_MONOTONIC, &end);
 
-    // printf("\nMatrix after dct4x4dc:\n");
+    printf("\nMatrix after dct4x4dc:\n");
     for (int i = 0; i < 16; i += 4) {
-        // printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
+        printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
     }
 
-    // printf("\nMatrix2 after dct4x4dc_sse:\n");
+    printf("\nMatrix2 after dct4x4dc_sse:\n");
     for (int i = 0; i < 16; i += 4) {
-        // printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]);
+        printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]);
     }
 
     long seconds1 = mid.tv_sec - start.tv_sec;

From 35d1da076ffda7dc52e8e9bca777be0f19150193 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Thu, 28 Mar 2024 16:05:28 +0200
Subject: [PATCH 04/12] correct results

---
 dct.c | 101 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 45 deletions(-)

diff --git a/dct.c b/dct.c
index 1d3eafe..543b256 100644
--- a/dct.c
+++ b/dct.c
@@ -1,10 +1,10 @@
-#include <emmintrin.h>
+#include <smmintrin.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 
-#define N 1000000000
+#define N 100
 typedef uint16_t dctcoef;
 void
 print_vector(__m128i v) {
@@ -31,11 +31,11 @@ dct4x4dc(dctcoef d[16]) {
         tmp[2 * 4 + i] = d01 - d23;
         tmp[3 * 4 + i] = d01 + d23;
     }
-
+    /*
     printf("\n");
     for (int i = 0; i < 16; i += 4) {
         printf("%02x %02x %02x %02x\n", tmp[i], tmp[i + 1], tmp[i + 2], tmp[i + 3]);
-    }
+    }*/
 
     for (int i = 0; i < 4; i++) {
         int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1];
@@ -94,8 +94,6 @@ dct4x4dc_sse(dctcoef d[16]) {
     _mm_storel_epi64((__m128i *) dT[1], totalSum2);
     _mm_storel_epi64((__m128i *) dT[2], totalSum3);
     _mm_storel_epi64((__m128i *) dT[3], totalSum4);
-    printf("\n");
-    /*
     // PHASE 2
     row1row2 = _mm_loadu_si128((__m128i *) &dT[0][0]);
     row3row4 = _mm_loadu_si128((__m128i *) &dT[2][0]);   // load instead of set
@@ -107,47 +105,60 @@ dct4x4dc_sse(dctcoef d[16]) {
     row1row2 = _mm_unpacklo_epi16(tmp1, tmp3);
     row3row4 = _mm_unpackhi_epi16(tmp1, tmp3);
 
-    printf("\n\n");
-    print_vector(row1row2);
-    print_vector(row3row4);
-    /*
-    __m128i row1 = _mm_loadu_si128((__m128i *) &d[0]);
-    __m128i extended_row1 = _mm_unpacklo_epi16(row1, zero);
-    __m128i row2 = _mm_loadu_si128((__m128i *) &d[4]);
-    __m128i extended_row2 = _mm_unpacklo_epi16(row2, zero);
-    __m128i row3 = _mm_loadu_si128((__m128i *) &d[8]);
-    __m128i extended_row3 = _mm_unpacklo_epi16(row3, zero);
-    __m128i row4 = _mm_loadu_si128((__m128i *) &d[12]);
-    __m128i extended_row4 = _mm_unpacklo_epi16(row4, zero);
+    __m128i row1_32 = _mm_unpacklo_epi16(row1row2, zero);
+    __m128i row2_32 = _mm_unpackhi_epi16(row1row2, zero);
+    __m128i row3_32 = _mm_unpacklo_epi16(row3row4, zero);
+    __m128i row4_32 = _mm_unpackhi_epi16(row3row4, zero);
     // 1st + 2nd +3rd +4rth
+    __m128i totalSum11 = _mm_add_epi32(row1_32, row2_32);
+    __m128i totalSum12 = _mm_add_epi32(row3_32, row4_32);
+    totalSum11 = _mm_add_epi32(totalSum11, totalSum12);
+    totalSum11 = _mm_add_epi32(totalSum11, ones);
+    totalSum11 = _mm_srli_epi32(totalSum11, 1);
+    __m128i mask = _mm_set1_epi32(0x0000FFFF);
 
-    __m128i tempSum = _mm_add_epi32(extended_row1, extended_row2);
-    totalSum = _mm_add_epi32(extended_row3, extended_row4);
-    totalSum = _mm_add_epi32(totalSum, tempSum);
-    totalSum = _mm_add_epi32(totalSum, ones);
-    totalSum = _mm_srli_epi32(totalSum, 1);
-
-    print_vector(totalSum);
-    _mm_storeu_si64((__m128i *) &d[0], _mm_packs_epi32(totalSum, totalSum));
-
-    printf("\n");
-    for (int i = 0; i < 16; i += 4) {
-        printf("%02x %02x %02x %02x\n", d[i], d[i + 1], d[i + 2], d[i + 3]);
-    }
-    // 1st + 2nd +3rd +4rth
-    totalSum = _mm_add_epi32(row1, row3row4);
-    shuffled = _mm_shuffle_epi32(totalSum, _MM_SHUFFLE(2, 3, 3, 2));
-    totalSum = _mm_add_epi16(totalSum, shuffled);
-
-    printf("\n");
-    totalSum = _mm_add_epi16(totalSum, ones);
-    _mm_storel_epi64((__m128i *) &d[0], _mm_srli_epi16(totalSum, 1));
-    * /
-    /*
-       // 1st + 2nd -3rd -4rth
-       // 1st - 2nd -3rd +4rth
-       // 1st - 2nd +3rd -4rth
-       */
+    // 1st + 2nd -3rd -4rth
+    // Perform bitwise AND operation to keep only the lower 16 bits of each 32-bit integer
+    totalSum11 = _mm_and_si128(totalSum11, mask);
+    totalSum11 = _mm_packus_epi32(totalSum11, zero);
+
+    __m128i totalSum21 = _mm_add_epi32(row1_32, row2_32);
+    __m128i totalSum22 = _mm_add_epi32(row3_32, row4_32);
+    totalSum21 = _mm_sub_epi32(totalSum21, totalSum22);
+    totalSum21 = _mm_add_epi32(totalSum21, ones);
+    totalSum21 = _mm_srli_epi32(totalSum21, 1);
+    totalSum21 = _mm_and_si128(totalSum21, mask);   // keep only the lower 16 bits of each 32-bit integer
+    totalSum21 = _mm_packus_epi32(totalSum21, zero);
+
+    __m128i totalSum31 = _mm_sub_epi32(row1_32, row2_32);
+    __m128i totalSum32 = _mm_sub_epi32(row4_32, row3_32);
+    totalSum31 = _mm_add_epi32(totalSum31, totalSum32);
+    totalSum31 = _mm_add_epi32(totalSum31, ones);
+    totalSum31 = _mm_srli_epi32(totalSum31, 1);
+    totalSum31 = _mm_and_si128(totalSum31, mask);   // keep only the lower 16 bits of each 32-bit integer
+    totalSum31 = _mm_packus_epi32(totalSum31, zero);
+    // 1st - 2nd -3rd +4rth
+    __m128i totalSum41 = _mm_sub_epi32(row1_32, row2_32);
+    __m128i totalSum42 = _mm_sub_epi32(row3_32, row4_32);
+    totalSum41 = _mm_add_epi32(totalSum41, totalSum42);
+    totalSum41 = _mm_add_epi32(totalSum41, ones);
+    totalSum41 = _mm_srli_epi32(totalSum41, 1);
+    totalSum41 = _mm_and_si128(totalSum41, mask);   // keep only the lower 16 bits of each 32-bit integer
+    totalSum41 = _mm_packus_epi32(totalSum41, zero);
+
+    _mm_storel_epi64((__m128i *) &d[0], totalSum11);
+    _mm_storel_epi64((__m128i *) &d[4], totalSum21);
+    _mm_storel_epi64((__m128i *) &d[8], totalSum31);
+    _mm_storel_epi64((__m128i *) &d[12], totalSum41);
+
+    row1row2 = _mm_loadu_si128((__m128i *) &d[0]);
+    row3row4 = _mm_loadu_si128((__m128i *) &d[8]);   // load instead of set
+    tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);
+    tmp3 = _mm_unpackhi_epi16(row1row2, row3row4);
+    _mm_storel_epi64((__m128i *) &d[0], _mm_move_epi64(_mm_unpacklo_epi16(tmp1, tmp3)));
+    _mm_storel_epi64((__m128i *) &d[4], _mm_srli_si128(_mm_unpacklo_epi16(tmp1, tmp3), 8));
+    _mm_storel_epi64((__m128i *) &d[8], _mm_move_epi64(_mm_unpackhi_epi16(tmp1, tmp3)));
+    _mm_storel_epi64((__m128i *) &d[12], _mm_srli_si128(_mm_unpackhi_epi16(tmp1, tmp3), 8));
 }
 int
 main() {

From f7804bf17d47f07ebb6da3e36e948db0523deb97 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Thu, 28 Mar 2024 16:14:21 +0200
Subject: [PATCH 05/12] add dct

---
 Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 3eb1080..ee2c2ef 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,8 @@ ALL= average_avx512\
         average_sse\
         scalarxmat44\
         mat44xmat44\
-        vect4xmat44 
+        vect4xmat44\
+		dct
 all: $(ALL)
 
 average_sse: average_sse.c
@@ -27,6 +28,8 @@ mat44xmat44: mat44xmat44.c
 	$(CC) $(CFLAGS) -mavx2 -mavx512f mat44xmat44.c -o mat44xmat44
 vect4xmat44: vect4xmat44.c
 	$(CC) $(CFLAGS) -mavx2 -mavx512f -mfma vect4xmat44.c -o vect4xmat44
+dct: dct.c
+	$(CC) $(CFLAGS) -msse4.1 dct.c -o dct
 
 else ifeq ($(filter $(ARCH),arm64 aarch64),$(ARCH))
 CFLAGS += -march=native

From a85e67f3a62c2381525ad5652d4e56767653dc0a Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Thu, 28 Mar 2024 16:29:37 +0200
Subject: [PATCH 06/12] 2 transposes

---
 dct.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/dct.c b/dct.c
index 543b256..487a200 100644
--- a/dct.c
+++ b/dct.c
@@ -90,14 +90,14 @@ dct4x4dc_sse(dctcoef d[16]) {
     __m128i shuffled4 = _mm_shuffle_epi32(totalSum4, _MM_SHUFFLE(2, 3, 3, 2));
     totalSum4 = _mm_add_epi16(totalSum4, shuffled4);
 
-    _mm_storel_epi64((__m128i *) dT[0], totalSum1);
-    _mm_storel_epi64((__m128i *) dT[1], totalSum2);
-    _mm_storel_epi64((__m128i *) dT[2], totalSum3);
-    _mm_storel_epi64((__m128i *) dT[3], totalSum4);
-    // PHASE 2
-    row1row2 = _mm_loadu_si128((__m128i *) &dT[0][0]);
-    row3row4 = _mm_loadu_si128((__m128i *) &dT[2][0]);   // load instead of set
+    _mm_storel_epi64((__m128i *) &d[0], totalSum1);
+    _mm_storel_epi64((__m128i *) &d[4], totalSum2);
+    _mm_storel_epi64((__m128i *) &d[8], totalSum3);
+    _mm_storel_epi64((__m128i *) &d[12], totalSum4);
 
+    // PHASE 2
+    row1row2 = _mm_loadu_si128((__m128i *) &d[0]);
+    row3row4 = _mm_loadu_si128((__m128i *) &d[8]);
     // transpose dT back
     __m128i ones = _mm_set1_epi32(1);   // to divide
     tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);

From 53906296d98cacd74bd7eeabc66154c6ee7a8359 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Fri, 29 Mar 2024 09:30:30 +0200
Subject: [PATCH 07/12] average timing, some comments

---
 dct.c | 128 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 76 insertions(+), 52 deletions(-)

diff --git a/dct.c b/dct.c
index 487a200..d6d5b1e 100644
--- a/dct.c
+++ b/dct.c
@@ -4,7 +4,8 @@
 #include <stdlib.h>
 #include <time.h>
 
-#define N 100
+#define N    100
+#define ITER 1
 typedef uint16_t dctcoef;
 void
 print_vector(__m128i v) {
@@ -52,9 +53,9 @@ dct4x4dc(dctcoef d[16]) {
 
 void
 dct4x4dc_sse(dctcoef d[16]) {
-    dctcoef dT[4][4];   // This will hold the transposed matrix
-
     // Load the rows of d into 128-bit vectors
+    // transpose d and keep them in row1row2, row3row4
+    // transpose 1 time
     __m128i row1row2 = _mm_loadu_si128((__m128i *) &d[0]);
     __m128i row3row4 = _mm_loadu_si128((__m128i *) &d[8]);   // load instead of set
     __m128i tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);
@@ -62,15 +63,17 @@ dct4x4dc_sse(dctcoef d[16]) {
     row1row2 = _mm_unpacklo_epi16(tmp1, tmp3);
     row3row4 = _mm_unpackhi_epi16(tmp1, tmp3);
 
+    // 1st(d0 d4 d8 d12) + 2nd(d1 d5 d9 d13) +
+    // 3rd(d2 d6 d10 d14) +4rth(d3 d7 d11 d15)
+
+    // 1st + 2nd -3rd -4rth (same logic)
     __m128i totalSum1 = _mm_add_epi16(row1row2, row3row4);
     __m128i totalSum2 = _mm_sub_epi16(row1row2, row3row4);
     __m128i shuffled1 = _mm_shuffle_epi32(totalSum1, _MM_SHUFFLE(2, 3, 3, 2));
     __m128i shuffled2 = _mm_shuffle_epi32(totalSum2, _MM_SHUFFLE(2, 3, 3, 2));
     totalSum1 = _mm_add_epi16(totalSum1, shuffled1);
     totalSum2 = _mm_add_epi16(totalSum2, shuffled2);
-    // 1st + 2nd +3rd +4rth
-    // 1st + 2nd -3rd -4rth
-    // 1st - 2nd -3rd +4rth
+    // 1st - 2nd -3rd +4rth (same logic)
     __m128i maskFF = _mm_set1_epi16(0xFF);
     __m128i zero = _mm_setzero_si128();
     __m128i mask1 = _mm_slli_si128(maskFF, 8);
@@ -81,15 +84,15 @@ dct4x4dc_sse(dctcoef d[16]) {
     masked_part = _mm_and_si128(mask2, row3row4);
     neg_masked_part = _mm_sub_epi16(zero, masked_part);
     row3row4 = _mm_or_si128(_mm_andnot_si128(mask2, row3row4), neg_masked_part);
-
     __m128i totalSum3 = _mm_add_epi16(row1row2, row3row4);
     __m128i shuffled3 = _mm_shuffle_epi32(totalSum3, _MM_SHUFFLE(2, 3, 3, 2));
     totalSum3 = _mm_add_epi16(totalSum3, shuffled3);
-    // 1st - 2nd +3rd -4rth
+    // 1st - 2nd +3rd -4rth (same logic)
     __m128i totalSum4 = _mm_sub_epi16(row1row2, row3row4);
     __m128i shuffled4 = _mm_shuffle_epi32(totalSum4, _MM_SHUFFLE(2, 3, 3, 2));
     totalSum4 = _mm_add_epi16(totalSum4, shuffled4);
 
+    // store back to d to keep the intermediate results
     _mm_storel_epi64((__m128i *) &d[0], totalSum1);
     _mm_storel_epi64((__m128i *) &d[4], totalSum2);
     _mm_storel_epi64((__m128i *) &d[8], totalSum3);
@@ -98,18 +101,17 @@ dct4x4dc_sse(dctcoef d[16]) {
     // PHASE 2
     row1row2 = _mm_loadu_si128((__m128i *) &d[0]);
     row3row4 = _mm_loadu_si128((__m128i *) &d[8]);
-    // transpose dT back
+
     __m128i ones = _mm_set1_epi32(1);   // to divide
     tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);
     tmp3 = _mm_unpackhi_epi16(row1row2, row3row4);
     row1row2 = _mm_unpacklo_epi16(tmp1, tmp3);
     row3row4 = _mm_unpackhi_epi16(tmp1, tmp3);
-
     __m128i row1_32 = _mm_unpacklo_epi16(row1row2, zero);
     __m128i row2_32 = _mm_unpackhi_epi16(row1row2, zero);
     __m128i row3_32 = _mm_unpacklo_epi16(row3row4, zero);
     __m128i row4_32 = _mm_unpackhi_epi16(row3row4, zero);
-    // 1st + 2nd +3rd +4rth
+    // 1st + 2nd +3rd +4rth (same logic)
     __m128i totalSum11 = _mm_add_epi32(row1_32, row2_32);
     __m128i totalSum12 = _mm_add_epi32(row3_32, row4_32);
     totalSum11 = _mm_add_epi32(totalSum11, totalSum12);
@@ -117,7 +119,7 @@ dct4x4dc_sse(dctcoef d[16]) {
     totalSum11 = _mm_srli_epi32(totalSum11, 1);
     __m128i mask = _mm_set1_epi32(0x0000FFFF);
 
-    // 1st + 2nd -3rd -4rth
+    // 1st + 2nd -3rd -4rth (same logic)
     // Perform bitwise AND operation to keep only the lower 16 bits of each 32-bit integer
     totalSum11 = _mm_and_si128(totalSum11, mask);
     totalSum11 = _mm_packus_epi32(totalSum11, zero);
@@ -129,7 +131,7 @@ dct4x4dc_sse(dctcoef d[16]) {
     totalSum21 = _mm_srli_epi32(totalSum21, 1);
     totalSum21 = _mm_and_si128(totalSum21, mask);   // keep only the lower 16 bits of each 32-bit integer
     totalSum21 = _mm_packus_epi32(totalSum21, zero);
-
+    // 1st - 2nd -3rd +4rth (same logic)
     __m128i totalSum31 = _mm_sub_epi32(row1_32, row2_32);
     __m128i totalSum32 = _mm_sub_epi32(row4_32, row3_32);
     totalSum31 = _mm_add_epi32(totalSum31, totalSum32);
@@ -137,7 +139,7 @@ dct4x4dc_sse(dctcoef d[16]) {
     totalSum31 = _mm_srli_epi32(totalSum31, 1);
     totalSum31 = _mm_and_si128(totalSum31, mask);   // keep only the lower 16 bits of each 32-bit integer
     totalSum31 = _mm_packus_epi32(totalSum31, zero);
-    // 1st - 2nd -3rd +4rth
+    // 1st - 2nd +3rd -4rth (same logic)
     __m128i totalSum41 = _mm_sub_epi32(row1_32, row2_32);
     __m128i totalSum42 = _mm_sub_epi32(row3_32, row4_32);
     totalSum41 = _mm_add_epi32(totalSum41, totalSum42);
@@ -163,46 +165,68 @@ dct4x4dc_sse(dctcoef d[16]) {
 int
 main() {
     struct timespec start, mid, end;
-    srand(time(NULL));
-    dctcoef matrix[16];
-    dctcoef matrix2[16];
-    for (int i = 0; i < 16; i++) {
-        matrix[i] = rand() & 0xFF;   // 8 bit unsigned
-        matrix2[i] = matrix[i];
-    }
-    printf("Original matrix:\n");
-    for (int i = 0; i < 16; i += 4) {
-        printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
-    }
-    clock_gettime(CLOCK_MONOTONIC, &start);
-    dct4x4dc(matrix);
-    clock_gettime(CLOCK_MONOTONIC, &mid);
-    dct4x4dc_sse(matrix2);
-    clock_gettime(CLOCK_MONOTONIC, &end);
-
-    printf("\nMatrix after dct4x4dc:\n");
-    for (int i = 0; i < 16; i += 4) {
-        printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
-    }
-
-    printf("\nMatrix2 after dct4x4dc_sse:\n");
-    for (int i = 0; i < 16; i += 4) {
-        printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]);
-    }
-
-    long seconds1 = mid.tv_sec - start.tv_sec;
-    long nanoseconds1 = mid.tv_nsec - start.tv_nsec;
-    if (nanoseconds1 < 0) {
-        seconds1--;
-        nanoseconds1 += 1000000000;
-    }
-    long seconds2 = end.tv_sec - mid.tv_sec;
-    long nanoseconds2 = end.tv_nsec - mid.tv_nsec;
-    if (nanoseconds2 < 0) {
-        seconds2--;
-        nanoseconds2 += 1000000000;
+    long s1sum = 0, s2sum = 0, n1sum = 0, n2sum = 0;   // running scalar (1) / SSE (2) totals: seconds + nanoseconds
+    int z = ITER;
+    srand(time(NULL));   // seed once: time() has 1 s resolution, reseeding per iteration repeats the same matrix
+    while (z--) {
+        dctcoef matrix[16];
+        dctcoef matrix2[16];
+
+        for (int i = 0; i < 16; i++) {
+            matrix[i] = rand() & 0xFF;   // 8 bit unsigned
+            matrix2[i] = matrix[i];
+        }
+        printf("Original matrix:\n");
+        for (int i = 0; i < 16; i += 4) {
+            printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
+        }
+
+        clock_gettime(CLOCK_MONOTONIC, &start);
+        dct4x4dc(matrix);
+        clock_gettime(CLOCK_MONOTONIC, &mid);
+        dct4x4dc_sse(matrix2);
+        clock_gettime(CLOCK_MONOTONIC, &end);
+
+        printf("\nMatrix after dct4x4dc:\n");
+        for (int i = 0; i < 16; i += 4) {
+            printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
+        }
+
+        printf("\nMatrix2 after dct4x4dc_sse:\n");
+        for (int i = 0; i < 16; i += 4) {
+            printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]);
+        }
+
+        long seconds1 = mid.tv_sec - start.tv_sec;
+        long nanoseconds1 = mid.tv_nsec - start.tv_nsec;
+        if (nanoseconds1 < 0) {
+            seconds1--;
+            nanoseconds1 += 1000000000;
+        }
+        long seconds2 = end.tv_sec - mid.tv_sec;
+        long nanoseconds2 = end.tv_nsec - mid.tv_nsec;
+        if (nanoseconds2 < 0) {
+            seconds2--;
+            nanoseconds2 += 1000000000;
+        }
+        s1sum += seconds1;
+        s2sum += seconds2;
+        n1sum += nanoseconds1;
+        n2sum += nanoseconds2;
+        if (n1sum >= 1000000000) {   // carry a whole second; >= so exactly 1e9 ns is not printed as ".1000000000"
+            s1sum++;
+            n1sum -= 1000000000;
+        }
+        if (n2sum >= 1000000000) {   // same carry for the SSE totals
+            s2sum++;
+            n2sum -= 1000000000;
+        }
     }
+    /*
     printf("scalar: %ld.%09ld seconds\n", seconds1, nanoseconds1);
     printf("SSE   : %ld.%09ld seconds\n", seconds2, nanoseconds2);
+    */
+    printf("scalar: %ld.%09ld seconds\n", s1sum, n1sum);
+    printf("SSE   : %ld.%09ld seconds\n", s2sum, n2sum);
     return 0;
 }
\ No newline at end of file

From 9f83cf48d4b72e5dcf8e489e4a1c4e97acca8e36 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Fri, 29 Mar 2024 09:47:23 +0200
Subject: [PATCH 08/12] remove intermediate store

---
 dct.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/dct.c b/dct.c
index d6d5b1e..258c054 100644
--- a/dct.c
+++ b/dct.c
@@ -92,15 +92,11 @@ dct4x4dc_sse(dctcoef d[16]) {
     __m128i shuffled4 = _mm_shuffle_epi32(totalSum4, _MM_SHUFFLE(2, 3, 3, 2));
     totalSum4 = _mm_add_epi16(totalSum4, shuffled4);
 
-    // store back to d to keep the intermediate results
-    _mm_storel_epi64((__m128i *) &d[0], totalSum1);
-    _mm_storel_epi64((__m128i *) &d[4], totalSum2);
-    _mm_storel_epi64((__m128i *) &d[8], totalSum3);
-    _mm_storel_epi64((__m128i *) &d[12], totalSum4);
-
+    // instead of storing back to d ,keep the intermediate results
     // PHASE 2
-    row1row2 = _mm_loadu_si128((__m128i *) &d[0]);
-    row3row4 = _mm_loadu_si128((__m128i *) &d[8]);
+    // transpose again
+    row1row2 = _mm_unpacklo_epi64(totalSum1, totalSum2);
+    row3row4 = _mm_unpacklo_epi64(totalSum3, totalSum4);
 
     __m128i ones = _mm_set1_epi32(1);   // to divide
     tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);
@@ -153,6 +149,7 @@ dct4x4dc_sse(dctcoef d[16]) {
     _mm_storel_epi64((__m128i *) &d[8], totalSum31);
     _mm_storel_epi64((__m128i *) &d[12], totalSum41);
 
+    // transpose again
     row1row2 = _mm_loadu_si128((__m128i *) &d[0]);
     row3row4 = _mm_loadu_si128((__m128i *) &d[8]);   // load instead of set
     tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);

From 759705d8d444ad966b03d6c38c7541bc653fa5d2 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Fri, 29 Mar 2024 09:52:28 +0200
Subject: [PATCH 09/12] transpose in vectors (fewer stores/loads); comments

---
 dct.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/dct.c b/dct.c
index 258c054..8098ae4 100644
--- a/dct.c
+++ b/dct.c
@@ -94,7 +94,7 @@ dct4x4dc_sse(dctcoef d[16]) {
 
     // instead of storing back to d ,keep the intermediate results
     // PHASE 2
-    // transpose again
+    // transpose in vectors again(no stores)
     row1row2 = _mm_unpacklo_epi64(totalSum1, totalSum2);
     row3row4 = _mm_unpacklo_epi64(totalSum3, totalSum4);
 
@@ -144,14 +144,9 @@ dct4x4dc_sse(dctcoef d[16]) {
     totalSum41 = _mm_and_si128(totalSum41, mask);   // keep only the lower 16 bits of each 32-bit integer
     totalSum41 = _mm_packus_epi32(totalSum41, zero);
 
-    _mm_storel_epi64((__m128i *) &d[0], totalSum11);
-    _mm_storel_epi64((__m128i *) &d[4], totalSum21);
-    _mm_storel_epi64((__m128i *) &d[8], totalSum31);
-    _mm_storel_epi64((__m128i *) &d[12], totalSum41);
-
-    // transpose again
-    row1row2 = _mm_loadu_si128((__m128i *) &d[0]);
-    row3row4 = _mm_loadu_si128((__m128i *) &d[8]);   // load instead of set
+    // transpose in vectors again(no stores)
+    row1row2 = _mm_unpacklo_epi64(totalSum11, totalSum21);
+    row3row4 = _mm_unpacklo_epi64(totalSum31, totalSum41);
     tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);
     tmp3 = _mm_unpackhi_epi16(row1row2, row3row4);
     _mm_storel_epi64((__m128i *) &d[0], _mm_move_epi64(_mm_unpacklo_epi16(tmp1, tmp3)));

From 62513e5d57b86bd20950979d75026ae8043c27dc Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Fri, 29 Mar 2024 09:56:02 +0200
Subject: [PATCH 10/12] comment out matrix prints; more iterations

---
 dct.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/dct.c b/dct.c
index 8098ae4..8800841 100644
--- a/dct.c
+++ b/dct.c
@@ -5,7 +5,7 @@
 #include <time.h>
 
 #define N    100
-#define ITER 1
+#define ITER 100000
 typedef uint16_t dctcoef;
 void
 print_vector(__m128i v) {
@@ -168,17 +168,18 @@ main() {
             matrix[i] = rand() & 0xFF;   // 8 bit unsigned
             matrix2[i] = matrix[i];
         }
+        /*
         printf("Original matrix:\n");
         for (int i = 0; i < 16; i += 4) {
             printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
         }
-
+        */
         clock_gettime(CLOCK_MONOTONIC, &start);
         dct4x4dc(matrix);
         clock_gettime(CLOCK_MONOTONIC, &mid);
         dct4x4dc_sse(matrix2);
         clock_gettime(CLOCK_MONOTONIC, &end);
-
+        /*
         printf("\nMatrix after dct4x4dc:\n");
         for (int i = 0; i < 16; i += 4) {
             printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
@@ -188,7 +189,7 @@ main() {
         for (int i = 0; i < 16; i += 4) {
             printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]);
         }
-
+        */
         long seconds1 = mid.tv_sec - start.tv_sec;
         long nanoseconds1 = mid.tv_nsec - start.tv_nsec;
         if (nanoseconds1 < 0) {

From 52e7db164e8f94d33743e0305f33ac54e6053ecc Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Fri, 29 Mar 2024 13:20:56 +0200
Subject: [PATCH 11/12] change last stores

---
 dct.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/dct.c b/dct.c
index 8800841..34c3eff 100644
--- a/dct.c
+++ b/dct.c
@@ -5,7 +5,7 @@
 #include <time.h>
 
 #define N    100
-#define ITER 100000
+#define ITER 1
 typedef uint16_t dctcoef;
 void
 print_vector(__m128i v) {
@@ -145,14 +145,14 @@ dct4x4dc_sse(dctcoef d[16]) {
     totalSum41 = _mm_packus_epi32(totalSum41, zero);
 
     // transpose in vectors again(no stores)
-    row1row2 = _mm_unpacklo_epi64(totalSum11, totalSum21);
+    row1row2 = _mm_unpacklo_epi64(totalSum11, totalSum21);   // interleave 64bits
     row3row4 = _mm_unpacklo_epi64(totalSum31, totalSum41);
-    tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);
+    tmp1 = _mm_unpacklo_epi16(row1row2, row3row4);   // interleave the low four 16-bit lanes of each operand
     tmp3 = _mm_unpackhi_epi16(row1row2, row3row4);
-    _mm_storel_epi64((__m128i *) &d[0], _mm_move_epi64(_mm_unpacklo_epi16(tmp1, tmp3)));
-    _mm_storel_epi64((__m128i *) &d[4], _mm_srli_si128(_mm_unpacklo_epi16(tmp1, tmp3), 8));
-    _mm_storel_epi64((__m128i *) &d[8], _mm_move_epi64(_mm_unpackhi_epi16(tmp1, tmp3)));
-    _mm_storel_epi64((__m128i *) &d[12], _mm_srli_si128(_mm_unpackhi_epi16(tmp1, tmp3), 8));
+    row1row2 = _mm_unpacklo_epi16(tmp1, tmp3);
+    row3row4 = _mm_unpackhi_epi16(tmp1, tmp3);
+    _mm_storeu_si128((__m128i *) d, row1row2);   // store 128bit in one go
+    _mm_storeu_si128((__m128i *) &d[8], row3row4);
 }
 int
 main() {
@@ -179,7 +179,7 @@ main() {
         clock_gettime(CLOCK_MONOTONIC, &mid);
         dct4x4dc_sse(matrix2);
         clock_gettime(CLOCK_MONOTONIC, &end);
-        /*
+
         printf("\nMatrix after dct4x4dc:\n");
         for (int i = 0; i < 16; i += 4) {
             printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
@@ -189,7 +189,7 @@ main() {
         for (int i = 0; i < 16; i += 4) {
             printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]);
         }
-        */
+
         long seconds1 = mid.tv_sec - start.tv_sec;
         long nanoseconds1 = mid.tv_nsec - start.tv_nsec;
         if (nanoseconds1 < 0) {

From a06bc3f0e394ccd2e954baf2e86c885d8dbdff43 Mon Sep 17 00:00:00 2001
From: gtsoul-tech <gtsoulkanakis@gmail.com>
Date: Fri, 29 Mar 2024 13:35:41 +0200
Subject: [PATCH 12/12] comment out prints

---
 dct.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dct.c b/dct.c
index 34c3eff..7d0df59 100644
--- a/dct.c
+++ b/dct.c
@@ -5,7 +5,7 @@
 #include <time.h>
 
 #define N    100
-#define ITER 1
+#define ITER 100000
 typedef uint16_t dctcoef;
 void
 print_vector(__m128i v) {
@@ -179,7 +179,7 @@ main() {
         clock_gettime(CLOCK_MONOTONIC, &mid);
         dct4x4dc_sse(matrix2);
         clock_gettime(CLOCK_MONOTONIC, &end);
-
+        /*
         printf("\nMatrix after dct4x4dc:\n");
         for (int i = 0; i < 16; i += 4) {
             printf("%02x %02x %02x %02x\n", matrix[i], matrix[i + 1], matrix[i + 2], matrix[i + 3]);
@@ -189,7 +189,7 @@ main() {
         for (int i = 0; i < 16; i += 4) {
             printf("%02x %02x %02x %02x\n", matrix2[i], matrix2[i + 1], matrix2[i + 2], matrix2[i + 3]);
         }
-
+        */
         long seconds1 = mid.tv_sec - start.tv_sec;
         long nanoseconds1 = mid.tv_nsec - start.tv_nsec;
         if (nanoseconds1 < 0) {