diff --git a/sgemm-small.c b/sgemm-small.c index a7eda31..83fbb34 100644 --- a/sgemm-small.c +++ b/sgemm-small.c @@ -4,186 +4,272 @@ #include #include -#define NUM_REGISTERS 4; - void square_sgemm( int n, float *A, float *B, float *C ) { - int i, j , k, l; - //int count = 0; //for debug - float At[n*n] __attribute__ ((aligned(16))); - float temp; - __m128 x; - __m128 y; - __m128 z; - __m128 a; - __m128 zero = _mm_setzero_ps(); - __m128 partialSum; - __m128 partialSum1; - __m128 partialSum2; - __m128 partialSum3; - __m128 partialSum4; - __m128 partialSum5; - __m128 partialSum6; - __m128 partialSum7; - float pSum[4]; - float pSum1[4]; - float pSum2[4]; - float pSum3[4]; - float cij=0.0, cij1=0.0, cij2=0.0, cij3=0.0, cij4=0.0, cij5=0.0, cij6=0.0, cij7=0.0; - //transpose A - //I was unable to simd this without doing extra store/loads + int f, g, h, i, j , k, l, alpha, beta, gamma; + int blockI = 64, blockJ = 64, blockK = 64; +// if (n < 300) { +// blockI = 64; +// blockJ = 64; +// blockK = 256; +// } else { +// blockI = 16; +// blockJ = 16; +// blockK = 256; +// } + float temp, temp1, temp2, temp3, temp4; + __m128 x; + __m128 y; + __m128 a; + __m128 b; + __m128 c; + __m128 d; + __m128 partialSum; + __m128 partialSum1; + __m128 partialSum2; + __m128 partialSum3; + __m128 partialSum4; + __m128 partialSum5; + __m128 partialSum6; + __m128 partialSum7; + float cij=0.0, cij1=0.0, cij2=0.0, cij3=0.0, cij4=0.0, cij5=0.0, cij6=0.0, cij7=0.0; + __m128 c1; + __m128 c2; + float *At = malloc(n*n*sizeof(float)); for (i = 0; i < n; i ++) { - for (j = 0; j < n/4*4; j += 4) { - At[i+j*n] = A[j+i*n]; - At[i+(j+1)*n] = A[j+i*n + 1]; - At[i+(j+2)*n] = A[j+i*n + 2]; - At[i+(j+3)*n] = A[j+i*n + 3]; - } - for (; j