From e8068f6d525315adc1ed26b036b9fe29495298c9 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Mon, 21 Nov 2022 02:39:28 +0900 Subject: [PATCH 1/5] Arm NEON Improve C-Prefetching for DGEMM - Only DGEMM at this moment. - Prefetch whole lines. - Scatter prefetching insts. --- kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c | 76 +++++++++++--------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c index b0df23fb06..c9a7a21311 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c @@ -89,12 +89,18 @@ * +---+ +---+ +---+ * */ -#define DGEMM_8X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,C60,C61,C62,C70,C71,C72,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ +#define DGEMM_8X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,C60,C61,C62,C70,C71,C72,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,CADDR,RSC,LASTB,PRFC) \ + GEMM_PRFC_FH_ ##PRFC (CADDR) \ DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \ + GEMM_PRFC_LH_FWD_ ##PRFC (CADDR,RSC,LASTB) \ DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \ + GEMM_PRFC_FH_ ##PRFC (CADDR) \ DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \ + GEMM_PRFC_LH_FWD_ ##PRFC (CADDR,RSC,LASTB) \ DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \ + GEMM_PRFC_FH_ ##PRFC (CADDR) \ DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \ + GEMM_PRFC_LH_FWD_ ##PRFC (CADDR,RSC,LASTB) \ DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \ DGEMM_LOAD2V_ ##LOADNEXT (A0,A1,AADDR,ASHIFT) \ DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \ @@ -116,6 +122,15 @@ DGEMM_LOAD1V_load(V1,ADDR,IMM) \ DGEMM_LOAD1V_load(V2,ADDR,IMM+16) +// Interleaving prefetch or not. 
+#define GEMM_PRFC_FH_noload(CADDR) +#define GEMM_PRFC_LH_FWD_noload(CADDR,RSC,LASTB) +#define GEMM_PRFC_FH_load(CADDR) \ +" prfm PLDL1KEEP, ["#CADDR"] \n\t" +#define GEMM_PRFC_LH_FWD_load(CADDR,RSC,LASTB) \ +" prfm PLDL1KEEP, ["#CADDR", "#LASTB"] \n\t" \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + // For contiguous storage of C. #define DLOADC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \ DLOAD2V(C0,C1,CADDR,CSHIFT) \ @@ -173,16 +188,13 @@ void bli_sgemm_armv8a_asm_12x8r " lsl x3, x3, #2 \n\t" // rs_b " lsl x6, x6, #2 \n\t" // rs_c " \n\t" -" cmp %w[ct], wzr \n\t" " mov x9, x5 \n\t" -BNE(SEND_PRFMC_FH) PRFMC_FWD(x9,x6,32) // Prefetch C 01/12. PRFMC_FWD(x9,x6,32) // Prefetch C 02/12. PRFMC_FWD(x9,x6,32) // Prefetch C 03/12. PRFMC_FWD(x9,x6,32) // Prefetch C 04/12. PRFMC_FWD(x9,x6,32) // Prefetch C 05/12. PRFMC_FWD(x9,x6,32) // Prefetch C 06/12. -LABEL(SEND_PRFMC_FH) " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" @@ -206,15 +218,12 @@ BEQ(SCLEAR_CCOLS) " add x0, x0, x2 \n\t" " ldr q27, [x0, #16*0] \n\t" " \n\t" -" cmp %w[ct], wzr \n\t" -BNE(SEND_PRFMC_LH) PRFMC_FWD(x9,x6,32) // Prefetch C 07/12. PRFMC_FWD(x9,x6,32) // Prefetch C 08/12. PRFMC_FWD(x9,x6,32) // Prefetch C 09/12. PRFMC_FWD(x9,x6,32) // Prefetch C 10/12. PRFMC_FWD(x9,x6,32) // Prefetch C 11/12. PRFMC_FWD(x9,x6,32) // Prefetch C 12/12. -LABEL(SEND_PRFMC_LH) " cmp x4, #0 \n\t" // Reset branching flag. " \n\t" " ldr q28, [x1, #16*0] \n\t" // Load B. @@ -359,8 +368,7 @@ LABEL(SEND_WRITE_MEM) [alpha] "m" (alpha), [beta] "m" (beta), [a_next] "m" (a_next), - [b_next] "m" (b_next), - [ct] "r" (_use_ct) // Defined by macro. 
+ [b_next] "m" (b_next) : "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13","v14","v15", @@ -418,18 +426,7 @@ void bli_dgemm_armv8a_asm_8x6r " lsl x3, x3, #3 \n\t" // rs_b " lsl x6, x6, #3 \n\t" // rs_c " \n\t" -" cmp %w[ct], wzr \n\t" " mov x9, x5 \n\t" -BNE(DEND_PRFMC) -PRFMC_FWD(x9,x6,40) // Prefetch C 1/8. -PRFMC_FWD(x9,x6,40) // Prefetch C 2/8. -PRFMC_FWD(x9,x6,40) // Prefetch C 3/8. -PRFMC_FWD(x9,x6,40) // Prefetch C 4/8. -PRFMC_FWD(x9,x6,40) // Prefetch C 5/8. -PRFMC_FWD(x9,x6,40) // Prefetch C 6/8. -PRFMC_FWD(x9,x6,40) // Prefetch C 7/8. -PRFMC_FWD(x9,x6,40) // Prefetch C 8/8. -LABEL(DEND_PRFMC) " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" @@ -439,8 +436,8 @@ LABEL(DEND_PRFMC) // V[24:27] <- A // V[28:31] <- B // Under this scheme, the following is defined: -#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ - DGEMM_8X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) +#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,PRFC) \ + DGEMM_8X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,x9,x6,40,PRFC) // Load from memory. LABEL(DLOAD_ABC) " \n\t" // No-microkernel early return is a must @@ -458,40 +455,52 @@ BEQ(DCLEAR_CCOLS) " ldr q30, [x1, #16*2] \n\t" " add x1, x1, x3 \n\t" " ldr q31, [x1, #16*0] \n\t" -LABEL(DCLEAR_CCOLS) +LABEL(DCLEAR_CCOLS) // Clear registers & prefetch some of C microtiles. +GEMM_PRFC_FH_load(x9) // Prefetch C 0.5/8. CLEAR8V(0,1,2,3,4,5,6,7) +GEMM_PRFC_LH_FWD_load(x9,x6,40) // Prefetch C 1/8. CLEAR8V(8,9,10,11,12,13,14,15) +GEMM_PRFC_FH_load(x9) // Prefetch C 1.5/8. CLEAR8V(16,17,18,19,20,21,22,23) +GEMM_PRFC_LH_FWD_load(x9,x6,40) // Prefetch C 2/8. 
// No-microkernel early return, once again. BEQ(DK_LEFT_LOOP) // // Microkernel is defined here as: -#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1,B2) \ - DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,x0,0,x1,16,load) \ +#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1,B2,PRFC) \ + DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,x0,0,x1,16,load,PRFC) \ "add x1, x1, x3 \n\t" \ "ldr q"#B2", [x1, #16*0] \n\t" \ "ldr q"#A2", [x0, #16*2] \n\t" \ "ldr q"#A3", [x0, #16*3] \n\t" \ "add x0, x0, x2 \n\t" +// Start microkernel loop -- Special treatment for the very first loop. +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,30,load) // Prefetch C 3-5/8. +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,31,28,29,load) // Prefetch C 6-8/8. +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(DFIN_MKER_LOOP) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,30,31,28,noload) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,29,30,31,noload) // Start microkernel loop. LABEL(DK_MKER_LOOP) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,30) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,31,28,29) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,30,noload) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,31,28,29,noload) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(DFIN_MKER_LOOP) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,30,31,28) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,29,30,31) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,30,31,28,noload) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,29,30,31,noload) BRANCH(DK_MKER_LOOP) // // Final microkernel loop. 
LABEL(DFIN_MKER_LOOP) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,30,31,28,x0,0,x1,16,load) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,30,31,28,x0,0,x1,16,load,noload) " add x1, x1, x3 \n\t" " ldr q26, [x0, #16*2] \n\t" " ldr q27, [x0, #16*3] \n\t" " add x0, x0, x2 \n\t" -DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,29,30,31,xzr,-1,xzr,-1,noload) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,29,30,31,xzr,-1,xzr,-1,noload,noload) // // Loops left behind microkernels. LABEL(DK_LEFT_LOOP) @@ -507,7 +516,7 @@ BEQ(DWRITE_MEM_PREP) " ldr q30, [x1, #16*2] \n\t" " add x1, x1, x3 \n\t" " sub x8, x8, #1 \n\t" -DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload,noload) BRANCH(DK_LEFT_LOOP) // // Scale and write to memory. @@ -589,8 +598,7 @@ LABEL(DEND_WRITE_MEM) [alpha] "m" (alpha), [beta] "m" (beta), [a_next] "m" (a_next), - [b_next] "m" (b_next), - [ct] "r" (_use_ct) // Defined by macro. + [b_next] "m" (b_next) : "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13","v14","v15", From ad73717e8a377dcf41d4f9ba3d0a3f11724617af Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Mon, 21 Nov 2022 02:41:10 +0900 Subject: [PATCH 2/5] Arm NEON Init. Opt. For DGEMM Instead of clearing C rows, Deploy first-k FMUL so that instructions are saved. 
--- kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c | 103 +++++++++++-------- 1 file changed, 58 insertions(+), 45 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c index c9a7a21311..9c29145b96 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c @@ -89,28 +89,30 @@ * +---+ +---+ +---+ * */ -#define DGEMM_8X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,C60,C61,C62,C70,C71,C72,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,CADDR,RSC,LASTB,PRFC) \ +#define DGEMM_8X6_MKER_LOOP(SUFFIX,C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,C60,C61,C62,C70,C71,C72,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,CADDR,RSC,LASTB,PRFC) \ GEMM_PRFC_FH_ ##PRFC (CADDR) \ - DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \ + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C00,C10,B0,A0) \ GEMM_PRFC_LH_FWD_ ##PRFC (CADDR,RSC,LASTB) \ - DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \ + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C20,C30,B0,A1) \ GEMM_PRFC_FH_ ##PRFC (CADDR) \ - DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \ + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C01,C11,B1,A0) \ GEMM_PRFC_LH_FWD_ ##PRFC (CADDR,RSC,LASTB) \ - DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \ + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C21,C31,B1,A1) \ GEMM_PRFC_FH_ ##PRFC (CADDR) \ - DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \ + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C02,C12,B2,A0) \ GEMM_PRFC_LH_FWD_ ##PRFC (CADDR,RSC,LASTB) \ - DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \ + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C22,C32,B2,A1) \ DGEMM_LOAD2V_ ##LOADNEXT (A0,A1,AADDR,ASHIFT) \ - DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \ - DGEMM_2X2_NANOKERNEL(C60,C70,B0,A3) \ + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C40,C50,B0,A2) \ + GEMM_PRFC_FH_ ##PRFC (CADDR) \ + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C60,C70,B0,A3) \ DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ - DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \ - DGEMM_2X2_NANOKERNEL(C61,C71,B1,A3) \ + 
DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C41,C51,B1,A2) \ + GEMM_PRFC_LH_FWD_ ##PRFC (CADDR,RSC,LASTB) \ + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C61,C71,B1,A3) \ DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \ - DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2) \ - DGEMM_2X2_NANOKERNEL(C62,C72,B2,A3) + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C42,C52,B2,A2) \ + DGEMM_2X2_NANOKERNEL_ ##SUFFIX (C62,C72,B2,A3) // Interleaving load or not. #define DGEMM_LOAD1V_noload(V1,ADDR,IMM) @@ -295,6 +297,7 @@ LABEL(SPREFETCH_ABNEXT) " prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher. " prfm PLDL1STRM, [x1, 64*0] \n\t" " prfm PLDL1STRM, [x1, 64*1] \n\t" +" prfm PLDL1STRM, [x1, 64*2] \n\t" " prfm PLDL1STRM, [x1, 64*3] \n\t" " \n\t" " fmov d26, #1.0 \n\t" @@ -436,13 +439,13 @@ void bli_dgemm_armv8a_asm_8x6r // V[24:27] <- A // V[28:31] <- B // Under this scheme, the following is defined: -#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,PRFC) \ - DGEMM_8X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,x9,x6,40,PRFC) +#define DGEMM_8X6_MKER_LOOP_LOC(SUFFIX,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,PRFC) \ + DGEMM_8X6_MKER_LOOP(SUFFIX,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,x9,x6,40,PRFC) // Load from memory. LABEL(DLOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. -BEQ(DCLEAR_CCOLS) +BEQ(DK_LEFT_LOOP_INIT) " \n\t" " ldr q24, [x0, #16*0] \n\t" // Load A. " ldr q25, [x0, #16*1] \n\t" @@ -455,52 +458,40 @@ BEQ(DCLEAR_CCOLS) " ldr q30, [x1, #16*2] \n\t" " add x1, x1, x3 \n\t" " ldr q31, [x1, #16*0] \n\t" -LABEL(DCLEAR_CCOLS) // Clear registers & prefetch some of C microtiles. -GEMM_PRFC_FH_load(x9) // Prefetch C 0.5/8. -CLEAR8V(0,1,2,3,4,5,6,7) -GEMM_PRFC_LH_FWD_load(x9,x6,40) // Prefetch C 1/8. 
-CLEAR8V(8,9,10,11,12,13,14,15) -GEMM_PRFC_FH_load(x9) // Prefetch C 1.5/8. -CLEAR8V(16,17,18,19,20,21,22,23) -GEMM_PRFC_LH_FWD_load(x9,x6,40) // Prefetch C 2/8. -// No-microkernel early return, once again. -BEQ(DK_LEFT_LOOP) // // Microkernel is defined here as: -#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1,B2,PRFC) \ - DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,x0,0,x1,16,load,PRFC) \ +#define DGEMM_8X6_MKER_LOOP_LOC_FWD(SUFFIX,A0,A1,A2,A3,B0,B1,B2,PRFC) \ + DGEMM_8X6_MKER_LOOP_LOC(SUFFIX,A0,A1,A2,A3,B0,B1,B2,x0,0,x1,16,load,PRFC) \ "add x1, x1, x3 \n\t" \ "ldr q"#B2", [x1, #16*0] \n\t" \ "ldr q"#A2", [x0, #16*2] \n\t" \ "ldr q"#A3", [x0, #16*3] \n\t" \ "add x0, x0, x2 \n\t" // Start microkernel loop -- Special treatment for the very first loop. -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,30,load) // Prefetch C 3-5/8. -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,31,28,29,load) // Prefetch C 6-8/8. -" \n\t" // Decrease counter before final replica. -" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. -BEQ(DFIN_MKER_LOOP) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,30,31,28,noload) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,29,30,31,noload) +" subs x4, x4, #1 \n\t" // Decrease counter in advance. +DGEMM_8X6_MKER_LOOP_LOC_FWD(INIT,24,25,26,27,28,29,30,load) // Prefetch C 1-4/8. +DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,31,28,29,load) // Prefetch C 5-8/8. +BEQ(DFIN_MKER_LOOP) // Branch early to avoid reading excess mem. +DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,30,31,28,noload) +DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,29,30,31,noload) // Start microkernel loop. LABEL(DK_MKER_LOOP) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,30,noload) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,31,28,29,noload) -" \n\t" // Decrease counter before final replica. -" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. 
-BEQ(DFIN_MKER_LOOP) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,30,31,28,noload) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,29,30,31,noload) +" subs x4, x4, #1 \n\t" // Decrease counter in advance. +DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,28,29,30,noload) +DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,31,28,29,noload) +BEQ(DFIN_MKER_LOOP) // Branch early to avoid reading excess mem. +DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,30,31,28,noload) +DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,29,30,31,noload) BRANCH(DK_MKER_LOOP) // // Final microkernel loop. LABEL(DFIN_MKER_LOOP) -DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,30,31,28,x0,0,x1,16,load,noload) +DGEMM_8X6_MKER_LOOP_LOC(PLAIN,24,25,26,27,30,31,28,x0,0,x1,16,load,noload) " add x1, x1, x3 \n\t" " ldr q26, [x0, #16*2] \n\t" " ldr q27, [x0, #16*3] \n\t" " add x0, x0, x2 \n\t" -DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,29,30,31,xzr,-1,xzr,-1,noload,noload) +DGEMM_8X6_MKER_LOOP_LOC(PLAIN,24,25,26,27,29,30,31,xzr,-1,xzr,-1,noload,noload) // // Loops left behind microkernels. LABEL(DK_LEFT_LOOP) @@ -516,9 +507,30 @@ BEQ(DWRITE_MEM_PREP) " ldr q30, [x1, #16*2] \n\t" " add x1, x1, x3 \n\t" " sub x8, x8, #1 \n\t" -DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload,noload) +DGEMM_8X6_MKER_LOOP_LOC(PLAIN,24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload,noload) +BRANCH(DK_LEFT_LOOP) +LABEL(DK_LEFT_LOOP_INIT) +" cmp x8, #0 \n\t" // End of exec. +BEQ(DCLEAR_CCOLS) +" ldr q24, [x0, #16*0] \n\t" // Load A col. +" ldr q25, [x0, #16*1] \n\t" +" ldr q26, [x0, #16*2] \n\t" +" ldr q27, [x0, #16*3] \n\t" +" add x0, x0, x2 \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B row. +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_8X6_MKER_LOOP_LOC(INIT,24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload,noload) BRANCH(DK_LEFT_LOOP) // +// No FMUL at all to clear C up. Have to zeroize. 
+LABEL(DCLEAR_CCOLS) +CLEAR8V(0,1,2,3,4,5,6,7) +CLEAR8V(8,9,10,11,12,13,14,15) +CLEAR8V(16,17,18,19,20,21,22,23) +// // Scale and write to memory. LABEL(DWRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). @@ -534,6 +546,7 @@ LABEL(DPREFETCH_ABNEXT) " prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher. " prfm PLDL1STRM, [x1, 64*0] \n\t" " prfm PLDL1STRM, [x1, 64*1] \n\t" +" prfm PLDL1STRM, [x1, 64*2] \n\t" " prfm PLDL1STRM, [x1, 64*3] \n\t" " \n\t" " fmov d26, #1.0 \n\t" From 0ddde0f16e6091e0b7b333ff7a526527a0914476 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Mon, 21 Nov 2022 11:56:25 +0900 Subject: [PATCH 3/5] Arm NEON DGEMM Change Regs IO Instead of loading from stack, directly pass regs in. Arm64 has 30 regs for use. This may or may not speed up a tiny bit. --- kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c | 196 +++++++++---------- 1 file changed, 89 insertions(+), 107 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c index 9c29145b96..c1fd933302 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c @@ -372,7 +372,7 @@ LABEL(SEND_WRITE_MEM) [beta] "m" (beta), [a_next] "m" (a_next), [b_next] "m" (b_next) -: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9", +: "x0","x1","x2","x3","x4","x5","x6","x8","x9", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19", @@ -411,28 +411,14 @@ void bli_dgemm_armv8a_asm_8x6r uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + // TODO: Aggregated str instructions. GEMM_UKR_SETUP_CT( d, 8, 6, true ); __asm__ volatile ( -" ldr x0, %[a] \n\t" -" ldr x1, %[b] \n\t" -" mov x2, #8 \n\t" // Column-skip of A. -" mov x3, #6 \n\t" // Row-skip of B. -" \n\t" -" ldr x5, %[c] \n\t" -" ldr x6, %[rs_c] \n\t" // Row-skip of C. 
(column-skip == 1) -" \n\t" -" \n\t" // Multiply some address skips by sizeof(double). -" lsl x2, x2, #3 \n\t" // cs_a -" lsl x3, x3, #3 \n\t" // rs_b -" lsl x6, x6, #3 \n\t" // rs_c -" \n\t" -" mov x9, x5 \n\t" -" \n\t" -" ldr x4, %[k_mker] \n\t" // Number of loops. -" ldr x8, %[k_left] \n\t" +" lsl %3, %3, #3 \n\t" // rs_c *= sizeof(double). +" mov x9, %2 \n\t" // Address of C for prefetching. " \n\t" // Storage scheme: // V[ 0:23] <- C @@ -440,35 +426,35 @@ void bli_dgemm_armv8a_asm_8x6r // V[28:31] <- B // Under this scheme, the following is defined: #define DGEMM_8X6_MKER_LOOP_LOC(SUFFIX,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,PRFC) \ - DGEMM_8X6_MKER_LOOP(SUFFIX,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,x9,x6,40,PRFC) + DGEMM_8X6_MKER_LOOP(SUFFIX,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,x9,%3,40,PRFC) // Load from memory. LABEL(DLOAD_ABC) " \n\t" // No-microkernel early return is a must -" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +" cmp %4, #0 \n\t" // to avoid out-of-boundary read. BEQ(DK_LEFT_LOOP_INIT) " \n\t" -" ldr q24, [x0, #16*0] \n\t" // Load A. -" ldr q25, [x0, #16*1] \n\t" -" ldr q26, [x0, #16*2] \n\t" -" ldr q27, [x0, #16*3] \n\t" -" add x0, x0, x2 \n\t" +" ldr q24, [%0, #16*0] \n\t" // Load A. +" ldr q25, [%0, #16*1] \n\t" +" ldr q26, [%0, #16*2] \n\t" +" ldr q27, [%0, #16*3] \n\t" +" add %0, %0, #64 \n\t" " \n\t" -" ldr q28, [x1, #16*0] \n\t" // Load B. -" ldr q29, [x1, #16*1] \n\t" -" ldr q30, [x1, #16*2] \n\t" -" add x1, x1, x3 \n\t" -" ldr q31, [x1, #16*0] \n\t" +" ldr q28, [%1, #16*0] \n\t" // Load B. 
+" ldr q29, [%1, #16*1] \n\t" +" ldr q30, [%1, #16*2] \n\t" +" add %1, %1, #48 \n\t" +" ldr q31, [%1, #16*0] \n\t" // // Microkernel is defined here as: #define DGEMM_8X6_MKER_LOOP_LOC_FWD(SUFFIX,A0,A1,A2,A3,B0,B1,B2,PRFC) \ - DGEMM_8X6_MKER_LOOP_LOC(SUFFIX,A0,A1,A2,A3,B0,B1,B2,x0,0,x1,16,load,PRFC) \ - "add x1, x1, x3 \n\t" \ - "ldr q"#B2", [x1, #16*0] \n\t" \ - "ldr q"#A2", [x0, #16*2] \n\t" \ - "ldr q"#A3", [x0, #16*3] \n\t" \ - "add x0, x0, x2 \n\t" + DGEMM_8X6_MKER_LOOP_LOC(SUFFIX,A0,A1,A2,A3,B0,B1,B2,%0,0,%1,16,load,PRFC) \ + "add %1, %1, #48 \n\t" \ + "ldr q"#B2", [%1, #16*0] \n\t" \ + "ldr q"#A2", [%0, #16*2] \n\t" \ + "ldr q"#A3", [%0, #16*3] \n\t" \ + "add %0, %0, #64 \n\t" // Start microkernel loop -- Special treatment for the very first loop. -" subs x4, x4, #1 \n\t" // Decrease counter in advance. +" subs %4, %4, #1 \n\t" // Decrease counter in advance. DGEMM_8X6_MKER_LOOP_LOC_FWD(INIT,24,25,26,27,28,29,30,load) // Prefetch C 1-4/8. DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,31,28,29,load) // Prefetch C 5-8/8. BEQ(DFIN_MKER_LOOP) // Branch early to avoid reading excess mem. @@ -476,7 +462,7 @@ DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,30,31,28,noload) DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,29,30,31,noload) // Start microkernel loop. LABEL(DK_MKER_LOOP) -" subs x4, x4, #1 \n\t" // Decrease counter in advance. +" subs %4, %4, #1 \n\t" // Decrease counter in advance. DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,28,29,30,noload) DGEMM_8X6_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,27,31,28,29,noload) BEQ(DFIN_MKER_LOOP) // Branch early to avoid reading excess mem. @@ -486,42 +472,42 @@ BRANCH(DK_MKER_LOOP) // // Final microkernel loop. 
LABEL(DFIN_MKER_LOOP) -DGEMM_8X6_MKER_LOOP_LOC(PLAIN,24,25,26,27,30,31,28,x0,0,x1,16,load,noload) -" add x1, x1, x3 \n\t" -" ldr q26, [x0, #16*2] \n\t" -" ldr q27, [x0, #16*3] \n\t" -" add x0, x0, x2 \n\t" +DGEMM_8X6_MKER_LOOP_LOC(PLAIN,24,25,26,27,30,31,28,%0,0,%1,16,load,noload) +" add %1, %1, #48 \n\t" +" ldr q26, [%0, #16*2] \n\t" +" ldr q27, [%0, #16*3] \n\t" +" add %0, %0, #64 \n\t" DGEMM_8X6_MKER_LOOP_LOC(PLAIN,24,25,26,27,29,30,31,xzr,-1,xzr,-1,noload,noload) // // Loops left behind microkernels. LABEL(DK_LEFT_LOOP) -" cmp x8, #0 \n\t" // End of exec. +" cmp %5, #0 \n\t" // End of exec. BEQ(DWRITE_MEM_PREP) -" ldr q24, [x0, #16*0] \n\t" // Load A col. -" ldr q25, [x0, #16*1] \n\t" -" ldr q26, [x0, #16*2] \n\t" -" ldr q27, [x0, #16*3] \n\t" -" add x0, x0, x2 \n\t" -" ldr q28, [x1, #16*0] \n\t" // Load B row. -" ldr q29, [x1, #16*1] \n\t" -" ldr q30, [x1, #16*2] \n\t" -" add x1, x1, x3 \n\t" -" sub x8, x8, #1 \n\t" +" ldr q24, [%0, #16*0] \n\t" // Load A col. +" ldr q25, [%0, #16*1] \n\t" +" ldr q26, [%0, #16*2] \n\t" +" ldr q27, [%0, #16*3] \n\t" +" add %0, %0, #64 \n\t" +" ldr q28, [%1, #16*0] \n\t" // Load B row. +" ldr q29, [%1, #16*1] \n\t" +" ldr q30, [%1, #16*2] \n\t" +" add %1, %1, #48 \n\t" +" sub %5, %5, #1 \n\t" DGEMM_8X6_MKER_LOOP_LOC(PLAIN,24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload,noload) BRANCH(DK_LEFT_LOOP) LABEL(DK_LEFT_LOOP_INIT) -" cmp x8, #0 \n\t" // End of exec. +" cmp %5, #0 \n\t" // End of exec. BEQ(DCLEAR_CCOLS) -" ldr q24, [x0, #16*0] \n\t" // Load A col. -" ldr q25, [x0, #16*1] \n\t" -" ldr q26, [x0, #16*2] \n\t" -" ldr q27, [x0, #16*3] \n\t" -" add x0, x0, x2 \n\t" -" ldr q28, [x1, #16*0] \n\t" // Load B row. -" ldr q29, [x1, #16*1] \n\t" -" ldr q30, [x1, #16*2] \n\t" -" add x1, x1, x3 \n\t" -" sub x8, x8, #1 \n\t" +" ldr q24, [%0, #16*0] \n\t" // Load A col. +" ldr q25, [%0, #16*1] \n\t" +" ldr q26, [%0, #16*2] \n\t" +" ldr q27, [%0, #16*3] \n\t" +" add %0, %0, #64 \n\t" +" ldr q28, [%1, #16*0] \n\t" // Load B row. 
+" ldr q29, [%1, #16*1] \n\t" +" ldr q30, [%1, #16*2] \n\t" +" add %1, %1, #48 \n\t" +" sub %5, %5, #1 \n\t" DGEMM_8X6_MKER_LOOP_LOC(INIT,24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload,noload) BRANCH(DK_LEFT_LOOP) // @@ -533,21 +519,17 @@ CLEAR8V(16,17,18,19,20,21,22,23) // // Scale and write to memory. LABEL(DWRITE_MEM_PREP) -" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). -" ldr x8, %[beta] \n\t" -" ld1r {v24.2d}, [x4] \n\t" // Load alpha & beta. -" ld1r {v25.2d}, [x8] \n\t" +" ld1r {v24.2d}, [%[alpha]] \n\t" // Load alpha & beta. +" ld1r {v25.2d}, [%[beta]] \n\t" " \n\t" LABEL(DPREFETCH_ABNEXT) -" ldr x0, %[a_next] \n\t" -" ldr x1, %[b_next] \n\t" -" prfm PLDL1STRM, [x0, 64*0] \n\t" // Do not know cache line size, -" prfm PLDL1STRM, [x0, 64*1] \n\t" // issue some number of prfm instructions -" prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher. -" prfm PLDL1STRM, [x1, 64*0] \n\t" -" prfm PLDL1STRM, [x1, 64*1] \n\t" -" prfm PLDL1STRM, [x1, 64*2] \n\t" -" prfm PLDL1STRM, [x1, 64*3] \n\t" +" prfm PLDL1STRM, [%[a_next], 64*0] \n\t" // Do not know cache line size, +" prfm PLDL1STRM, [%[a_next], 64*1] \n\t" // issue some number of prfm instructions +" prfm PLDL1STRM, [%[a_next], 64*2] \n\t" // to try to activate hardware prefetcher. +" prfm PLDL1STRM, [%[b_next], 64*0] \n\t" +" prfm PLDL1STRM, [%[b_next], 64*1] \n\t" +" prfm PLDL1STRM, [%[b_next], 64*2] \n\t" +" prfm PLDL1STRM, [%[b_next], 64*3] \n\t" " \n\t" " fmov d26, #1.0 \n\t" " fcmp d24, d26 \n\t" @@ -557,8 +539,8 @@ DSCALE8V(8,9,10,11,12,13,14,15,24,0) DSCALE8V(16,17,18,19,20,21,22,23,24,0) LABEL(DUNIT_ALPHA) " \n\t" -" mov x9, x5 \n\t" // C address for loading. -" \n\t" // C address for storing is x5 itself. +" mov x9, %2 \n\t" // C address for loading. +" \n\t" // C address for storing is %2 itself. // // Contiguous C-storage. LABEL(DWRITE_MEM_R) @@ -567,52 +549,52 @@ LABEL(DWRITE_MEM_R) " \n\t" // multiple times for skipping load. 
// Row 0 & 1: BEQ(DZERO_BETA_R_0_1) -DLOADC_3V_R_FWD(26,27,28,x9,0,x6) -DLOADC_3V_R_FWD(29,30,31,x9,0,x6) +DLOADC_3V_R_FWD(26,27,28,x9,0,%3) +DLOADC_3V_R_FWD(29,30,31,x9,0,%3) DSCALEA2V(0,1,26,27,25,0) DSCALEA2V(2,3,28,29,25,0) DSCALEA2V(4,5,30,31,25,0) LABEL(DZERO_BETA_R_0_1) -DSTOREC_3V_R_FWD(0,1,2,x5,0,x6) -DSTOREC_3V_R_FWD(3,4,5,x5,0,x6) +DSTOREC_3V_R_FWD(0,1,2,%2,0,%3) +DSTOREC_3V_R_FWD(3,4,5,%2,0,%3) // Row 2 & 3 & 4 & 5: BEQ(DZERO_BETA_R_2_3_4_5) -DLOADC_3V_R_FWD(26,27,28,x9,0,x6) -DLOADC_3V_R_FWD(29,30,31,x9,0,x6) -DLOADC_3V_R_FWD(0,1,2,x9,0,x6) -DLOADC_3V_R_FWD(3,4,5,x9,0,x6) +DLOADC_3V_R_FWD(26,27,28,x9,0,%3) +DLOADC_3V_R_FWD(29,30,31,x9,0,%3) +DLOADC_3V_R_FWD(0,1,2,x9,0,%3) +DLOADC_3V_R_FWD(3,4,5,x9,0,%3) DSCALEA4V(6,7,8,9,26,27,28,29,25,0) DSCALEA4V(10,11,12,13,30,31,0,1,25,0) DSCALEA4V(14,15,16,17,2,3,4,5,25,0) LABEL(DZERO_BETA_R_2_3_4_5) -DSTOREC_3V_R_FWD(6,7,8,x5,0,x6) -DSTOREC_3V_R_FWD(9,10,11,x5,0,x6) -DSTOREC_3V_R_FWD(12,13,14,x5,0,x6) -DSTOREC_3V_R_FWD(15,16,17,x5,0,x6) +DSTOREC_3V_R_FWD(6,7,8,%2,0,%3) +DSTOREC_3V_R_FWD(9,10,11,%2,0,%3) +DSTOREC_3V_R_FWD(12,13,14,%2,0,%3) +DSTOREC_3V_R_FWD(15,16,17,%2,0,%3) // Row 6 & 7 BEQ(DZERO_BETA_R_6_7) -DLOADC_3V_R_FWD(26,27,28,x9,0,x6) -DLOADC_3V_R_FWD(29,30,31,x9,0,x6) +DLOADC_3V_R_FWD(26,27,28,x9,0,%3) +DLOADC_3V_R_FWD(29,30,31,x9,0,%3) DSCALEA2V(18,19,26,27,25,0) DSCALEA2V(20,21,28,29,25,0) DSCALEA2V(22,23,30,31,25,0) LABEL(DZERO_BETA_R_6_7) -DSTOREC_3V_R_FWD(18,19,20,x5,0,x6) -DSTOREC_3V_R_FWD(21,22,23,x5,0,x6) +DSTOREC_3V_R_FWD(18,19,20,%2,0,%3) +DSTOREC_3V_R_FWD(21,22,23,%2,0,%3) // Done. 
LABEL(DEND_WRITE_MEM) +: "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (rs_c), // %3 + "+r" (k_mker), // %4 + "+r" (k_left), // %5 + [alpha] "+r" (alpha), + [beta] "+r" (beta), + [a_next] "+r" (a_next), + [b_next] "+r" (b_next) : -: [a] "m" (a), - [b] "m" (b), - [c] "m" (c), - [rs_c] "m" (rs_c), - [k_mker] "m" (k_mker), - [k_left] "m" (k_left), - [alpha] "m" (alpha), - [beta] "m" (beta), - [a_next] "m" (a_next), - [b_next] "m" (b_next) -: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9", +: "x9", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19", From 04b5b71362b6bbef8d17d8af1fe32b1ba6e93258 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Mon, 21 Nov 2022 17:26:31 +0900 Subject: [PATCH 4/5] Fix Init. Bug Forgot to commit the header for ad73717e8. --- kernels/armv8a/3/armv8a_asm_d2x2.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernels/armv8a/3/armv8a_asm_d2x2.h b/kernels/armv8a/3/armv8a_asm_d2x2.h index 5bb0bb4d39..30051d2a02 100644 --- a/kernels/armv8a/3/armv8a_asm_d2x2.h +++ b/kernels/armv8a/3/armv8a_asm_d2x2.h @@ -47,9 +47,25 @@ " fmla v"#C0".2d, v"#A".2d, v"#B".d[0] \n\t" \ " fmla v"#C1".2d, v"#A".2d, v"#B".d[1] \n\t" +#define DGEMM_2X2_NANOKERNEL_PLAIN(C0,C1,A,B) \ + DGEMM_2X2_NANOKERNEL(C0,C1,A,B) + +#define DGEMM_2X2_NANOKERNEL_INIT(C0,C1,A,B) \ +" fmul v"#C0".2d, v"#A".2d, v"#B".d[0] \n\t" \ +" fmul v"#C1".2d, v"#A".2d, v"#B".d[1] \n\t" + #define SGEMM_4X4_NANOKERNEL(C0,C1,C2,C3,A,B) \ " fmla v"#C0".4s, v"#A".4s, v"#B".s[0] \n\t" \ " fmla v"#C1".4s, v"#A".4s, v"#B".s[1] \n\t" \ " fmla v"#C2".4s, v"#A".4s, v"#B".s[2] \n\t" \ " fmla v"#C3".4s, v"#A".4s, v"#B".s[3] \n\t" +#define SGEMM_4X4_NANOKERNEL_PLAIN(C0,C1,C2,C3,A,B) \ + SGEMM_4X4_NANOKERNEL(C0,C1,C2,C3,A,B) + +#define SGEMM_4X4_NANOKERNEL_INIT(C0,C1,C2,C3,A,B) \ +" fmul v"#C0".4s, v"#A".4s, v"#B".s[0] \n\t" \ +" fmul v"#C1".4s, v"#A".4s, v"#B".s[1] \n\t" \ +" fmul v"#C2".4s, v"#A".4s, v"#B".s[2] \n\t" \ +" 
fmul v"#C3".4s, v"#A".4s, v"#B".s[3] \n\t" + From 47c63c1784418e1b68c751b8accc6ba068d6548c Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 17 Dec 2022 01:56:13 +0900 Subject: [PATCH 5/5] Armv8-A Port Row-maj DGEMM Uker Changes to SGEMM - Init k-loop clears C. - Scattered C preloading. --- kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c | 98 ++++++++++++-------- 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c index c1fd933302..7ded8e7442 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c @@ -54,16 +54,22 @@ * | 4 | | 5 | * +---+ +---+ */ -#define SGEMM_12X8_MKER_LOOP_PLAIN(C00,C01,C10,C11,C20,C21,C30,C31,C40,C41,C50,C51,C60,C61,C70,C71,C80,C81,C90,C91,CA0,CA1,CB0,CB1,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ - SGEMM_4X4_NANOKERNEL(C00,C10,C20,C30,B0,A0) \ - SGEMM_4X4_NANOKERNEL(C01,C11,C21,C31,B1,A0) \ +#define SGEMM_12X8_MKER_LOOP(SUFFIX,C00,C01,C10,C11,C20,C21,C30,C31,C40,C41,C50,C51,C60,C61,C70,C71,C80,C81,C90,C91,CA0,CA1,CB0,CB1,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,CADDR,RSC,LASTB,PRFC) \ + SGEMM_4X4_NANOKERNEL_ ##SUFFIX (C00,C10,C20,C30,B0,A0) \ + GEMM_PRFC_FH_ ##PRFC (CADDR) \ + SGEMM_4X4_NANOKERNEL_ ##SUFFIX (C01,C11,C21,C31,B1,A0) \ DGEMM_LOAD1V_ ##LOADNEXT (A0,AADDR,ASHIFT) /* Contiguous load is the same across S/D. 
*/ \ - SGEMM_4X4_NANOKERNEL(C40,C50,C60,C70,B0,A1) \ - SGEMM_4X4_NANOKERNEL(C41,C51,C61,C71,B1,A1) \ + GEMM_PRFC_LH_FWD_ ##PRFC (CADDR,RSC,LASTB) \ + SGEMM_4X4_NANOKERNEL_ ##SUFFIX (C40,C50,C60,C70,B0,A1) \ + GEMM_PRFC_FH_ ##PRFC (CADDR) \ + SGEMM_4X4_NANOKERNEL_ ##SUFFIX (C41,C51,C61,C71,B1,A1) \ DGEMM_LOAD1V_ ##LOADNEXT (A1,AADDR,ASHIFT+16) \ - SGEMM_4X4_NANOKERNEL(C80,C90,CA0,CB0,B0,A2) \ + GEMM_PRFC_LH_FWD_ ##PRFC (CADDR,RSC,LASTB) \ + SGEMM_4X4_NANOKERNEL_ ##SUFFIX (C80,C90,CA0,CB0,B0,A2) \ DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ - SGEMM_4X4_NANOKERNEL(C81,C91,CA1,CB1,B1,A2) + GEMM_PRFC_FH_ ##PRFC (CADDR) \ + SGEMM_4X4_NANOKERNEL_ ##SUFFIX (C81,C91,CA1,CB1,B1,A2) \ + GEMM_PRFC_LH_FWD_ ##PRFC (CADDR,RSC,LASTB) // For contiguous storage of C, SLOAD is the same as DLOAD. #define SLOADC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \ @@ -191,12 +197,6 @@ void bli_sgemm_armv8a_asm_12x8r " lsl x6, x6, #2 \n\t" // rs_c " \n\t" " mov x9, x5 \n\t" -PRFMC_FWD(x9,x6,32) // Prefetch C 01/12. -PRFMC_FWD(x9,x6,32) // Prefetch C 02/12. -PRFMC_FWD(x9,x6,32) // Prefetch C 03/12. -PRFMC_FWD(x9,x6,32) // Prefetch C 04/12. -PRFMC_FWD(x9,x6,32) // Prefetch C 05/12. -PRFMC_FWD(x9,x6,32) // Prefetch C 06/12. " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" @@ -206,13 +206,13 @@ PRFMC_FWD(x9,x6,32) // Prefetch C 06/12. // V[24:27] <- A // V[28:31] <- B // Under this scheme, the following is defined: -#define SGEMM_12X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ - SGEMM_12X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) +#define SGEMM_12X8_MKER_LOOP_LOC(SUFFIX,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,PRFC) \ + SGEMM_12X8_MKER_LOOP(SUFFIX,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT,x9,x6,32,PRFC) // Load from memory. 
LABEL(SLOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. -BEQ(SCLEAR_CCOLS) +BEQ(SK_LEFT_LOOP_INIT) " \n\t" " ldr q24, [x0, #16*0] \n\t" // Load A. " ldr q25, [x0, #16*1] \n\t" @@ -220,52 +220,47 @@ BEQ(SCLEAR_CCOLS) " add x0, x0, x2 \n\t" " ldr q27, [x0, #16*0] \n\t" " \n\t" -PRFMC_FWD(x9,x6,32) // Prefetch C 07/12. -PRFMC_FWD(x9,x6,32) // Prefetch C 08/12. -PRFMC_FWD(x9,x6,32) // Prefetch C 09/12. -PRFMC_FWD(x9,x6,32) // Prefetch C 10/12. -PRFMC_FWD(x9,x6,32) // Prefetch C 11/12. -PRFMC_FWD(x9,x6,32) // Prefetch C 12/12. -" cmp x4, #0 \n\t" // Reset branching flag. -" \n\t" " ldr q28, [x1, #16*0] \n\t" // Load B. " ldr q29, [x1, #16*1] \n\t" " add x1, x1, x3 \n\t" " ldr q30, [x1, #16*0] \n\t" " ldr q31, [x1, #16*1] \n\t" " add x1, x1, x3 \n\t" -LABEL(SCLEAR_CCOLS) -CLEAR8V(0,1,2,3,4,5,6,7) -CLEAR8V(8,9,10,11,12,13,14,15) -CLEAR8V(16,17,18,19,20,21,22,23) -// No-microkernel early return, once again. -BEQ(SK_LEFT_LOOP) // // Microkernel is defined here as: -#define SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1) \ - SGEMM_12X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,x0,16,x1,0,load) \ +#define SGEMM_12X8_MKER_LOOP_LOC_FWD(SUFFIX,A0,A1,A2,B0,B1,PRFC) \ + SGEMM_12X8_MKER_LOOP_LOC(SUFFIX,A0,A1,A2,B0,B1,x0,16,x1,0,load,PRFC) \ "add x0, x0, x2 \n\t" \ "ldr q"#A2", [x0, #16*0] \n\t" \ "ldr q"#B1", [x1, #16*1] \n\t" \ "add x1, x1, x3 \n\t" -// Start microkernel loop. +// Start microkernel loop -- Initial handled differently. +SGEMM_12X8_MKER_LOOP_LOC_FWD(INIT,24,25,26,28,29,load) // Interleaving C prefetch 03/12. +SGEMM_12X8_MKER_LOOP_LOC_FWD(PLAIN,27,24,25,30,31,load) // Interleaving C prefetch 06/12. +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(SFIN_MKER_LOOP) +SGEMM_12X8_MKER_LOOP_LOC_FWD(PLAIN,26,27,24,28,29,load) // Interleaving C prefetch 09/12. 
+SGEMM_12X8_MKER_LOOP_LOC_FWD(PLAIN,25,26,27,30,31,load) // Interleaving C prefetch 12/12. +// +// The microkernel loop. LABEL(SK_MKER_LOOP) -SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29) -SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(27,24,25,30,31) +SGEMM_12X8_MKER_LOOP_LOC_FWD(PLAIN,24,25,26,28,29,noload) +SGEMM_12X8_MKER_LOOP_LOC_FWD(PLAIN,27,24,25,30,31,noload) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(SFIN_MKER_LOOP) -SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(26,27,24,28,29) -SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,30,31) +SGEMM_12X8_MKER_LOOP_LOC_FWD(PLAIN,26,27,24,28,29,noload) +SGEMM_12X8_MKER_LOOP_LOC_FWD(PLAIN,25,26,27,30,31,noload) BRANCH(SK_MKER_LOOP) // // Final microkernel loop. LABEL(SFIN_MKER_LOOP) -SGEMM_12X8_MKER_LOOP_PLAIN_LOC(26,27,24,28,29,xzr,-1,xzr,-1,noload) +SGEMM_12X8_MKER_LOOP_LOC(PLAIN,26,27,24,28,29,xzr,-1,xzr,-1,noload,noload) " ldr q26, [x0, #16*1] \n\t" " ldr q27, [x0, #16*2] \n\t" " add x0, x0, x2 \n\t" -SGEMM_12X8_MKER_LOOP_PLAIN_LOC(25,26,27,30,31,xzr,-1,xzr,-1,noload) +SGEMM_12X8_MKER_LOOP_LOC(PLAIN,25,26,27,30,31,xzr,-1,xzr,-1,noload,noload) // // Loops left behind microkernels. LABEL(SK_LEFT_LOOP) @@ -279,9 +274,30 @@ BEQ(SWRITE_MEM_PREP) " ldr q29, [x1, #16*1] \n\t" " add x1, x1, x3 \n\t" " sub x8, x8, #1 \n\t" -SGEMM_12X8_MKER_LOOP_PLAIN_LOC(24,25,26,28,29,xzr,-1,xzr,-1,noload) +SGEMM_12X8_MKER_LOOP_LOC(PLAIN,24,25,26,28,29,xzr,-1,xzr,-1,noload,noload) BRANCH(SK_LEFT_LOOP) // +// No microkernel 4-loop was run, so the first k_left iteration has to clear the C rows. +LABEL(SK_LEFT_LOOP_INIT) +" cmp x8, #0 \n\t" // End of exec. +BEQ(SCLEAR_CCOLS) +" ldr q24, [x0, #16*0] \n\t" // Load A col. +" ldr q25, [x0, #16*1] \n\t" +" ldr q26, [x0, #16*2] \n\t" +" add x0, x0, x2 \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B row.
+" ldr q29, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +SGEMM_12X8_MKER_LOOP_LOC(INIT,24,25,26,28,29,xzr,-1,xzr,-1,noload,noload) +BRANCH(SK_LEFT_LOOP) +// +// No FMUL was executed at all to clear C, so it has to be zeroed explicitly. +LABEL(SCLEAR_CCOLS) +CLEAR8V(0,1,2,3,4,5,6,7) +CLEAR8V(8,9,10,11,12,13,14,15) +CLEAR8V(16,17,18,19,20,21,22,23) +// // Scale and write to memory. LABEL(SWRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). @@ -495,6 +511,8 @@ BEQ(DWRITE_MEM_PREP) " sub %5, %5, #1 \n\t" DGEMM_8X6_MKER_LOOP_LOC(PLAIN,24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload,noload) BRANCH(DK_LEFT_LOOP) +// +// No microkernel 4-loop was run, so the first k_left iteration has to clear the C rows. LABEL(DK_LEFT_LOOP_INIT) " cmp %5, #0 \n\t" // End of exec. BEQ(DCLEAR_CCOLS)