diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h
index 37a3909fd6..cab83a39a6 100644
--- a/frame/3/bli_l3_thrinfo.h
+++ b/frame/3/bli_l3_thrinfo.h
@@ -43,6 +43,8 @@
 // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
 #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
 #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
+#define bli_gemm_get_next_c_utilem( c1, step, inc ) ( c1 + step * inc )
+#define bli_gemm_get_next_c_utilen( c1, step, inc ) ( c1 + step * inc )
 
 // gemmt
 
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index 199e72cb65..5ccc20825c 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -293,18 +293,27 @@ void bli_gemm_ker_var2
 
-			// Compute the addresses of the next panels of A and B.
+			// Compute the addresses of the next panels of A and B and microtile of C.
 			const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc );
+			const char* c2 = bli_gemm_get_next_c_utilem( c11, rstep_c, ir_inc );
 			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) )
 			{
 				a2 = a_cast;
 				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc );
+				c2 = bli_gemm_get_next_c_utilen( c1, cstep_c, jr_inc );
 				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) )
+				{
 					b2 = b_cast;
+					// NOTE(review): c2 is a byte pointer; confirm that rs_c * m is
+					// scaled to bytes (TODO verify against how rstep_c is computed).
+					c2 = bli_gemm_get_next_c_utilem( c_cast, rs_c, m );
+					c2 = bli_gemm_get_next_c_utilem( c2, rstep_c, ir_inc );
+				}
 			}
 
-			// Save addresses of next panels of A and B to the auxinfo_t
-			// object.
+			// Save addresses of next panels of A and B and microtile of C to
+			// the auxinfo_t object.
 			bli_auxinfo_set_next_a( a2, &aux );
 			bli_auxinfo_set_next_b( b2, &aux );
+			bli_auxinfo_set_next_c( c2, &aux );
 
 			// Edge case handling now occurs within the microkernel itself, but
 			// we must still explicitly accumulate to a temporary microtile in
diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h
index 166480b30a..73b54a7d91 100644
--- a/frame/base/bli_auxinfo.h
+++ b/frame/base/bli_auxinfo.h
@@ -55,6 +55,10 @@ BLIS_INLINE const void* bli_auxinfo_next_b( const auxinfo_t* ai )
 {
 	return ai->b_next;
 }
+BLIS_INLINE const void* bli_auxinfo_next_c( const auxinfo_t* ai )
+{
+	return ai->c_next;
+}
 
 BLIS_INLINE inc_t bli_auxinfo_is_a( const auxinfo_t* ai )
 {
@@ -103,6 +107,10 @@ BLIS_INLINE void bli_auxinfo_set_next_b( const void* p, auxinfo_t* ai )
 {
 	ai->b_next = p;
 }
+BLIS_INLINE void bli_auxinfo_set_next_c( const void* p, auxinfo_t* ai )
+{
+	ai->c_next = p;
+}
 BLIS_INLINE void bli_auxinfo_set_next_ab( const void* ap, const void* bp, auxinfo_t* ai )
 {
 	ai->a_next = ap;
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 08c7ddc4a6..e7d397473d 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -1113,10 +1113,11 @@ typedef struct
 	pack_t schema_a;
 	pack_t schema_b;
 
-	// Pointers to the micro-panels of A and B which will be used by the
-	// next call to the micro-kernel.
+	// Pointers to the micro-panels of A and B, and micro-tile of C, which
+	// will be used by the next call to the micro-kernel.
 	const void* a_next;
 	const void* b_next;
+	const void* c_next;
 
 	// The imaginary strides of A and B.
 	inc_t is_a;