diff --git a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c index aeb502f354..5a8a30942a 100644 --- a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c @@ -43,7 +43,8 @@ void bli_zdotxaxpyf_template_noopt conj_t conjx, dim_t m, dim_t b_n, - dcomplex* restrict alpha, + dcomplex* restrict alphaw, + dcomplex* restrict alphax, dcomplex* restrict a, inc_t inca, inc_t lda, dcomplex* restrict w, inc_t incw, dcomplex* restrict x, inc_t incx, @@ -67,8 +68,8 @@ void bli_zdotxaxpyf_template_noopt This kernel performs the following two gemv-like operations: - y := beta * y + alpha * conjat( A^T ) * conjw( w ) - z := z + alpha * conja( A ) * conjx( x ) + y := beta * y + alphaw * conjat( A^T ) * conjw( w ) + z := z + alphax * conja( A ) * conjx( x ) where A is an m x b_n matrix, x and y are vector of length b_n, w and z are vectors of length m, and alpha and beta are scalars. The operation @@ -84,7 +85,8 @@ void bli_zdotxaxpyf_template_noopt - m: The number of rows in matrix A. - b_n: The number of columns in matrix A. Must be equal to or less than the fusing factor. - - alpha: The address of the scalar to be applied to A^T*w and A*x. + - alphaw: The address of the scalar to be applied to A^T*w. + - alphax: The address of the scalar to be applied to A*x. - a: The address of matrix A. - inca: The row stride of A. inca should be unit unless the implementation makes special accomodation for non-unit values. @@ -205,7 +207,8 @@ void bli_zdotxaxpyf_template_noopt conjx, m, b_n, - alpha, + alphaw, + alphax, a, inca, lda, w, incw, x, incx, @@ -239,7 +242,7 @@ void bli_zdotxaxpyf_template_noopt for ( j = 0; j < b_n; ++j ) { bli_zcopys( *xp[ j ], alpha_x[ j ] ); - bli_zscals( *alpha, alpha_x[ j ] ); + bli_zscals( *alphax, alpha_x[ j ] ); } } else // if ( bli_is_conj( conjx ) ) @@ -247,7 +250,7 @@ void bli_zdotxaxpyf_template_noopt for ( j = 0; j < b_n; ++j ) { bli_zcopyjs( *xp[ j ], alpha_x[ j ] ); - bli_zscals( *alpha, alpha_x[ j ] ); + bli_zscals( *alphax, alpha_x[ j ] ); } } @@ -468,7 +471,7 @@ void bli_zdotxaxpyf_template_noopt for ( j = 0; j < b_n; ++j ) { bli_zscals( *beta, *yp[ j ] ); - bli_zaxpys( *alpha, At_w[ j ], *yp[ j ] ); + bli_zaxpys( *alphaw, At_w[ j ], *yp[ j ] ); } } diff --git a/configure b/configure index 676c3b0ad6..4fa1ad5309 100755 --- a/configure +++ b/configure @@ -4773,7 +4773,7 @@ print_usage_plugin() --disable-examples, --enable-examples - Do not include (created by default) example code for plugin + Do not include (included by default) example code for plugin registration, kernels, etc. --disable-templates, --enable-templates @@ -5549,3 +5549,4 @@ case ${0##*/} in plugin_main "$@" ;; esac + diff --git a/frame/0/bli_l0_check.c b/frame/0/bli_l0_check.c index a1f1c1ca11..993f1757e2 100644 --- a/frame/0/bli_l0_check.c +++ b/frame/0/bli_l0_check.c @@ -58,6 +58,7 @@ GENFRONT( sqrtsc ) GENFRONT( sqrtrsc ) GENFRONT( subsc ) GENFRONT( invertsc ) +GENFRONT( negsc ) #undef GENFRONT diff --git a/frame/0/bli_l0_check.h b/frame/0/bli_l0_check.h index e5818dbde5..1aa247c3d5 100644 --- a/frame/0/bli_l0_check.h +++ b/frame/0/bli_l0_check.h @@ -54,6 +54,7 @@ GENTPROT( sqrtsc ) GENTPROT( sqrtrsc ) GENTPROT( subsc ) GENTPROT( invertsc ) +GENTPROT( negsc ) #undef GENTPROT diff --git a/frame/0/bli_l0_fpa.c b/frame/0/bli_l0_fpa.c index b841ce5a5c..deedad9ffd 100644 --- a/frame/0/bli_l0_fpa.c +++ b/frame/0/bli_l0_fpa.c @@ -55,6 +55,7 @@ GENFRONT( divsc ) GENFRONT( mulsc ) GENFRONT( subsc ) GENFRONT( invertsc ) +GENFRONT( negsc ) GENFRONT( sqrtsc ) GENFRONT( sqrtrsc ) GENFRONT( unzipsc ) diff --git a/frame/0/bli_l0_fpa.h b/frame/0/bli_l0_fpa.h index 623a3f69bd..b4ad0040fa 100644 --- a/frame/0/bli_l0_fpa.h +++ b/frame/0/bli_l0_fpa.h @@ -49,6 +49,7 @@ GENPROT( divsc ) GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) +GENPROT( negsc ) GENPROT( sqrtsc ) GENPROT( sqrtrsc ) GENPROT( unzipsc ) diff --git a/frame/0/bli_l0_ft.h b/frame/0/bli_l0_ft.h index 4c9933d84f..1682cbe808 100644 --- a/frame/0/bli_l0_ft.h +++ b/frame/0/bli_l0_ft.h @@ -53,6 +53,7 @@ INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) INSERT_GENTDEF( invertsc ) +INSERT_GENTDEF( negsc ) // mulsc diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c index d30540ff99..1da0eefe4f 100644 --- a/frame/0/bli_l0_oapi.c +++ b/frame/0/bli_l0_oapi.c @@ -116,6 +116,7 @@ GENFRONT( divsc ) GENFRONT( mulsc ) GENFRONT( subsc ) GENFRONT( invertsc ) +GENFRONT( negsc ) #undef GENFRONT diff --git a/frame/0/bli_l0_oapi.h b/frame/0/bli_l0_oapi.h index ad342b2323..618a82c617 100644 --- a/frame/0/bli_l0_oapi.h +++ b/frame/0/bli_l0_oapi.h @@ -66,6 +66,7 @@ GENPROT( sqrtsc ) GENPROT( sqrtrsc ) GENPROT( subsc ) GENPROT( invertsc ) +GENPROT( negsc ) #undef GENPROT diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c index ef2a942f3d..5e3847169d 100644 --- a/frame/0/bli_l0_tapi.c +++ b/frame/0/bli_l0_tapi.c @@ -83,6 +83,29 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC( invertsc, inverts ) +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, kername ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ + ) \ +{ \ + bli_init_once(); \ +\ + ctype chi_conj; \ + ctype_r chi_conj_r, chi_conj_i; \ +\ + PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \ + PASTEMAC(ch,gets)( chi_conj, chi_conj_r, chi_conj_i ); \ + PASTEMAC(ch,sets)( -chi_conj_r, -chi_conj_i, *psi ); \ +} + +INSERT_GENTFUNCR_BASIC( negsc, inverts ) + + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kername ) \ \ diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h index cf28b07d76..65714f916d 100644 --- a/frame/0/bli_l0_tapi.h +++ b/frame/0/bli_l0_tapi.h @@ -52,6 +52,7 @@ INSERT_GENTPROT_BASIC( divsc ) INSERT_GENTPROT_BASIC( mulsc ) INSERT_GENTPROT_BASIC( subsc ) INSERT_GENTPROT_BASIC( invertsc ) +INSERT_GENTPROT_BASIC( negsc ) #undef GENTPROTR diff --git a/frame/1d/bli_l1d_check.c b/frame/1d/bli_l1d_check.c index 776ab8aee6..9cdc0c0a17 100644 --- a/frame/1d/bli_l1d_check.c +++ b/frame/1d/bli_l1d_check.c @@ -101,6 +101,7 @@ void PASTEMAC(opname,_check) \ GENFRONT( invscald ) GENFRONT( scald ) GENFRONT( setd ) +GENFRONT( setrd ) GENFRONT( setid ) GENFRONT( shiftd ) diff --git a/frame/1d/bli_l1d_check.h b/frame/1d/bli_l1d_check.h index 56286f9eee..5fc99ae164 100644 --- a/frame/1d/bli_l1d_check.h +++ b/frame/1d/bli_l1d_check.h @@ -88,6 +88,7 @@ void PASTEMAC(opname,_check) \ GENTPROT( invscald ) GENTPROT( scald ) GENTPROT( setd ) +GENTPROT( setrd ) GENTPROT( setid ) GENTPROT( shiftd ) diff --git a/frame/1d/bli_l1d_fpa.c b/frame/1d/bli_l1d_fpa.c index 1eaa241312..11cb51c1e3 100644 --- a/frame/1d/bli_l1d_fpa.c +++ b/frame/1d/bli_l1d_fpa.c @@ -59,6 +59,7 @@ GENFRONT( invertd ) GENFRONT( invscald ) GENFRONT( scald ) GENFRONT( setd ) +GENFRONT( setrd ) GENFRONT( setid ) GENFRONT( shiftd ) GENFRONT( xpbyd ) diff --git a/frame/1d/bli_l1d_fpa.h b/frame/1d/bli_l1d_fpa.h index 92775b3b28..fd35591d99 100644 --- a/frame/1d/bli_l1d_fpa.h +++ b/frame/1d/bli_l1d_fpa.h @@ -51,6 +51,7 @@ GENPROT( invertd ) GENPROT( invscald ) GENPROT( scald ) GENPROT( setd ) +GENPROT( setrd ) GENPROT( setid ) GENPROT( shiftd ) GENPROT( xpbyd ) diff --git a/frame/1d/bli_l1d_ft.h b/frame/1d/bli_l1d_ft.h index c80a4bb5ce..52c75143c0 100644 --- a/frame/1d/bli_l1d_ft.h +++ b/frame/1d/bli_l1d_ft.h @@ -115,7 +115,7 @@ INSERT_GENTDEF( invscald ) INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) -// setid +// setrd, setid #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ @@ -130,6 +130,7 @@ typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \ BLIS_TAPI_EX_PARAMS \ ); +INSERT_GENTDEFR( setrd ) INSERT_GENTDEFR( setid ) // shiftd diff --git a/frame/1d/bli_l1d_oapi.c b/frame/1d/bli_l1d_oapi.c index d4caabba93..12f575aba5 100644 --- a/frame/1d/bli_l1d_oapi.c +++ b/frame/1d/bli_l1d_oapi.c @@ -310,6 +310,7 @@ void PASTEMAC(opname,EX_SUF) \ ); \ } +GENFRONT( setrd ) GENFRONT( setid ) diff --git a/frame/1d/bli_l1d_oapi.h b/frame/1d/bli_l1d_oapi.h index 81171f3b88..16b6cea3d3 100644 --- a/frame/1d/bli_l1d_oapi.h +++ b/frame/1d/bli_l1d_oapi.h @@ -92,6 +92,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ GENTPROT( invscald ) GENTPROT( scald ) GENTPROT( setd ) +GENTPROT( setrd ) GENTPROT( setid ) GENTPROT( shiftd ) diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index 17e7fcd3bb..c12f397471 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -341,6 +341,79 @@ void PASTEMAC(ch,opname,EX_SUF) \ dim_t n_elem; \ dim_t offx; \ inc_t incx; \ +\ + if ( bli_zero_dim2( m, n ) ) return; \ +\ + if ( bli_is_outside_diag( diagoffx, BLIS_NO_TRANSPOSE, m, n ) ) return; \ +\ + /* Determine the distance to the diagonals, the number of diagonal + elements, and the diagonal increments. */ \ + bli_set_dims_incs_1d \ + ( \ + diagoffx, \ + m, n, rs_x, cs_x, \ + &offx, &n_elem, &incx \ + ); \ +\ + /* Alternate implementation. (Substitute for remainder of function). */ \ + /* for ( i = 0; i < n_elem; ++i ) \ + { \ + ctype* chi11 = x1 + (i )*incx; \ +\ + PASTEMAC(ch,setrs)( *alpha, *chi11 ); \ + } */ \ +\ + /* Acquire the address of the real component of the first element, + and scale the increment for use in the real domain if the data type is complex. */ \ + x1 = ( ctype_r* )( x + offx ); \ + if ( bli_is_complex( dt ) ) \ + incx = 2*incx; \ +\ + /* Obtain a valid context from the gks if necessary. */ \ + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ +\ + /* Query the context for the operation's kernel address. */ \ + PASTECH(kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt_r, kerid, cntx ); \ +\ + /* Invoke the kernel with the appropriate parameters. */ \ + f \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + ( ctype_r* )alpha, \ + x1, incx, \ + ( cntx_t* )cntx \ + ); \ +} + +INSERT_GENTFUNCR_BASIC( setrd, setv, BLIS_SETV_KER ) + + +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, kername, kerid ) \ +\ +void PASTEMAC(ch,opname,EX_SUF) \ + ( \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype_r* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ + BLIS_TAPI_EX_PARAMS \ + ) \ +{ \ +\ + bli_init_once(); \ +\ + BLIS_TAPI_EX_DECLS \ +\ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + ctype_r* x1; \ + dim_t n_elem; \ + dim_t offx; \ + inc_t incx; \ \ /* If the datatype is real, the entire operation is a no-op. */ \ if ( bli_is_real( dt ) ) return; \ diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h index 71c8b7334a..22d0d56a0c 100644 --- a/frame/1d/bli_l1d_tapi.h +++ b/frame/1d/bli_l1d_tapi.h @@ -124,6 +124,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \ BLIS_TAPI_EX_PARAMS \ ); +INSERT_GENTPROTR_BASIC( setrd ) INSERT_GENTPROTR_BASIC( setid ) diff --git a/frame/1f/bli_l1f_check.c b/frame/1f/bli_l1f_check.c index e05cb7750f..b6f0995236 100644 --- a/frame/1f/bli_l1f_check.c +++ b/frame/1f/bli_l1f_check.c @@ -288,7 +288,8 @@ void bli_dotaxpyv_check void bli_dotxaxpyf_check ( - const obj_t* alpha, + const obj_t* alphaw, + const obj_t* alphax, const obj_t* at, const obj_t* a, const obj_t* w, @@ -302,7 +303,10 @@ void bli_dotxaxpyf_check // Check object datatypes. - e_val = bli_check_noninteger_object( alpha ); + e_val = bli_check_noninteger_object( alphaw ); + bli_check_error_code( e_val ); + + e_val = bli_check_noninteger_object( alphax ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( at ); @@ -345,7 +349,10 @@ void bli_dotxaxpyf_check // Check object dimensions. - e_val = bli_check_scalar_object( alpha ); + e_val = bli_check_scalar_object( alphaw ); + bli_check_error_code( e_val ); + + e_val = bli_check_scalar_object( alphax ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( at ); @@ -397,7 +404,10 @@ void bli_dotxaxpyf_check // Check object buffers (for non-NULLness). - e_val = bli_check_object_buffer( alpha ); + e_val = bli_check_object_buffer( alphaw ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( alphax ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( at ); diff --git a/frame/1f/bli_l1f_check.h b/frame/1f/bli_l1f_check.h index 9cd53107a4..7afb02c8b7 100644 --- a/frame/1f/bli_l1f_check.h +++ b/frame/1f/bli_l1f_check.h @@ -87,7 +87,8 @@ GENTPROT( dotaxpyv ) \ void PASTEMAC(opname,_check) \ ( \ - const obj_t* alpha, \ + const obj_t* alphaw, \ + const obj_t* alphax, \ const obj_t* at, \ const obj_t* a, \ const obj_t* w, \ diff --git a/frame/1f/bli_l1f_ft.h b/frame/1f/bli_l1f_ft.h index ba74ecb8e3..72129b1664 100644 --- a/frame/1f/bli_l1f_ft.h +++ b/frame/1f/bli_l1f_ft.h @@ -132,7 +132,8 @@ typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ - const ctype* alpha, \ + const ctype* alphaw, \ + const ctype* alphax, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* w, inc_t incw, \ const ctype* x, inc_t incx, \ diff --git a/frame/1f/bli_l1f_ker_params.h b/frame/1f/bli_l1f_ker_params.h index c6963c995f..3b4792ba8c 100644 --- a/frame/1f/bli_l1f_ker_params.h +++ b/frame/1f/bli_l1f_ker_params.h @@ -78,7 +78,8 @@ conj_t conjx, \ dim_t m, \ dim_t b_n, \ - const void* alpha, \ + const void* alphaw, \ + const void* alphax, \ const void* a, inc_t inca, inc_t lda, \ const void* w, inc_t incw, \ const void* x, inc_t incx, \ diff --git a/frame/1f/bli_l1f_oapi.c b/frame/1f/bli_l1f_oapi.c index 7022c43061..cfe5b3c9bf 100644 --- a/frame/1f/bli_l1f_oapi.c +++ b/frame/1f/bli_l1f_oapi.c @@ -253,7 +253,8 @@ GENFRONT( dotaxpyv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - const obj_t* alpha, \ + const obj_t* alphaw, \ + const obj_t* alphax, \ const obj_t* at, \ const obj_t* a, \ const obj_t* w, \ @@ -288,23 +289,28 @@ void PASTEMAC(opname,EX_SUF) \ void* buf_z = bli_obj_buffer_at_off( z ); \ inc_t inc_z = bli_obj_vector_inc( z ); \ \ - void* buf_alpha; \ + void* buf_alphaw; \ + void* buf_alphax; \ void* buf_beta; \ \ - obj_t alpha_local; \ + obj_t alphaw_local; \ + obj_t alphax_local; \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, at, a, w, x, beta, y, z ); \ + PASTEMAC(opname,_check)( alphaw, alphax, at, a, w, x, beta, y, z ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ - alpha, &alpha_local ); \ + alphaw, &alphaw_local ); \ + bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ + alphax, &alphax_local ); \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ beta, &beta_local ); \ - buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ - buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ + buf_alphaw = bli_obj_buffer_for_1x1( dt, &alphaw_local ); \ + buf_alphax = bli_obj_buffer_for_1x1( dt, &alphax_local ); \ + buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ \ /* Support cases where matrix A requires a transposition. */ \ if ( bli_obj_has_trans( a ) ) { bli_swap_incs( &rs_a, &cs_a ); } \ @@ -322,7 +328,8 @@ void PASTEMAC(opname,EX_SUF) \ conjx, \ m, \ b_n, \ - buf_alpha, \ + buf_alphaw, \ + buf_alphax, \ buf_a, rs_a, cs_a, \ buf_w, inc_w, \ buf_x, inc_x, \ diff --git a/frame/1f/bli_l1f_oapi.h b/frame/1f/bli_l1f_oapi.h index d0d53a6dfc..bb2e241cb4 100644 --- a/frame/1f/bli_l1f_oapi.h +++ b/frame/1f/bli_l1f_oapi.h @@ -90,7 +90,8 @@ GENTPROT( dotaxpyv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - const obj_t* alpha, \ + const obj_t* alphaw, \ + const obj_t* alphax, \ const obj_t* at, \ const obj_t* a, \ const obj_t* w, \ diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c index 32f1599eac..d9c56f33f0 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -185,7 +185,8 @@ void PASTEMAC(ch,opname,EX_SUF) \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ - const ctype* alpha, \ + const ctype* alphaw, \ + const ctype* alphax, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* w, inc_t incw, \ const ctype* x, inc_t incx, \ @@ -214,7 +215,8 @@ void PASTEMAC(ch,opname,EX_SUF) \ conjx, \ m, \ b_n, \ - ( ctype* )alpha, \ + ( ctype* )alphaw, \ + ( ctype* )alphax, \ ( ctype* )a, inca, lda, \ ( ctype* )w, incw, \ ( ctype* )x, incx, \ diff --git a/frame/1f/bli_l1f_tapi.h b/frame/1f/bli_l1f_tapi.h index 986b394487..0d6a363467 100644 --- a/frame/1f/bli_l1f_tapi.h +++ b/frame/1f/bli_l1f_tapi.h @@ -106,7 +106,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ - const ctype* alpha, \ + const ctype* alphaw, \ + const ctype* alphax, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* w, inc_t incw, \ const ctype* x, inc_t incx, \ diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index 74f9de8f85..46bdae889e 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -85,6 +85,10 @@ void PASTEMAC(chc,chp,varname) \ \ ukr_t cxk_ker_id = BLIS_PACKM_KER; \ ukr_t cxc_ker_id = BLIS_PACKM_DIAG_KER; \ +\ + ctypep* kappa_cast = ( ctypep* )kappa; \ + ctypep minus_kappa; \ + PASTEMAC(chp,neg2s)( *kappa_cast, minus_kappa ); \ \ if ( bli_is_1m_packed( schema ) ) \ { \ @@ -95,7 +99,7 @@ void PASTEMAC(chc,chp,varname) \ { \ ctypep_r kappa_r, kappa_i; \ ( void )kappa_r; \ - PASTEMAC(chp,gets)( *( ctypep* )kappa, kappa_r, kappa_i ); \ + PASTEMAC(chp,gets)( *kappa_cast, kappa_r, kappa_i ); \ if ( PASTEMAC(chp_r,eq0)( kappa_i ) ) \ { \ /* Treat the matrix as real with doubled strides. */ \ @@ -116,10 +120,10 @@ void PASTEMAC(chc,chp,varname) \ dt_p0 = bli_dt_proj_to_real( dt_p ); \ } \ \ - const void* zero = bli_obj_buffer_for_const( dt_p0, &BLIS_ZERO ); \ - setv_ker_ft f_setv = bli_cntx_get_ukr_dt( dt_p0, BLIS_SETV_KER, cntx ); \ - packm_cxk_ker_ft f_cxk = bli_cntx_get_ukr2_dt( dt_c, dt_p, cxk_ker_id, cntx ); \ - packm_cxc_diag_ker_ft f_cxc = bli_cntx_get_ukr2_dt( dt_c, dt_p, cxc_ker_id, cntx ); \ + const void* zero = bli_obj_buffer_for_const( dt_p0, &BLIS_ZERO ); \ + setv_ker_ft f_setv = bli_cntx_get_ukr_dt( dt_p0, BLIS_SETV_KER, cntx ); \ + packm_cxk_ker_ft f_cxk = bli_cntx_get_ukr2_dt( dt_c, dt_p, cxk_ker_id, cntx ); \ + packm_cxc_diag_ker_ft f_cxc = bli_cntx_get_ukr2_dt( dt_c, dt_p, cxc_ker_id, cntx ); \ \ /* For general matrices, pack and return early */ \ if ( bli_is_general( strucc ) ) \ @@ -166,14 +170,20 @@ void PASTEMAC(chc,chp,varname) \ inc_t ldc10_r = ldc_r; \ inc_t incc10 = incc; \ inc_t ldc10 = ldc; \ + ctypep* kappa_use = kappa_cast; \ \ if ( bli_is_upper( uploc ) ) \ { \ bli_reflect_to_stored_part( diagoffc, c10, incc10_r, ldc10_r ); \ bli_swap_incs(&incc10, &ldc10); \ \ - if ( bli_is_hermitian( strucc ) ) \ + if ( bli_is_hermitian( strucc ) || \ + bli_is_skew_hermitian( strucc ) ) \ bli_toggle_conj( &conjc10 ); \ +\ + if ( bli_is_skew_symmetric( strucc ) || \ + bli_is_skew_hermitian( strucc ) ) \ + kappa_use = &minus_kappa; \ } \ \ /* If we are referencing the unstored part of a triangular matrix, @@ -200,7 +210,7 @@ void PASTEMAC(chc,chp,varname) \ panel_bcast, \ p10_len, \ p10_len_max, \ - kappa, \ + kappa_use, \ c10, incc10, ldc10, \ p10, ldp, \ params, \ @@ -256,14 +266,20 @@ void PASTEMAC(chc,chp,varname) \ inc_t ldc12_r = ldc_r; \ inc_t incc12 = incc; \ inc_t ldc12 = ldc; \ + ctypep* kappa_use = kappa_cast; \ \ if ( bli_is_lower( uploc ) ) \ { \ bli_reflect_to_stored_part( diagoffc - i, c12, incc12_r, ldc12_r ); \ bli_swap_incs(&incc12, &ldc12); \ \ - if ( bli_is_hermitian( strucc ) ) \ + if ( bli_is_hermitian( strucc ) || \ + bli_is_skew_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ +\ + if ( bli_is_skew_symmetric( strucc ) || \ + bli_is_skew_hermitian( strucc ) ) \ + kappa_use = &minus_kappa; \ } \ \ /* If we are referencing the unstored part of a triangular matrix, @@ -290,7 +306,7 @@ void PASTEMAC(chc,chp,varname) \ panel_bcast, \ p12_len, \ p12_len_max, \ - kappa, \ + kappa_use, \ c12, incc12, ldc12, \ p12, ldp, \ params, \ diff --git a/frame/2/bli_l2_check.c b/frame/2/bli_l2_check.c index a2772e1c46..7006cbb00b 100644 --- a/frame/2/bli_l2_check.c +++ b/frame/2/bli_l2_check.c @@ -134,6 +134,76 @@ void bli_symv_check } +void bli_shmv_check + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y + ) +{ + err_t e_val; + + // Perform checks common to gemv/hemv/symv/trmv/trsv. + + bli_xxmv_check( alpha, a, x, beta, y ); + + // Check squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + + // Check object structure. + + e_val = bli_check_skew_hermitian_object( a ); + bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); +} + + +void bli_skmv_check + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y + ) +{ + err_t e_val; + + // Perform checks common to gemv/hemv/symv/trmv/trsv. + + bli_xxmv_check( alpha, a, x, beta, y ); + + // Check squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + + // Check object structure. + + e_val = bli_check_skew_symmetric_object( a ); + bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); +} + + void bli_trmv_check ( const obj_t* alpha, @@ -287,6 +357,40 @@ void bli_her2_check } +void bli_shr2_check + ( + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a + ) +{ + err_t e_val; + + // Perform checks common to ger/her/her2/syr/syr2. + + bli_xxr_check( alpha, x, y, a ); + + // Check squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + + // Check object structure. + + e_val = bli_check_skew_hermitian_object( a ); + bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); +} + + void bli_syr_check ( const obj_t* alpha, @@ -351,6 +455,40 @@ void bli_syr2_check } +void bli_skr2_check + ( + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a + ) +{ + err_t e_val; + + // Perform checks common to ger/her/her2/syr/syr2. + + bli_xxr_check( alpha, x, y, a ); + + // Check squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + + // Check object structure. + + e_val = bli_check_skew_symmetric_object( a ); + bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); +} + + // ----------------------------------------------------------------------------- void bli_xxmv_check diff --git a/frame/2/bli_l2_check.h b/frame/2/bli_l2_check.h index b698e9d591..79e7becc90 100644 --- a/frame/2/bli_l2_check.h +++ b/frame/2/bli_l2_check.h @@ -52,6 +52,8 @@ void PASTEMAC(opname,_check) \ GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) +GENPROT( shmv ) +GENPROT( skmv ) #undef GENPROT @@ -68,6 +70,8 @@ void PASTEMAC(opname,_check) \ GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) +GENPROT( shr2 ) +GENPROT( skr2 ) #undef GENPROT diff --git a/frame/2/bli_l2_fpa.c b/frame/2/bli_l2_fpa.c index 223f912eb8..9ae11b3d89 100644 --- a/frame/2/bli_l2_fpa.c +++ b/frame/2/bli_l2_fpa.c @@ -54,10 +54,14 @@ GENFRONT( gemv ) GENFRONT( ger ) GENFRONT( hemv ) GENFRONT( symv ) +GENFRONT( shmv ) +GENFRONT( skmv ) GENFRONT( her ) GENFRONT( syr ) GENFRONT( her2 ) GENFRONT( syr2 ) +GENFRONT( shr2 ) +GENFRONT( skr2 ) GENFRONT( trmv ) GENFRONT( trsv ) diff --git a/frame/2/bli_l2_fpa.h b/frame/2/bli_l2_fpa.h index 9ca9bb89fd..8cc2b01e90 100644 --- a/frame/2/bli_l2_fpa.h +++ b/frame/2/bli_l2_fpa.h @@ -46,10 +46,14 @@ GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) +GENPROT( shmv ) +GENPROT( skmv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) +GENPROT( shr2 ) +GENPROT( skr2 ) GENPROT( trmv ) GENPROT( trsv ) diff --git a/frame/2/bli_l2_ft.h b/frame/2/bli_l2_ft.h index 94ca133fb2..fc00969272 100644 --- a/frame/2/bli_l2_ft.h +++ b/frame/2/bli_l2_ft.h @@ -99,6 +99,8 @@ typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \ INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) +INSERT_GENTDEF( shmv ) +INSERT_GENTDEF( skmv ) // her @@ -156,6 +158,8 @@ typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \ INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) +INSERT_GENTDEF( shr2 ) +INSERT_GENTDEF( skr2 ) // trmv, trsv diff --git a/frame/2/bli_l2_ft_unb.h b/frame/2/bli_l2_ft_unb.h index 39ef25ec8c..492181bfba 100644 --- a/frame/2/bli_l2_ft_unb.h +++ b/frame/2/bli_l2_ft_unb.h @@ -88,10 +88,10 @@ INSERT_GENTDEF( ger ) \ typedef void (*PASTECH(ch,opname,_unb,tsuf)) \ ( \ + struc_t struca, \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ @@ -129,10 +129,10 @@ INSERT_GENTDEFR( her ) \ typedef void (*PASTECH(ch,opname,_unb,tsuf)) \ ( \ + struc_t struca, \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ diff --git a/frame/2/bli_l2_oapi.c b/frame/2/bli_l2_oapi.c index 3860d06ad6..411d1b7d64 100644 --- a/frame/2/bli_l2_oapi.c +++ b/frame/2/bli_l2_oapi.c @@ -251,6 +251,8 @@ void PASTEMAC(opname,EX_SUF) \ GENFRONT( hemv ) GENFRONT( symv ) +GENFRONT( shmv ) +GENFRONT( skmv ) #undef GENFRONT @@ -379,6 +381,8 @@ void PASTEMAC(opname,EX_SUF) \ GENFRONT( her2 ) GENFRONT( syr2 ) +GENFRONT( shr2 ) +GENFRONT( skr2 ) #undef GENFRONT diff --git a/frame/2/bli_l2_oapi.h b/frame/2/bli_l2_oapi.h index 391de06d58..0db31498ad 100644 --- a/frame/2/bli_l2_oapi.h +++ b/frame/2/bli_l2_oapi.h @@ -53,6 +53,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) +GENPROT( shmv ) +GENPROT( skmv ) #undef GENPROT @@ -70,6 +72,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) +GENPROT( shr2 ) +GENPROT( skr2 ) #undef GENPROT diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c index f6f2a035d4..eb2a3fb040 100644 --- a/frame/2/bli_l2_tapi.c +++ b/frame/2/bli_l2_tapi.c @@ -176,7 +176,7 @@ INSERT_GENTFUNC_BASIC( ger, ger, ger_unb_var1, ger_unb_var2 ) #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \ +#define GENTFUNC( ctype, ch, opname, ftname, struca, rvarname, cvarname ) \ \ void PASTEMAC(ch,opname,EX_SUF) \ ( \ @@ -234,10 +234,10 @@ void PASTEMAC(ch,opname,EX_SUF) \ level-1f kernel to implement the current operation. */ \ f \ ( \ + struca, \ uploa, \ conja, \ conjx, \ - conjh, /* used by variants to distinguish hemv from symv */ \ m, \ ( ctype* )alpha, \ ( ctype* )a, rs_a, cs_a, \ @@ -248,8 +248,10 @@ void PASTEMAC(ch,opname,EX_SUF) \ ); \ } -INSERT_GENTFUNC_BASIC( hemv, hemv, BLIS_CONJUGATE, hemv_unf_var1, hemv_unf_var3 ) -INSERT_GENTFUNC_BASIC( symv, hemv, BLIS_NO_CONJUGATE, hemv_unf_var1, hemv_unf_var3 ) +INSERT_GENTFUNC_BASIC( hemv, hemv, BLIS_HERMITIAN, hemv_unf_var1, hemv_unf_var3 ) +INSERT_GENTFUNC_BASIC( symv, hemv, BLIS_SYMMETRIC, hemv_unf_var1, hemv_unf_var3 ) +INSERT_GENTFUNC_BASIC( shmv, hemv, BLIS_SKEW_HERMITIAN, hemv_unf_var1, hemv_unf_var3 ) +INSERT_GENTFUNC_BASIC( skmv, hemv, BLIS_SKEW_SYMMETRIC, hemv_unf_var1, hemv_unf_var3 ) #undef GENTFUNCR @@ -374,7 +376,7 @@ INSERT_GENTFUNC_BASIC( syr, her, BLIS_NO_CONJUGATE, her_unb_var1, her_unb_var2 ) #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \ +#define GENTFUNC( ctype, ch, opname, ftname, struca, rvarname, cvarname ) \ \ void PASTEMAC(ch,opname,EX_SUF) \ ( \ @@ -418,10 +420,10 @@ void PASTEMAC(ch,opname,EX_SUF) \ level-1f kernel to implement the current operation. */ \ f \ ( \ + struca, \ uploa, \ conjx, \ conjy, \ - conjh, \ m, \ ( ctype* )alpha, \ ( ctype* )x, incx, \ @@ -431,8 +433,10 @@ void PASTEMAC(ch,opname,EX_SUF) \ ); \ } -INSERT_GENTFUNC_BASIC( her2, her2, BLIS_CONJUGATE, her2_unf_var1, her2_unf_var4 ) -INSERT_GENTFUNC_BASIC( syr2, her2, BLIS_NO_CONJUGATE, her2_unf_var1, her2_unf_var4 ) +INSERT_GENTFUNC_BASIC( her2, her2, BLIS_HERMITIAN, her2_unf_var1, her2_unf_var4 ) +INSERT_GENTFUNC_BASIC( syr2, her2, BLIS_SYMMETRIC, her2_unf_var1, her2_unf_var4 ) +INSERT_GENTFUNC_BASIC( shr2, her2, BLIS_SKEW_HERMITIAN, her2_unf_var1, her2_unf_var4 ) +INSERT_GENTFUNC_BASIC( skr2, her2, BLIS_SKEW_SYMMETRIC, her2_unf_var1, her2_unf_var4 ) #undef GENTFUNC diff --git a/frame/2/bli_l2_tapi.h b/frame/2/bli_l2_tapi.h index a263360e78..bee2fb7086 100644 --- a/frame/2/bli_l2_tapi.h +++ b/frame/2/bli_l2_tapi.h @@ -95,6 +95,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \ INSERT_GENTPROT_BASIC( hemv ) INSERT_GENTPROT_BASIC( symv ) +INSERT_GENTPROT_BASIC( shmv ) +INSERT_GENTPROT_BASIC( skmv ) #undef GENTPROTR @@ -149,6 +151,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \ INSERT_GENTPROT_BASIC( her2 ) INSERT_GENTPROT_BASIC( syr2 ) +INSERT_GENTPROT_BASIC( shr2 ) +INSERT_GENTPROT_BASIC( skr2 ) #undef GENTPROT diff --git a/frame/2/hemv/bli_hemv_unb_var1.c b/frame/2/hemv/bli_hemv_unb_var1.c index eeffc42929..16185f9348 100644 --- a/frame/2/hemv/bli_hemv_unb_var1.c +++ b/frame/2/hemv/bli_hemv_unb_var1.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ @@ -69,25 +69,37 @@ void PASTEMAC(ch,varname) \ dim_t n_behind; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ + ctype alpha0, alpha1; \ +\ + rs_at = rs_a; \ + cs_at = cs_a; \ +\ + conj0 = conja; \ + conj1 = conja; \ +\ + PASTEMAC(ch,copys)( *alpha, alpha0 ); \ + PASTEMAC(ch,copys)( *alpha, alpha1 ); \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_at = rs_a; \ - cs_at = cs_a; \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj0 ); \ \ - conj0 = bli_apply_conj( conjh, conja ); \ - conj1 = conja; \ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha0 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_at = cs_a; \ - cs_at = rs_a; \ + bli_swap_incs( &rs_at, &cs_at ); \ \ - conj0 = conja; \ - conj1 = bli_apply_conj( conjh, conja ); \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha1 ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ @@ -135,9 +147,9 @@ void PASTEMAC(ch,varname) \ \ /* Apply conjx to chi1 and and scale by alpha. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ - PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ + PASTEMAC(ch,scal2s)( alpha0, conjx_chi1, alpha_chi1 ); \ \ - /* y0 = y0 + alpha * a10t' * chi1; */ \ + /* y0 = y0 +/- alpha * a10t' * chi1; */ \ kfp_av \ ( \ conj0, \ @@ -148,13 +160,13 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ \ - /* psi1 = psi1 + alpha * a10t * x0; */ \ + /* psi1 = psi1 +/- alpha * a10t * x0; */ \ kfp_dv \ ( \ conj1, \ conjx, \ n_behind, \ - alpha, \ + &alpha1, \ a10t, cs_at, \ x0, incx, \ one, \ @@ -165,10 +177,15 @@ void PASTEMAC(ch,varname) \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( alpha11_temp ); \ + if ( bli_is_skew_symmetric( struc ) ) \ + PASTEMAC(ch,set0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ + PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ \ } \ diff --git a/frame/2/hemv/bli_hemv_unb_var2.c b/frame/2/hemv/bli_hemv_unb_var2.c index 07de60dcc0..5ef3fa8c93 100644 --- a/frame/2/hemv/bli_hemv_unb_var2.c +++ b/frame/2/hemv/bli_hemv_unb_var2.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ @@ -71,25 +71,37 @@ void PASTEMAC(ch,varname) \ dim_t n_ahead; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ + ctype alpha0, alpha1; \ +\ + rs_at = rs_a; \ + cs_at = cs_a; \ +\ + conj0 = conja; \ + conj1 = conja; \ +\ + PASTEMAC(ch,copys)( *alpha, alpha0 ); \ + PASTEMAC(ch,copys)( *alpha, alpha1 ); \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_at = rs_a; \ - cs_at = cs_a; \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj1 ); \ \ - conj0 = conja; \ - conj1 = bli_apply_conj( conjh, conja ); \ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_at = cs_a; \ - cs_at = rs_a; \ + bli_swap_incs( &rs_at, &cs_at ); \ +\ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj0 ); \ \ - conj0 = bli_apply_conj( conjh, conja ); \ - conj1 = conja; \ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha0 ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ @@ -139,13 +151,13 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ \ - /* psi1 = psi1 + alpha * a10t * x0; */ \ + /* psi1 = psi1 +/- alpha * a10t * x0; */ \ kfp_dv \ ( \ conj0, \ conjx, \ n_behind, \ - alpha, \ + &alpha0, \ a10t, cs_at, \ x0, incx, \ one, \ @@ -153,13 +165,13 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ \ - /* psi1 = psi1 + alpha * a21' * x2; */ \ + /* psi1 = psi1 +/- alpha * a21' * x2; */ \ kfp_dv \ ( \ conj1, \ conjx, \ n_ahead, \ - alpha, \ + &alpha1, \ a21, rs_at, \ x2, incx, \ one, \ @@ -170,8 +182,12 @@ void PASTEMAC(ch,varname) \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( alpha11_temp ); \ + if ( bli_is_skew_symmetric( struc ) ) \ + PASTEMAC(ch,set0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ diff --git a/frame/2/hemv/bli_hemv_unb_var3.c b/frame/2/hemv/bli_hemv_unb_var3.c index 1edd78f824..c66c3dce0e 100644 --- a/frame/2/hemv/bli_hemv_unb_var3.c +++ b/frame/2/hemv/bli_hemv_unb_var3.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ @@ -69,25 +69,37 @@ void PASTEMAC(ch,varname) \ dim_t n_ahead; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ + ctype alpha0, alpha1; \ +\ + rs_at = rs_a; \ + cs_at = cs_a; \ +\ + conj0 = conja; \ + conj1 = conja; \ +\ + PASTEMAC(ch,copys)( *alpha, alpha0 ); \ + PASTEMAC(ch,copys)( *alpha, alpha1 ); \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_at = rs_a; \ - cs_at = cs_a; \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj0 ); \ \ - conj0 = bli_apply_conj( conjh, conja ); \ - conj1 = conja; \ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha0 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_at = cs_a; \ - cs_at = rs_a; \ + bli_swap_incs( &rs_at, &cs_at ); \ \ - conj0 = conja; \ - conj1 = bli_apply_conj( conjh, conja ); \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha1 ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ @@ -132,27 +144,32 @@ void PASTEMAC(ch,varname) \ x2 = x + (i+1)*incx; \ psi1 = y + (i )*incy; \ y2 = y + (i+1)*incy; \ -\ - /* Apply conjx to chi1 and and scale by alpha. */ \ - PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ - PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( alpha11_temp ); \ + if ( bli_is_skew_symmetric( struc ) ) \ + PASTEMAC(ch,set0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ + PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ \ - /* psi1 = psi1 + alpha * a21' * x2; */ \ + /* Apply conjx to chi1 and and scale by alpha. */ \ + PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ + PASTEMAC(ch,scal2s)( alpha1, conjx_chi1, alpha_chi1 ); \ +\ + /* psi1 = psi1 +/- alpha * a21' * x2; */ \ kfp_dv \ ( \ conj0, \ conjx, \ n_ahead, \ - alpha, \ + &alpha0, \ a21, rs_at, \ x2, incx, \ one, \ @@ -160,7 +177,7 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ \ - /* y2 = y2 + alpha * a21 * chi1; */ \ + /* y2 = y2 +/- alpha * a21 * chi1; */ \ kfp_av \ ( \ conj1, \ diff --git a/frame/2/hemv/bli_hemv_unb_var4.c b/frame/2/hemv/bli_hemv_unb_var4.c index 704299ab1f..f58a0ddc73 100644 --- a/frame/2/hemv/bli_hemv_unb_var4.c +++ b/frame/2/hemv/bli_hemv_unb_var4.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ @@ -70,25 +70,37 @@ void PASTEMAC(ch,varname) \ dim_t n_ahead; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ + ctype alpha0, alpha1; \ +\ + rs_at = rs_a; \ + cs_at = cs_a; \ +\ + conj0 = conja; \ + conj1 = conja; \ +\ + PASTEMAC(ch,copys)( *alpha, alpha0 ); \ + PASTEMAC(ch,copys)( *alpha, alpha1 ); \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_at = rs_a; \ - cs_at = cs_a; \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj0 ); \ \ - conj0 = bli_apply_conj( conjh, conja ); \ - conj1 = conja; \ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha0 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_at = cs_a; \ - cs_at = rs_a; \ + bli_swap_incs( &rs_at, &cs_at ); \ +\ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj1 ); \ \ - conj0 = conja; \ - conj1 = bli_apply_conj( conjh, conja ); \ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha1 ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ @@ -136,9 +148,9 @@ void PASTEMAC(ch,varname) \ \ /* Apply conjx to chi1 and and scale by alpha. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ - PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ + PASTEMAC(ch,scal2s)( alpha0, conjx_chi1, alpha_chi1 ); \ \ - /* y0 = y0 + alpha * a10t' * chi1; */ \ + /* y0 = y0 +/- alpha * a10t' * chi1; */ \ kfp_av \ ( \ conj0, \ @@ -152,13 +164,21 @@ void PASTEMAC(ch,varname) \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( alpha11_temp ); \ + if ( bli_is_skew_symmetric( struc ) ) \ + PASTEMAC(ch,set0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ + PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ \ - /* y2 = y2 + alpha * a21 * chi1; */ \ + /* Apply conjx to chi1 and and scale by alpha. */ \ + PASTEMAC(ch,scal2s)( alpha1, conjx_chi1, alpha_chi1 ); \ +\ + /* y2 = y2 +/- alpha * a21 * chi1; */ \ kfp_av \ ( \ conj1, \ diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c index bb96d9ae59..0e14eba26d 100644 --- a/frame/2/hemv/bli_hemv_unf_var1.c +++ b/frame/2/hemv/bli_hemv_unf_var1.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ @@ -78,25 +78,37 @@ void PASTEMAC(ch,varname) \ dim_t f_ahead, f_behind; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ + ctype alpha0, alpha1; \ +\ + rs_at = rs_a; \ + cs_at = cs_a; \ +\ + conj0 = conja; \ + conj1 = conja; \ +\ + PASTEMAC(ch,copys)( *alpha, alpha0 ); \ + PASTEMAC(ch,copys)( *alpha, alpha1 ); \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_at = rs_a; \ - cs_at = cs_a; \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj1 ); \ \ - conj0 = conja; \ - conj1 = bli_apply_conj( conjh, conja ); \ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_at = cs_a; \ - cs_at = rs_a; \ + bli_swap_incs( &rs_at, &cs_at ); \ \ - conj0 = bli_apply_conj( conjh, conja ); \ - conj1 = conja; \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj0 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha0 ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ @@ -142,8 +154,8 @@ void PASTEMAC(ch,varname) \ y0 = y + (0 )*incy; \ y1 = y + (i )*incy; \ \ - /* y1 = y1 + alpha * A10 * x0; (dotxf) */ \ - /* y0 = y0 + alpha * A10' * x1; (axpyf) */ \ + /* y1 = y1 +/- alpha * A10 * x0; (dotxf) */ \ + /* y0 = y0 +/- alpha * A10' * x1; (axpyf) */ \ kfp_xf \ ( \ conj0, \ @@ -152,7 +164,8 @@ void PASTEMAC(ch,varname) \ conjx, \ n_behind, \ f, \ - alpha, \ + &alpha0, \ + &alpha1, \ A10, cs_at, rs_at, \ x0, incx, \ x1, incx, \ @@ -175,9 +188,9 @@ void PASTEMAC(ch,varname) \ psi11 = y1 + (k )*incy; \ y21 = y1 + (k+1)*incy; \ \ - /* y01 = y01 + alpha * a10t' * chi11; */ \ + /* y01 = y01 +/- alpha * a10t' * chi11; */ \ PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \ - PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \ + PASTEMAC(ch,scal2s)( alpha1, conjx_chi11, alpha_chi11 ); \ if ( bli_is_conj( conj1 ) ) \ { \ for ( j = 0; j < f_behind; ++j ) \ @@ -192,13 +205,19 @@ void PASTEMAC(ch,varname) \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( alpha11_temp ); \ + if ( bli_is_skew_symmetric( struc ) ) \ + PASTEMAC(ch,set0s)( alpha11_temp ); \ \ /* psi11 = psi11 + alpha * alpha11 * chi11; */ \ + PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \ PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \ \ - /* y21 = y21 + alpha * a21 * chi11; */ \ + /* y21 = y21 +/- alpha * a21 * chi11; */ \ + PASTEMAC(ch,scal2s)( alpha0, conjx_chi11, alpha_chi11 ); \ if ( bli_is_conj( conj0 ) ) \ { \ for ( j = 0; j < f_ahead; ++j ) \ diff --git a/frame/2/hemv/bli_hemv_unf_var1a.c b/frame/2/hemv/bli_hemv_unf_var1a.c index f20a6de849..4443b3827e 100644 --- a/frame/2/hemv/bli_hemv_unf_var1a.c +++ b/frame/2/hemv/bli_hemv_unf_var1a.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ @@ -69,25 +69,37 @@ void PASTEMAC(ch,varname) \ dim_t n_behind; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ + ctype alpha0, alpha1; \ +\ + rs_at = rs_a; \ + cs_at = cs_a; \ +\ + conj0 = conja; \ + conj1 = conja; \ +\ + PASTEMAC(ch,copys)( *alpha, alpha0 ); \ + PASTEMAC(ch,copys)( *alpha, alpha1 ); \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_at = rs_a; \ - cs_at = cs_a; \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj1 ); \ \ - conj0 = conja; \ - conj1 = bli_apply_conj( conjh, conja ); \ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_at = cs_a; \ - cs_at = rs_a; \ + bli_swap_incs( &rs_at, &cs_at ); \ \ - conj0 = bli_apply_conj( conjh, conja ); \ - conj1 = conja; \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj0 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha0 ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ @@ -133,10 +145,10 @@ void PASTEMAC(ch,varname) \ \ /* Apply conjx to chi1 and and scale by alpha. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ - PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ + PASTEMAC(ch,scal2s)( alpha1, conjx_chi1, alpha_chi1 ); \ \ - /* psi1 = psi1 + alpha * a10t * x0; (dotv) */ \ - /* y0 = y0 + alpha * a10t' * chi1; (axpyv) */ \ + /* psi1 = psi1 +/- alpha * a10t * x0; (dotv) */ \ + /* y0 = y0 +/- alpha * a10t' * chi1; (axpyv) */ \ kfp_vf \ ( \ conj0, \ @@ -150,15 +162,20 @@ void PASTEMAC(ch,varname) \ y0, incy, \ cntx \ ); \ - PASTEMAC(ch,axpys)( *alpha, rho, *psi1 ); \ + PASTEMAC(ch,axpys)( alpha0, rho, *psi1 ); \ \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( alpha11_temp ); \ + if ( bli_is_skew_symmetric( struc ) ) \ + PASTEMAC(ch,set0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ + PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ \ } \ diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c index ef25a35627..107284aeb8 100644 --- a/frame/2/hemv/bli_hemv_unf_var3.c +++ b/frame/2/hemv/bli_hemv_unf_var3.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ @@ -78,25 +78,37 @@ void PASTEMAC(ch,varname) \ dim_t f_ahead, f_behind; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ + ctype alpha0, alpha1; \ +\ + rs_at = rs_a; \ + cs_at = cs_a; \ +\ + conj0 = conja; \ + conj1 = conja; \ +\ + PASTEMAC(ch,copys)( *alpha, alpha0 ); \ + PASTEMAC(ch,copys)( *alpha, alpha1 ); \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_at = rs_a; \ - cs_at = cs_a; \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj0 ); \ \ - conj0 = bli_apply_conj( conjh, conja ); \ - conj1 = conja; \ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha0 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_at = cs_a; \ - cs_at = rs_a; \ + bli_swap_incs( &rs_at, &cs_at ); \ \ - conj0 = conja; \ - conj1 = bli_apply_conj( conjh, conja ); \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha1 ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ @@ -155,9 +167,9 @@ void PASTEMAC(ch,varname) \ psi11 = y1 + (k )*incy; \ y21 = y1 + (k+1)*incy; \ \ - /* y01 = y01 + alpha * a10t' * chi11; */ \ + /* y01 = y01 +/- alpha * a10t' * chi11; */ \ PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \ - PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \ + PASTEMAC(ch,scal2s)( alpha0, conjx_chi11, alpha_chi11 ); \ if ( bli_is_conj( conj0 ) ) \ { \ for ( j = 0; j < f_behind; ++j ) \ @@ -172,13 +184,19 @@ void PASTEMAC(ch,varname) \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( alpha11_temp ); \ + if ( bli_is_skew_symmetric( struc ) ) \ + PASTEMAC(ch,set0s)( alpha11_temp ); \ \ /* psi11 = psi11 + alpha * alpha11 * chi11; */ \ + PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \ PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \ \ - /* y21 = y21 + alpha * a21 * chi11; */ \ + /* y21 = y21 +/- alpha * a21 * chi11; */ \ + PASTEMAC(ch,scal2s)( alpha1, conjx_chi11, alpha_chi11 ); \ if ( bli_is_conj( conj1 ) ) \ { \ for ( j = 0; j < f_ahead; ++j ) \ @@ -191,8 +209,8 @@ void PASTEMAC(ch,varname) \ } \ } \ \ - /* y1 = y1 + alpha * A21' * x2; (dotxf) */ \ - /* y2 = y2 + alpha * A21 * x1; (axpyf) */ \ + /* y1 = y1 +/- alpha * A21' * x2; (dotxf) */ \ + /* y2 = y2 +/- alpha * A21 * x1; (axpyf) */ \ kfp_xf \ ( \ conj0, \ @@ -201,7 +219,8 @@ void PASTEMAC(ch,varname) \ conjx, \ n_ahead, \ f, \ - alpha, \ + &alpha0, \ + &alpha1, \ A21, rs_at, cs_at, \ x2, incx, \ x1, incx, \ diff --git a/frame/2/hemv/bli_hemv_unf_var3a.c b/frame/2/hemv/bli_hemv_unf_var3a.c index 3501a9ac74..4bdb0c9f7e 100644 --- a/frame/2/hemv/bli_hemv_unf_var3a.c +++ b/frame/2/hemv/bli_hemv_unf_var3a.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ @@ -69,25 +69,37 @@ void PASTEMAC(ch,varname) \ dim_t n_ahead; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ + ctype alpha0, alpha1; \ +\ + rs_at = rs_a; \ + cs_at = cs_a; \ +\ + conj0 = conja; \ + conj1 = conja; \ +\ + PASTEMAC(ch,copys)( *alpha, alpha0 ); \ + PASTEMAC(ch,copys)( *alpha, alpha1 ); \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_at = rs_a; \ - cs_at = cs_a; \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj0 ); \ \ - conj0 = bli_apply_conj( conjh, conja ); \ - conj1 = conja; \ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha0 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_at = cs_a; \ - cs_at = rs_a; \ + bli_swap_incs( &rs_at, &cs_at ); \ \ - conj0 = conja; \ - conj1 = bli_apply_conj( conjh, conja ); \ + if ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) \ + bli_toggle_conj( &conj1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,neg2s)( *alpha, alpha1 ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ @@ -134,15 +146,20 @@ void PASTEMAC(ch,varname) \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ -\ - /* Apply conjx to chi1 and and scale by alpha. */ \ - PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ - PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( alpha11_temp ); \ + if ( bli_is_skew_symmetric( struc ) ) \ + PASTEMAC(ch,set0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ + PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ +\ + /* Apply conjx to chi1 and and scale by alpha. */ \ + PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ + PASTEMAC(ch,scal2s)( alpha1, conjx_chi1, alpha_chi1 ); \ \ /* psi1 = psi1 + alpha * a21' * x2; (dotv) */ \ /* y2 = y2 + alpha * a21 * chi1; (axpyv) */ \ @@ -159,7 +176,7 @@ void PASTEMAC(ch,varname) \ y2, incy, \ cntx \ ); \ - PASTEMAC(ch,axpys)( *alpha, rho, *psi1 ); \ + PASTEMAC(ch,axpys)( alpha0, rho, *psi1 ); \ } \ } diff --git a/frame/2/hemv/bli_hemv_var.h b/frame/2/hemv/bli_hemv_var.h index 8054c8354f..d51dc5247c 100644 --- a/frame/2/hemv/bli_hemv_var.h +++ b/frame/2/hemv/bli_hemv_var.h @@ -42,7 +42,6 @@ \ void PASTEMAC(opname) \ ( \ - conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ @@ -77,10 +76,10 @@ GENPROT( hemv_unf_var3a ) \ void PASTEMAC(ch,varname) \ ( \ + struc_t struca, \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ diff --git a/frame/2/hemv/bli_hemv_var_oapi.c b/frame/2/hemv/bli_hemv_var_oapi.c index b35aec5f43..4198106976 100644 --- a/frame/2/hemv/bli_hemv_var_oapi.c +++ b/frame/2/hemv/bli_hemv_var_oapi.c @@ -39,7 +39,6 @@ \ void PASTEMAC(varname) \ ( \ - conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ @@ -51,26 +50,27 @@ void PASTEMAC(varname) \ { \ bli_init_once(); \ \ - num_t dt = bli_obj_dt( a ); \ + num_t dt = bli_obj_dt( a ); \ \ - uplo_t uplo = bli_obj_uplo( a ); \ - conj_t conja = bli_obj_conj_status( a ); \ - conj_t conjx = bli_obj_conj_status( x ); \ + struc_t struc = bli_obj_struc( a ); \ + uplo_t uplo = bli_obj_uplo( a ); \ + conj_t conja = bli_obj_conj_status( a ); \ + conj_t conjx = bli_obj_conj_status( x ); \ \ - dim_t m = bli_obj_length( a ); \ + dim_t m = bli_obj_length( a ); \ \ - void* buf_a = bli_obj_buffer_at_off( a ); \ - inc_t rs_a = bli_obj_row_stride( a ); \ - inc_t cs_a = bli_obj_col_stride( a ); \ + void* buf_a = bli_obj_buffer_at_off( a ); \ + inc_t rs_a = bli_obj_row_stride( a ); \ + inc_t cs_a = bli_obj_col_stride( a ); \ \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t incx = bli_obj_vector_inc( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t incx = bli_obj_vector_inc( x ); \ \ - void* buf_y = bli_obj_buffer_at_off( y ); \ - inc_t incy = bli_obj_vector_inc( y ); \ + void* buf_y = bli_obj_buffer_at_off( y ); \ + inc_t incy = bli_obj_vector_inc( y ); \ \ - void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ - void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); \ + void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ + void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -79,10 +79,10 @@ void PASTEMAC(varname) \ \ f \ ( \ + struc, \ uplo, \ conja, \ conjx, \ - conjh, \ m, \ buf_alpha, \ buf_a, rs_a, cs_a, \ diff --git a/frame/2/her2/bli_her2_unb_var1.c b/frame/2/her2/bli_her2_unb_var1.c index e0bfd77733..e11668feea 100644 --- a/frame/2/her2/bli_her2_unb_var1.c +++ b/frame/2/her2/bli_her2_unb_var1.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ @@ -63,45 +63,54 @@ void PASTEMAC(ch,varname) \ ctype alpha1; \ ctype alpha0_chi1; \ ctype alpha1_psi1; \ - ctype alpha0_chi1_psi1; \ ctype conjx0_chi1; \ - ctype conjy1_psi1; \ + ctype conjx1_chi1; \ ctype conjy0_psi1; \ + ctype conjy1_psi1; \ dim_t i; \ dim_t n_behind; \ inc_t rs_ct, cs_ct; \ - conj_t conj0, conj1; \ + conj_t conjh, conjx0, conjy0, conjx1, conjy1; \ +\ + rs_ct = rs_c; \ + cs_ct = cs_c; \ +\ + conjh = ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) ? BLIS_CONJUGATE : BLIS_NO_CONJUGATE; \ + conjx0 = conjx; \ + conjy1 = conjy; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_ct = rs_c; \ - cs_ct = cs_c; \ -\ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_ct = cs_c; \ - cs_ct = rs_c; \ + bli_swap_incs( &rs_ct, &cs_ct ); \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ - conjx = bli_apply_conj( conjh, conjx ); \ - conjy = bli_apply_conj( conjh, conjy ); \ + conjx0 = bli_apply_conj( conjh, conjx0 ); \ + conjy1 = bli_apply_conj( conjh, conjy1 ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha0 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ - conj0 = bli_apply_conj( conjh, conjy ); \ - conj1 = bli_apply_conj( conjh, conjx ); \ + conjx1 = bli_apply_conj( conjh, conjx0 ); \ + conjy0 = bli_apply_conj( conjh, conjy1 ); \ \ /* Query the context for the kernel function pointer. */ \ axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ @@ -117,22 +126,19 @@ void PASTEMAC(ch,varname) \ gamma11 = c + (i )*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ - PASTEMAC(ch,copycjs)( conjx, *chi1, conjx0_chi1 ); \ - PASTEMAC(ch,copycjs)( conjy, *psi1, conjy1_psi1 ); \ - PASTEMAC(ch,copycjs)( conj0, *psi1, conjy0_psi1 ); \ + PASTEMAC(ch,copycjs)( conjx0, *chi1, conjx0_chi1 ); \ + PASTEMAC(ch,copycjs)( conjx1, *chi1, conjx1_chi1 ); \ + PASTEMAC(ch,copycjs)( conjy0, *psi1, conjy0_psi1 ); \ + PASTEMAC(ch,copycjs)( conjy1, *psi1, conjy1_psi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \ -\ - /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have - already been conjugated, if needed, by conjx and conjy. */ \ - PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \ \ /* c10t = c10t + alpha * chi1 * y0'; */ \ kfp_av \ ( \ - conj0, \ + conjy0, \ n_behind, \ &alpha0_chi1, \ y0, incy, \ @@ -140,10 +146,10 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ \ - /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \ + /* c10t = c10t +/- conj(alpha) * psi1 * x0'; */ \ kfp_av \ ( \ - conj1, \ + conjx1, \ n_behind, \ &alpha1_psi1, \ x0, incx, \ @@ -151,17 +157,18 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ \ - /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ - + conj(alpha) * psi1 * conj(chi1); */ \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ + /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + +/- conj(alpha) * psi1 * conj(chi1); */ \ + PASTEMAC(ch,axpys)( alpha0_chi1, conjy0_psi1, *gamma11 ); \ + PASTEMAC(ch,axpys)( alpha1_psi1, conjx1_chi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( *gamma11 ); \ } \ } INSERT_GENTFUNC_BASIC( her2_unb_var1 ) - diff --git a/frame/2/her2/bli_her2_unb_var2.c b/frame/2/her2/bli_her2_unb_var2.c index 0ab92fb384..9680575ca7 100644 --- a/frame/2/her2/bli_her2_unb_var2.c +++ b/frame/2/her2/bli_her2_unb_var2.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ @@ -64,51 +64,55 @@ void PASTEMAC(ch,varname) \ ctype alpha1; \ ctype alpha0_psi1; \ ctype alpha1_psi1; \ - ctype alpha0_chi1_psi1; \ + ctype conjx0_chi1; \ + ctype conjx1_chi1; \ ctype conjy0_psi1; \ ctype conjy1_psi1; \ - ctype conjx0_chi1; \ dim_t i; \ dim_t n_behind; \ dim_t n_ahead; \ inc_t rs_ct, cs_ct; \ - conj_t conj0, conj1; \ - conj_t conjh_conjy; \ + conj_t conjh, conjx0, conjy0, conjx1, conjy1; \ \ - /* Eliminate unused variable warnings. */ \ - ( void )conjh_conjy; \ + rs_ct = rs_c; \ + cs_ct = cs_c; \ +\ + conjh = ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) ? BLIS_CONJUGATE : BLIS_NO_CONJUGATE; \ + conjx0 = conjx; \ + conjy1 = conjy; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_ct = rs_c; \ - cs_ct = cs_c; \ -\ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_ct = cs_c; \ - cs_ct = rs_c; \ + bli_swap_incs( &rs_ct, &cs_ct ); \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ - conjx = bli_apply_conj( conjh, conjx ); \ - conjy = bli_apply_conj( conjh, conjy ); \ + conjx0 = bli_apply_conj( conjh, conjx0 ); \ + conjy1 = bli_apply_conj( conjh, conjy1 ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha0 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ - conj0 = conjx; \ - conj1 = bli_apply_conj( conjh, conjx ); \ - conjh_conjy = bli_apply_conj( conjh, conjy ); \ + conjx1 = bli_apply_conj( conjh, conjx0 ); \ + conjy0 = bli_apply_conj( conjh, conjy1 ); \ \ /* Query the context for the kernel function pointer. */ \ axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ @@ -126,22 +130,19 @@ void PASTEMAC(ch,varname) \ c21 = c + (i+1)*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ - PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \ - PASTEMAC(ch,copycjs)( conjy, *psi1, conjy1_psi1 ); \ - PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \ + PASTEMAC(ch,copycjs)( conjx0, *chi1, conjx0_chi1 ); \ + PASTEMAC(ch,copycjs)( conjx1, *chi1, conjx1_chi1 ); \ + PASTEMAC(ch,copycjs)( conjy0, *psi1, conjy0_psi1 ); \ + PASTEMAC(ch,copycjs)( conjy1, *psi1, conjy1_psi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \ -\ - /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have - already been conjugated, if needed, by conjx and conjy. */ \ - PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \ \ /* c21 = c21 + alpha * x2 * conj(psi1); */ \ kfp_av \ ( \ - conj0, \ + conjx0, \ n_ahead, \ &alpha0_psi1, \ x2, incx, \ @@ -152,7 +153,7 @@ void PASTEMAC(ch,varname) \ /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \ kfp_av \ ( \ - conj1, \ + conjx1, \ n_behind, \ &alpha1_psi1, \ x0, incx, \ @@ -160,15 +161,17 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ \ - /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ - + conj(alpha) * psi1 * conj(chi1); */ \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ + /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + +/- conj(alpha) * psi1 * conj(chi1); */ \ + PASTEMAC(ch,axpys)( alpha0_psi1, conjx0_chi1, *gamma11 ); \ + PASTEMAC(ch,axpys)( alpha1_psi1, conjx1_chi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( *gamma11 ); \ } \ } diff --git a/frame/2/her2/bli_her2_unb_var3.c b/frame/2/her2/bli_her2_unb_var3.c index dc2630c46a..63307c630f 100644 --- a/frame/2/her2/bli_her2_unb_var3.c +++ b/frame/2/her2/bli_her2_unb_var3.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ @@ -64,51 +64,55 @@ void PASTEMAC(ch,varname) \ ctype alpha1; \ ctype alpha0_chi1; \ ctype alpha1_chi1; \ - ctype alpha0_chi1_psi1; \ ctype conjx0_chi1; \ ctype conjx1_chi1; \ ctype conjy0_psi1; \ + ctype conjy1_psi1; \ dim_t i; \ dim_t n_behind; \ dim_t n_ahead; \ inc_t rs_ct, cs_ct; \ - conj_t conj0, conj1; \ - conj_t conjh_conjx; \ + conj_t conjh, conjx0, conjy0, conjx1, conjy1; \ \ - /* Eliminate unused variable warnings. */ \ - ( void )conjh_conjx; \ + rs_ct = rs_c; \ + cs_ct = cs_c; \ +\ + conjh = ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) ? BLIS_CONJUGATE : BLIS_NO_CONJUGATE; \ + conjx0 = conjx; \ + conjy1 = conjy; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_ct = rs_c; \ - cs_ct = cs_c; \ -\ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_ct = cs_c; \ - cs_ct = rs_c; \ + bli_swap_incs( &rs_ct, &cs_ct ); \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ - conjx = bli_apply_conj( conjh, conjx ); \ - conjy = bli_apply_conj( conjh, conjy ); \ + conjx0 = bli_apply_conj( conjh, conjx0 ); \ + conjy1 = bli_apply_conj( conjh, conjy1 ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha0 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ - conj0 = bli_apply_conj( conjh, conjy ); \ - conj1 = conjy; \ - conjh_conjx = bli_apply_conj( conjh, conjx ); \ + conjx1 = bli_apply_conj( conjh, conjx0 ); \ + conjy0 = bli_apply_conj( conjh, conjy1 ); \ \ /* Query the context for the kernel function pointer. */ \ axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ @@ -126,22 +130,19 @@ void PASTEMAC(ch,varname) \ c21 = c + (i+1)*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ - PASTEMAC(ch,copycjs)( conjx, *chi1, conjx0_chi1 ); \ - PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \ - PASTEMAC(ch,copycjs)( conj0, *psi1, conjy0_psi1 ); \ + PASTEMAC(ch,copycjs)( conjx0, *chi1, conjx0_chi1 ); \ + PASTEMAC(ch,copycjs)( conjx1, *chi1, conjx1_chi1 ); \ + PASTEMAC(ch,copycjs)( conjy0, *psi1, conjy0_psi1 ); \ + PASTEMAC(ch,copycjs)( conjy1, *psi1, conjy1_psi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \ -\ - /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have - already been conjugated, if needed, by conjx and conjy. */ \ - PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \ \ /* c10t = c10t + alpha * chi1 * y0'; */ \ kfp_av \ ( \ - conj0, \ + conjy0, \ n_behind, \ &alpha0_chi1, \ y0, incy, \ @@ -152,7 +153,7 @@ void PASTEMAC(ch,varname) \ /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \ kfp_av \ ( \ - conj1, \ + conjy1, \ n_ahead, \ &alpha1_chi1, \ y2, incy, \ @@ -160,15 +161,17 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ \ - /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ - + conj(alpha) * psi1 * conj(chi1); */ \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ + /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + +/- conj(alpha) * psi1 * conj(chi1); */ \ + PASTEMAC(ch,axpys)( alpha0_chi1, conjy0_psi1, *gamma11 ); \ + PASTEMAC(ch,axpys)( alpha1_chi1, conjy1_psi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( *gamma11 ); \ } \ } diff --git a/frame/2/her2/bli_her2_unb_var4.c b/frame/2/her2/bli_her2_unb_var4.c index 59902654d9..0a577b9f99 100644 --- a/frame/2/her2/bli_her2_unb_var4.c +++ b/frame/2/her2/bli_her2_unb_var4.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ @@ -63,53 +63,54 @@ void PASTEMAC(ch,varname) \ ctype alpha1; \ ctype alpha0_psi1; \ ctype alpha1_chi1; \ - ctype alpha0_chi1_psi1; \ - ctype conjy0_psi1; \ - ctype conjx1_chi1; \ ctype conjx0_chi1; \ + ctype conjx1_chi1; \ + ctype conjy0_psi1; \ + ctype conjy1_psi1; \ dim_t i; \ dim_t n_ahead; \ inc_t rs_ct, cs_ct; \ - conj_t conj0, conj1; \ - conj_t conjh_conjx; \ - conj_t conjh_conjy; \ + conj_t conjh, conjx0, conjy0, conjx1, conjy1; \ +\ + rs_ct = rs_c; \ + cs_ct = cs_c; \ \ - /* Eliminate unused variable warnings. */ \ - ( void )conjh_conjx; \ - ( void )conjh_conjy; \ + conjh = ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) ? BLIS_CONJUGATE : BLIS_NO_CONJUGATE; \ + conjx0 = conjx; \ + conjy1 = conjy; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_ct = rs_c; \ - cs_ct = cs_c; \ -\ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_ct = cs_c; \ - cs_ct = rs_c; \ + bli_swap_incs( &rs_ct, &cs_ct ); \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ - conjx = bli_apply_conj( conjh, conjx ); \ - conjy = bli_apply_conj( conjh, conjy ); \ + conjx0 = bli_apply_conj( conjh, conjx0 ); \ + conjy1 = bli_apply_conj( conjh, conjy1 ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha0 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ - conj0 = conjx; \ - conj1 = conjy; \ - conjh_conjx = bli_apply_conj( conjh, conjx ); \ - conjh_conjy = bli_apply_conj( conjh, conjy ); \ + conjx1 = bli_apply_conj( conjh, conjx0 ); \ + conjy0 = bli_apply_conj( conjh, conjy1 ); \ \ /* Query the context for the kernel function pointer. */ \ axpyv_ker_ft kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ @@ -125,22 +126,19 @@ void PASTEMAC(ch,varname) \ c21 = c + (i+1)*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ - PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \ - PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \ - PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \ + PASTEMAC(ch,copycjs)( conjx0, *chi1, conjx0_chi1 ); \ + PASTEMAC(ch,copycjs)( conjx1, *chi1, conjx1_chi1 ); \ + PASTEMAC(ch,copycjs)( conjy0, *psi1, conjy0_psi1 ); \ + PASTEMAC(ch,copycjs)( conjy1, *psi1, conjy1_psi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \ -\ - /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have - already been conjugated, if needed, by conjx and conjy. */ \ - PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \ \ /* c21 = c21 + alpha * x2 * conj(psi1); */ \ kfp_av \ ( \ - conj0, \ + conjx0, \ n_ahead, \ &alpha0_psi1, \ x2, incx, \ @@ -151,7 +149,7 @@ void PASTEMAC(ch,varname) \ /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \ kfp_av \ ( \ - conj1, \ + conjy1, \ n_ahead, \ &alpha1_chi1, \ y2, incy, \ @@ -159,15 +157,17 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ \ - /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ - + conj(alpha) * psi1 * conj(chi1); */ \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ + /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + +/- conj(alpha) * psi1 * conj(chi1); */ \ + PASTEMAC(ch,axpys)( alpha0_psi1, conjx0_chi1, *gamma11 ); \ + PASTEMAC(ch,axpys)( alpha1_chi1, conjy1_psi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( *gamma11 ); \ } \ } diff --git a/frame/2/her2/bli_her2_unf_var1.c b/frame/2/her2/bli_her2_unf_var1.c index aa0de6a2f2..a3113578f4 100644 --- a/frame/2/her2/bli_her2_unf_var1.c +++ b/frame/2/her2/bli_her2_unf_var1.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ @@ -63,45 +63,54 @@ void PASTEMAC(ch,varname) \ ctype alpha1; \ ctype alpha0_chi1; \ ctype alpha1_psi1; \ - ctype alpha0_chi1_psi1; \ ctype conjx0_chi1; \ - ctype conjy1_psi1; \ + ctype conjx1_chi1; \ ctype conjy0_psi1; \ + ctype conjy1_psi1; \ dim_t i; \ dim_t n_behind; \ inc_t rs_ct, cs_ct; \ - conj_t conj0, conj1; \ + conj_t conjh, conjx0, conjy0, conjx1, conjy1; \ +\ + rs_ct = rs_c; \ + cs_ct = cs_c; \ +\ + conjh = ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) ? BLIS_CONJUGATE : BLIS_NO_CONJUGATE; \ + conjx0 = conjx; \ + conjy1 = conjy; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_ct = rs_c; \ - cs_ct = cs_c; \ -\ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_ct = cs_c; \ - cs_ct = rs_c; \ + bli_swap_incs( &rs_ct, &cs_ct ); \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ - conjx = bli_apply_conj( conjh, conjx ); \ - conjy = bli_apply_conj( conjh, conjy ); \ + conjx0 = bli_apply_conj( conjh, conjx0 ); \ + conjy1 = bli_apply_conj( conjh, conjy1 ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha0 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ - conj0 = bli_apply_conj( conjh, conjy ); \ - conj1 = bli_apply_conj( conjh, conjx ); \ + conjx1 = bli_apply_conj( conjh, conjx0 ); \ + conjy0 = bli_apply_conj( conjh, conjy1 ); \ \ /* Query the context for the kernel function pointer. */ \ axpy2v_ker_ft kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \ @@ -117,24 +126,21 @@ void PASTEMAC(ch,varname) \ gamma11 = c + (i )*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ - PASTEMAC(ch,copycjs)( conjx, *chi1, conjx0_chi1 ); \ - PASTEMAC(ch,copycjs)( conjy, *psi1, conjy1_psi1 ); \ - PASTEMAC(ch,copycjs)( conj0, *psi1, conjy0_psi1 ); \ + PASTEMAC(ch,copycjs)( conjx0, *chi1, conjx0_chi1 ); \ + PASTEMAC(ch,copycjs)( conjx1, *chi1, conjx1_chi1 ); \ + PASTEMAC(ch,copycjs)( conjy0, *psi1, conjy0_psi1 ); \ + PASTEMAC(ch,copycjs)( conjy1, *psi1, conjy1_psi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \ -\ - /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have - already been conjugated, if needed, by conjx and conjy. */ \ - PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \ \ /* c10t = c10t + alpha * chi1 * y0'; */ \ /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \ kfp_2v \ ( \ - conj0, \ - conj1, \ + conjy0, \ + conjx1, \ n_behind, \ &alpha0_chi1, \ &alpha1_psi1, \ @@ -144,15 +150,17 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ \ - /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ - + conj(alpha) * psi1 * conj(chi1); */ \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ + /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + +/- conj(alpha) * psi1 * conj(chi1); */ \ + PASTEMAC(ch,axpys)( alpha0_chi1, conjy0_psi1, *gamma11 ); \ + PASTEMAC(ch,axpys)( alpha1_psi1, conjx1_chi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( *gamma11 ); \ } \ } diff --git a/frame/2/her2/bli_her2_unf_var4.c b/frame/2/her2/bli_her2_unf_var4.c index 4095e5e653..9739175a45 100644 --- a/frame/2/her2/bli_her2_unf_var4.c +++ b/frame/2/her2/bli_her2_unf_var4.c @@ -39,10 +39,10 @@ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ @@ -63,53 +63,54 @@ void PASTEMAC(ch,varname) \ ctype alpha1; \ ctype alpha0_psi1; \ ctype alpha1_chi1; \ - ctype alpha0_chi1_psi1; \ - ctype conjy0_psi1; \ - ctype conjx1_chi1; \ ctype conjx0_chi1; \ + ctype conjx1_chi1; \ + ctype conjy0_psi1; \ + ctype conjy1_psi1; \ dim_t i; \ dim_t n_ahead; \ inc_t rs_ct, cs_ct; \ - conj_t conj0, conj1; \ - conj_t conjh_conjx; \ - conj_t conjh_conjy; \ + conj_t conjh, conjx0, conjy0, conjx1, conjy1; \ +\ + rs_ct = rs_c; \ + cs_ct = cs_c; \ \ - /* Eliminate unused variable warnings. */ \ - ( void )conjh_conjx; \ - ( void )conjh_conjy; \ + conjh = ( bli_is_hermitian( struc ) || bli_is_skew_hermitian( struc ) ) ? BLIS_CONJUGATE : BLIS_NO_CONJUGATE; \ + conjx0 = conjx; \ + conjy1 = conjy; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ - rs_ct = rs_c; \ - cs_ct = cs_c; \ -\ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ - rs_ct = cs_c; \ - cs_ct = rs_c; \ + bli_swap_incs( &rs_ct, &cs_ct ); \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ - conjx = bli_apply_conj( conjh, conjx ); \ - conjy = bli_apply_conj( conjh, conjy ); \ + conjx0 = bli_apply_conj( conjh, conjx0 ); \ + conjy1 = bli_apply_conj( conjh, conjy1 ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ +\ + if ( bli_is_skew_symmetric( struc ) || bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,negs)( alpha0 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ - conj0 = conjx; \ - conj1 = conjy; \ - conjh_conjx = bli_apply_conj( conjh, conjx ); \ - conjh_conjy = bli_apply_conj( conjh, conjy ); \ + conjx1 = bli_apply_conj( conjh, conjx0 ); \ + conjy0 = bli_apply_conj( conjh, conjy1 ); \ \ /* Query the context for the kernel function pointer. */ \ axpy2v_ker_ft kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \ @@ -125,24 +126,21 @@ void PASTEMAC(ch,varname) \ c21 = c + (i+1)*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ - PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \ - PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \ - PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \ + PASTEMAC(ch,copycjs)( conjx0, *chi1, conjx0_chi1 ); \ + PASTEMAC(ch,copycjs)( conjx1, *chi1, conjx1_chi1 ); \ + PASTEMAC(ch,copycjs)( conjy0, *psi1, conjy0_psi1 ); \ + PASTEMAC(ch,copycjs)( conjy1, *psi1, conjy1_psi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \ -\ - /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have - already been conjugated, if needed, by conjx and conjy. */ \ - PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \ \ /* c21 = c21 + alpha * x2 * conj(psi1); */ \ /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \ kfp_2v \ ( \ - conj0, \ - conj1, \ + conjx0, \ + conjy1, \ n_ahead, \ &alpha0_psi1, \ &alpha1_chi1, \ @@ -152,15 +150,17 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ \ - /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ - + conj(alpha) * psi1 * conj(chi1); */ \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ - PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ + /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + +/- conj(alpha) * psi1 * conj(chi1); */ \ + PASTEMAC(ch,axpys)( alpha0_psi1, conjx0_chi1, *gamma11 ); \ + PASTEMAC(ch,axpys)( alpha1_chi1, conjy1_psi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ - if ( bli_is_conj( conjh ) ) \ + if ( bli_is_hermitian( struc ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ + if ( bli_is_skew_hermitian( struc ) ) \ + PASTEMAC(ch,setr0s)( *gamma11 ); \ } \ } diff --git a/frame/2/her2/bli_her2_var.h b/frame/2/her2/bli_her2_var.h index 52862940bb..74671408c8 100644 --- a/frame/2/her2/bli_her2_var.h +++ b/frame/2/her2/bli_her2_var.h @@ -42,9 +42,7 @@ \ void PASTEMAC(opname) \ ( \ - conj_t conjh, \ obj_t* alpha, \ - obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ @@ -75,10 +73,10 @@ GENPROT( her2_unf_var4 ) \ void PASTEMAC(ch,varname) \ ( \ + struc_t struc, \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ - conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ diff --git a/frame/2/her2/bli_her2_var_oapi.c b/frame/2/her2/bli_her2_var_oapi.c index 75217c7700..546cc86bc4 100644 --- a/frame/2/her2/bli_her2_var_oapi.c +++ b/frame/2/her2/bli_her2_var_oapi.c @@ -39,9 +39,7 @@ \ void PASTEMAC(varname) \ ( \ - conj_t conjh, \ obj_t* alpha, \ - obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ @@ -53,6 +51,7 @@ void PASTEMAC(varname) \ \ num_t dt = bli_obj_dt( c ); \ \ + struc_t struc = bli_obj_struc( c ); \ uplo_t uplo = bli_obj_uplo( c ); \ conj_t conjx = bli_obj_conj_status( x ); \ conj_t conjy = bli_obj_conj_status( y ); \ @@ -78,10 +77,10 @@ void PASTEMAC(varname) \ \ f \ ( \ + struc, \ uplo, \ conjx, \ conjy, \ - conjh, \ m, \ buf_alpha, \ buf_x, incx, \ diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 93146bc180..a8520cdcc1 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -126,6 +126,40 @@ void bli_hemm_check bli_check_error_code( e_val ); } +void bli_shmm_check + ( + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx + ) +{ + err_t e_val; + + // Perform checks common to hemm/symm/trmm/trsm. + + bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx ); + + // Check matrix squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + + // Check object structure. + + e_val = bli_check_skew_hermitian_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( c ); + bli_check_error_code( e_val ); +} + void bli_herk_check ( const obj_t* alpha, @@ -210,6 +244,49 @@ void bli_her2k_check bli_check_error_code( e_val ); } +void bli_shr2k_check + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx + ) +{ + err_t e_val; + obj_t ah, bh; + + // Alias A and B to A^H and B^H so we can perform dimension checks. + bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah ); + bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, b, &bh ); + + // Check basic properties of the operation. + + bli_her2k_basic_check( alpha, a, &bh, b, &ah, beta, c, cntx ); + + // Check matrix squareness. + + e_val = bli_check_square_object( c ); + bli_check_error_code( e_val ); + + // Check matrix structure. + + e_val = bli_check_skew_hermitian_object( c ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + // Check for real-valued beta. + + e_val = bli_check_real_valued_object( beta ); + bli_check_error_code( e_val ); +} + void bli_symm_check ( side_t side, @@ -244,6 +321,40 @@ void bli_symm_check bli_check_error_code( e_val ); } +void bli_skmm_check + ( + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx + ) +{ + err_t e_val; + + // Check basic properties of the operation. + + bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx ); + + // Check matrix squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + + // Check object structure. + + e_val = bli_check_skew_symmetric_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( c ); + bli_check_error_code( e_val ); +} + void bli_syrk_check ( const obj_t* alpha, @@ -315,6 +426,44 @@ void bli_syr2k_check bli_check_error_code( e_val ); } +void bli_skr2k_check + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx + ) +{ + err_t e_val; + obj_t at, bt; + + // Alias A and B to A^T and B^T so we can perform dimension checks. + bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at ); + bli_obj_alias_with_trans( BLIS_TRANSPOSE, b, &bt ); + + // Check basic properties of the operation. + + bli_her2k_basic_check( alpha, a, &bt, b, &at, beta, c, cntx ); + + // Check matrix squareness. + + e_val = bli_check_square_object( c ); + bli_check_error_code( e_val ); + + // Check matrix structure. + + e_val = bli_check_skew_symmetric_object( c ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); +} + void bli_trmm3_check ( side_t side, diff --git a/frame/3/bli_l3_check.h b/frame/3/bli_l3_check.h index ef59cb60fc..33d756a0b1 100644 --- a/frame/3/bli_l3_check.h +++ b/frame/3/bli_l3_check.h @@ -54,6 +54,8 @@ GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) +GENPROT( shr2k ) +GENPROT( skr2k ) #undef GENPROT @@ -72,6 +74,8 @@ void PASTEMAC(opname,_check) \ GENPROT( hemm ) GENPROT( symm ) +GENPROT( shmm ) +GENPROT( skmm ) GENPROT( trmm3 ) diff --git a/frame/3/bli_l3_ind.c b/frame/3/bli_l3_ind.c index 73e3661374..69a38c3428 100644 --- a/frame/3/bli_l3_ind.c +++ b/frame/3/bli_l3_ind.c @@ -39,9 +39,9 @@ // the induced methods. This array is meant to be read-only. static const bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = { - /* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */ -/* 1m */ { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE }, -/* nat */ { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE } + /* gemm gemmt hemm shmm herk her2k shr2k symm skmm syrk syr2k skr2k trmm3 trmm trsm */ +/* 1m */ { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE }, +/* nat */ { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE } }; // @@ -55,13 +55,16 @@ static const bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS static BLIS_THREAD_LOCAL bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = { - /* gemm gemmt hemm herk her2k symm - syrk syr2k trmm3 trmm trsm */ + /* gemm gemmt hemm shmm herk her2k + shr2k symm skmm syrk syr2k skr2k + trmm3 trmm trsm */ /* c z */ /* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, - {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, + {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, + {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, /* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, - {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE} }, + {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, + {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE} }, }; // A mutex to allow synchronous access to the bli_l3_ind_oper_st array. @@ -87,6 +90,8 @@ GENFUNC( gemm, BLIS_GEMM ) GENFUNC( gemmt, BLIS_GEMMT ) GENFUNC( hemm, BLIS_HEMM ) GENFUNC( symm, BLIS_SYMM ) +GENFUNC( shmm, BLIS_SHMM ) +GENFUNC( skmm, BLIS_SKMM ) GENFUNC( trmm3, BLIS_TRMM3 ) GENFUNC( trmm, BLIS_TRMM ) GENFUNC( trsm, BLIS_TRSM ) @@ -203,7 +208,7 @@ void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool statu if ( !bli_is_complex( dt ) ) return; if ( !bli_opid_is_level3( oper ) ) return; - // BLIS currently implements herk/her2k/syrk/syr2k in terms of the user- + // BLIS currently implements herk/her2k/shr2k/syrk/syr2k/skr2k in terms of the user- // level gemmt (expert) API, and so those operations choose to execute // 1m (or not) based on the induced method enablement status of gemmt. // In other words, changing the enablement status of those operations @@ -211,8 +216,10 @@ void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool statu // operations' induced method enablement statuses to that of gemmt. if ( method != BLIS_NAT && ( oper == BLIS_HERK || oper == BLIS_HER2K || + oper == BLIS_SHR2K || oper == BLIS_SYRK || - oper == BLIS_SYR2K ) ) + oper == BLIS_SYR2K || + oper == BLIS_SKR2K ) ) oper = BLIS_GEMMT; // Disallow changing status of native execution. @@ -250,8 +257,10 @@ bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ) // operations' induced method enablement statuses to that of gemmt. if ( method != BLIS_NAT && ( oper == BLIS_HERK || oper == BLIS_HER2K || + oper == BLIS_SHR2K || oper == BLIS_SYRK || - oper == BLIS_SYR2K ) ) + oper == BLIS_SYR2K || + oper == BLIS_SKR2K ) ) oper = BLIS_GEMMT; { diff --git a/frame/3/bli_l3_ind.h b/frame/3/bli_l3_ind.h index a14ad783c9..e3dfd161e1 100644 --- a/frame/3/bli_l3_ind.h +++ b/frame/3/bli_l3_ind.h @@ -48,6 +48,8 @@ GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) +GENPROT( shmm ) +GENPROT( skmm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index 2344b9eb81..db46e1a4f0 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -59,6 +59,8 @@ GENFRONT( gemm ) GENFRONT( gemmt ) GENFRONT( her2k ) GENFRONT( syr2k ) +GENFRONT( shr2k ) +GENFRONT( skr2k ) #undef GENFRONT @@ -81,6 +83,8 @@ void PASTEMAC(opname) \ GENFRONT( hemm ) GENFRONT( symm ) +GENFRONT( shmm ) +GENFRONT( skmm ) GENFRONT( trmm3 ) diff --git a/frame/3/bli_l3_oapi.h b/frame/3/bli_l3_oapi.h index 12fb68e971..e1815bc518 100644 --- a/frame/3/bli_l3_oapi.h +++ b/frame/3/bli_l3_oapi.h @@ -54,6 +54,8 @@ GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) +GENPROT( shr2k ) +GENPROT( skr2k ) #undef GENPROT @@ -71,6 +73,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname) \ GENPROT( hemm ) GENPROT( symm ) +GENPROT( shmm ) +GENPROT( skmm ) GENPROT( trmm3 ) diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index fcb51bb45c..342c607290 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -291,6 +291,92 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) } +void PASTEMAC(shr2k,BLIS_OAPI_EX_SUF) + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + bli_init_once(); + + obj_t ah; + obj_t bh; + obj_t alphah; + obj_t minus_alphah; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_shr2k_check( alpha, a, b, beta, c, cntx ); + + bli_obj_alias_to( alpha, &alphah ); + bli_obj_toggle_conj( &alphah ); + + bli_obj_scalar_init_detached( bli_obj_dt( alpha ), &minus_alphah ); + bli_negsc( &alphah, &minus_alphah ); + + bli_obj_alias_to( a, &ah ); + bli_obj_toggle_trans( &ah ); + bli_obj_toggle_conj( &ah ); + + bli_obj_alias_to( b, &bh ); + bli_obj_toggle_trans( &bh ); + bli_obj_toggle_conj( &bh ); + + // Invoke gemmt twice, using beta only the first time. + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bh, beta, c, cntx, rntm ); + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( &minus_alphah, b, &ah, &BLIS_ONE, c, cntx, rntm ); + + // The skew-Hermitian rank-2k product was computed as alpha*A*B'-alpha'*B*A', even for + // the diagonal elements. Mathematically, the real components of + // diagonal elements of a skew-Hermitian rank-2k product should always be + // zero. However, in practice, they sometimes accumulate meaningless + // non-zero values. To prevent this, we explicitly set those values + // to zero before returning. + bli_setrd( &BLIS_ZERO, c ); +} + + +void PASTEMAC(skr2k,BLIS_OAPI_EX_SUF) + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + bli_init_once(); + + obj_t at; + obj_t bt; + obj_t minus_alpha; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_skr2k_check( alpha, a, b, beta, c, cntx ); + + bli_obj_scalar_init_detached( bli_obj_dt( alpha ), &minus_alpha ); + bli_negsc( alpha, &minus_alpha ); + + bli_obj_alias_to( b, &bt ); + bli_obj_toggle_trans( &bt ); + + bli_obj_alias_to( a, &at ); + bli_obj_toggle_trans( &at ); + + // Invoke gemmt twice, using beta only the first time. + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bt, beta, c, cntx, rntm ); + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( &minus_alpha, b, &at, &BLIS_ONE, c, cntx, rntm ); +} + + void PASTEMAC(herk,BLIS_OAPI_EX_SUF) ( const obj_t* alpha, @@ -519,6 +605,180 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) } +void PASTEMAC(shmm,BLIS_OAPI_EX_SUF) + ( + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + bli_init_once(); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_shmm_check( side, alpha, a, b, beta, c, cntx ); + + // Check for zero dimensions, alpha == 0, or other conditions which + // mean that we don't actually have to perform a full l3 operation. + if ( bli_l3_return_early_if_trivial( alpha, a, b, beta, c ) == BLIS_SUCCESS ) + return; + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_shmmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + // Alias A, B, and C in case we need to apply transformations. + obj_t a_local; + obj_t b_local; + obj_t c_local; + bli_obj_alias_submatrix( a, &a_local ); + bli_obj_alias_submatrix( b, &b_local ); + bli_obj_alias_submatrix( c, &c_local ); + + // If the Hermitian/symmetric matrix A is being multiplied from the right, + // swap A and B so that the Hermitian/symmetric matrix will actually be on + // the right. + if ( bli_is_right( side ) ) + { + bli_obj_swap( &a_local, &b_local ); + } + + gemm_cntl_t cntl; + bli_gemm_cntl_init + ( + im, + BLIS_SHMM, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + &cntl + ); + + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + &a_local, + &b_local, + &c_local, + cntx, + ( cntl_t* )&cntl, + rntm + ); +} + + +void PASTEMAC(skmm,BLIS_OAPI_EX_SUF) + ( + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + bli_init_once(); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_skmm_check( side, alpha, a, b, beta, c, cntx ); + + // Check for zero dimensions, alpha == 0, or other conditions which + // mean that we don't actually have to perform a full l3 operation. + if ( bli_l3_return_early_if_trivial( alpha, a, b, beta, c ) == BLIS_SUCCESS ) + return; + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_skmmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + // Alias A, B, and C in case we need to apply transformations. + obj_t a_local; + obj_t b_local; + obj_t c_local; + bli_obj_alias_submatrix( a, &a_local ); + bli_obj_alias_submatrix( b, &b_local ); + bli_obj_alias_submatrix( c, &c_local ); + + // If the Hermitian/symmetric matrix A is being multiplied from the right, + // swap A and B so that the Hermitian/symmetric matrix will actually be on + // the right. + if ( bli_is_right( side ) ) + { + bli_obj_swap( &a_local, &b_local ); + } + + gemm_cntl_t cntl; + bli_gemm_cntl_init + ( + im, + BLIS_SKMM, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + &cntl + ); + + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + &a_local, + &b_local, + &c_local, + cntx, + ( cntl_t* )&cntl, + rntm + ); +} + + void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) ( side_t side, diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h index 09f7f4a5d0..ad4e66e94d 100644 --- a/frame/3/bli_l3_oapi_ex.h +++ b/frame/3/bli_l3_oapi_ex.h @@ -56,6 +56,8 @@ GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) +GENPROT( shr2k ) +GENPROT( skr2k ) #ifdef BLIS_ENABLE_SANDBOX GENPROT( gemm_def ) #endif @@ -78,6 +80,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ GENPROT( hemm ) GENPROT( symm ) +GENPROT( shmm ) +GENPROT( skmm ) GENPROT( trmm3 ) diff --git a/frame/3/bli_l3_tapi.c b/frame/3/bli_l3_tapi.c index 164f4401e5..2bd89d7be3 100644 --- a/frame/3/bli_l3_tapi.c +++ b/frame/3/bli_l3_tapi.c @@ -152,6 +152,8 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC( hemm, BLIS_HERMITIAN ) INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC ) +INSERT_GENTFUNC_BASIC( shmm, BLIS_SKEW_HERMITIAN ) +INSERT_GENTFUNC_BASIC( skmm, BLIS_SKEW_SYMMETRIC ) #undef GENTFUNCR @@ -224,6 +226,7 @@ void PASTEMAC(ch,opname) \ } INSERT_GENTFUNCR_BASIC( her2k ) +INSERT_GENTFUNCR_BASIC( shr2k ) #undef GENTFUNC @@ -296,6 +299,7 @@ void PASTEMAC(ch,opname) \ } INSERT_GENTFUNC_BASIC( syr2k ) +INSERT_GENTFUNC_BASIC( skr2k ) #undef GENTFUNC diff --git a/frame/3/bli_l3_tapi.h b/frame/3/bli_l3_tapi.h index 81ddf6945a..b0c94309a1 100644 --- a/frame/3/bli_l3_tapi.h +++ b/frame/3/bli_l3_tapi.h @@ -77,6 +77,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ INSERT_GENTPROT_BASIC( hemm ) INSERT_GENTPROT_BASIC( symm ) +INSERT_GENTPROT_BASIC( shmm ) +INSERT_GENTPROT_BASIC( skmm ) #undef GENTPROTR @@ -115,6 +117,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ); INSERT_GENTPROTR_BASIC( her2k ) +INSERT_GENTPROTR_BASIC( shr2k ) #undef GENTPROT @@ -154,6 +157,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ INSERT_GENTPROT_BASIC( gemmt ) INSERT_GENTPROT_BASIC( syr2k ) +INSERT_GENTPROT_BASIC( skr2k ) #undef GENTPROT diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c index 04560c1ca5..5605b13591 100644 --- a/frame/3/bli_l3_tapi_ex.c +++ b/frame/3/bli_l3_tapi_ex.c @@ -162,6 +162,8 @@ void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \ INSERT_GENTFUNC_BASIC( hemm, BLIS_HERMITIAN ) INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC ) +INSERT_GENTFUNC_BASIC( shmm, BLIS_SKEW_HERMITIAN ) +INSERT_GENTFUNC_BASIC( skmm, BLIS_SKEW_SYMMETRIC ) #undef GENTFUNCR @@ -221,7 +223,7 @@ INSERT_GENTFUNCR_BASIC( herk ) #undef GENTFUNCR -#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, struca ) \ \ void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ @@ -267,7 +269,7 @@ void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ - bli_obj_set_struc( BLIS_HERMITIAN, &co ); \ + bli_obj_set_struc( struca, &co ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ @@ -281,7 +283,8 @@ void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \ ); \ } -INSERT_GENTFUNCR_BASIC( her2k ) +INSERT_GENTFUNCR_BASIC( her2k, BLIS_HERMITIAN ) +INSERT_GENTFUNCR_BASIC( shr2k, BLIS_SKEW_HERMITIAN ) #undef GENTFUNC @@ -340,7 +343,7 @@ INSERT_GENTFUNC_BASIC( syrk ) #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ +#define GENTFUNC( ctype, ch, opname, struca ) \ \ void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ @@ -385,7 +388,7 @@ void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ - bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \ + bli_obj_set_struc( struca, &co ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ @@ -399,7 +402,8 @@ void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \ ); \ } -INSERT_GENTFUNC_BASIC( syr2k ) +INSERT_GENTFUNC_BASIC( syr2k, BLIS_SYMMETRIC ) +INSERT_GENTFUNC_BASIC( skr2k, BLIS_SKEW_SYMMETRIC ) #undef GENTFUNC diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h index 872cecfa79..5010ebfd26 100644 --- a/frame/3/bli_l3_tapi_ex.h +++ b/frame/3/bli_l3_tapi_ex.h @@ -81,6 +81,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \ INSERT_GENTPROT_BASIC( hemm ) INSERT_GENTPROT_BASIC( symm ) +INSERT_GENTPROT_BASIC( shmm ) +INSERT_GENTPROT_BASIC( skmm ) #undef GENTPROTR @@ -123,6 +125,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \ ); INSERT_GENTPROTR_BASIC( her2k ) +INSERT_GENTPROTR_BASIC( shr2k ) #undef GENTPROT @@ -166,6 +169,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \ INSERT_GENTPROT_BASIC( gemmt ) INSERT_GENTPROT_BASIC( syr2k ) +INSERT_GENTPROT_BASIC( skr2k ) #undef GENTPROT diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c index e949b6361e..75e3a8d48e 100644 --- a/frame/base/bli_check.c +++ b/frame/base/bli_check.c @@ -642,6 +642,26 @@ err_t bli_check_symmetric_object( const obj_t* a ) return e_val; } +err_t bli_check_skew_hermitian_object( const obj_t* a ) +{ + err_t e_val = BLIS_SUCCESS; + + if ( !bli_obj_is_skew_hermitian( a ) ) + e_val = BLIS_EXPECTED_SKEW_HERMITIAN_OBJECT; + + return e_val; +} + +err_t bli_check_skew_symmetric_object( const obj_t* a ) +{ + err_t e_val = BLIS_SUCCESS; + + if ( !bli_obj_is_skew_symmetric( a ) ) + e_val = BLIS_EXPECTED_SKEW_SYMMETRIC_OBJECT; + + return e_val; +} + err_t bli_check_triangular_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h index d4690b0adb..f7c3f4ac76 100644 --- a/frame/base/bli_check.h +++ b/frame/base/bli_check.h @@ -83,6 +83,8 @@ err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ) err_t bli_check_general_object( const obj_t* a ); err_t bli_check_hermitian_object( const obj_t* a ); err_t bli_check_symmetric_object( const obj_t* a ); +err_t bli_check_skew_hermitian_object( const obj_t* a ); +err_t bli_check_skew_symmetric_object( const obj_t* a ); err_t bli_check_triangular_object( const obj_t* a ); err_t bli_check_object_struc( const obj_t* a, struc_t struc ); diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c index 415e9ae059..b04ff9bc59 100644 --- a/frame/base/bli_error.c +++ b/frame/base/bli_error.c @@ -82,6 +82,8 @@ static const char *bli_error_string[-BLIS_ERROR_CODE_MAX] = [-BLIS_EXPECTED_HERMITIAN_OBJECT] = "Expected Hermitian object.", [-BLIS_EXPECTED_SYMMETRIC_OBJECT] = "Expected symmetric object.", [-BLIS_EXPECTED_TRIANGULAR_OBJECT] = "Expected triangular object.", + [-BLIS_EXPECTED_SKEW_HERMITIAN_OBJECT] = "Expected skew-Hermitian object.", + [-BLIS_EXPECTED_SKEW_SYMMETRIC_OBJECT] = "Expected skew-symmetric object.", [-BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT] = "Expected upper or lower triangular object.", diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 53ec287cb5..93ecb40e1a 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -233,6 +233,10 @@ const char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get const char* bli_info_get_symm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM, dt ); } const char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } const char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_shmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SHMM, dt ); } +const char* bli_info_get_shr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_skmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SKMM, dt ); } +const char* bli_info_get_skr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } const char* bli_info_get_trmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM, dt ); } const char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); } const char* bli_info_get_trsm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM, dt ); } diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 50c337fea4..5aad80c531 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -104,6 +104,10 @@ BLIS_EXPORT_BLIS const char* bli_info_get_her2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS const char* bli_info_get_symm_impl_string( num_t dt ); BLIS_EXPORT_BLIS const char* bli_info_get_syrk_impl_string( num_t dt ); BLIS_EXPORT_BLIS const char* bli_info_get_syr2k_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_shmm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_shr2k_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_skmm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_skr2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS const char* bli_info_get_trmm_impl_string( num_t dt ); BLIS_EXPORT_BLIS const char* bli_info_get_trmm3_impl_string( num_t dt ); BLIS_EXPORT_BLIS const char* bli_info_get_trsm_impl_string( num_t dt ); diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 79b74a2c82..266320111c 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -336,6 +336,18 @@ BLIS_INLINE bool bli_obj_is_symmetric( const obj_t* obj ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } +BLIS_INLINE bool bli_obj_is_skew_hermitian( const obj_t* obj ) +{ + return ( bool ) + ( bli_obj_struc( obj ) == BLIS_BITVAL_SKEW_HERMITIAN ); +} + +BLIS_INLINE bool bli_obj_is_skew_symmetric( const obj_t* obj ) +{ + return ( bool ) + ( bli_obj_struc( obj ) == BLIS_BITVAL_SKEW_SYMMETRIC ); +} + BLIS_INLINE bool bli_obj_is_triangular( const obj_t* obj ) { return ( bool ) @@ -504,6 +516,18 @@ BLIS_INLINE bool bli_obj_root_is_symmetric( const obj_t* obj ) ( bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } +BLIS_INLINE bool bli_obj_root_is_skew_hermitian( const obj_t* obj ) +{ + return ( bool ) + ( bli_obj_is_skew_hermitian( bli_obj_root( obj ) ) ); +} + +BLIS_INLINE bool bli_obj_root_is_skew_symmetric( const obj_t* obj ) +{ + return ( bool ) + ( bli_obj_is_skew_symmetric( bli_obj_root( obj ) ) ); +} + BLIS_INLINE bool bli_obj_root_is_triangular( const obj_t* obj ) { return ( bool ) @@ -517,6 +541,13 @@ BLIS_INLINE bool bli_obj_root_is_herm_or_symm( const obj_t* obj ) bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } +BLIS_INLINE bool bli_obj_root_is_skew_herm_or_symm( const obj_t* obj ) +{ + return ( bool ) + ( bli_obj_is_skew_hermitian( bli_obj_root( obj ) ) || + bli_obj_is_skew_symmetric( bli_obj_root( obj ) ) ); +} + BLIS_INLINE bool bli_obj_root_is_upper( const obj_t* obj ) { return ( bool ) diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 77ee0ddc4e..d6fdf8ec15 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -366,6 +366,18 @@ BLIS_INLINE bool bli_is_symmetric( struc_t struc ) ( struc == BLIS_SYMMETRIC ); } +BLIS_INLINE bool bli_is_skew_hermitian( struc_t struc ) +{ + return ( bool ) + ( struc == BLIS_SKEW_HERMITIAN ); +} + +BLIS_INLINE bool bli_is_skew_symmetric( struc_t struc ) +{ + return ( bool ) + ( struc == BLIS_SKEW_SYMMETRIC ); +} + BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) @@ -379,6 +391,13 @@ BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) bli_is_symmetric( struc ) ); } +BLIS_INLINE bool bli_is_skew_herm_or_symm( struc_t struc ) +{ + return ( bool ) + ( bli_is_skew_hermitian( struc ) || + bli_is_skew_symmetric( struc ) ); +} + // conj diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index 2eea517fdd..66bbb952f6 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -97,6 +97,7 @@ #include "bli_invscaljris.h" #include "bli_neg2ris.h" +#include "bli_negris.h" #include "bli_scalris.h" #include "bli_scaljris.h" @@ -165,6 +166,7 @@ #include "bli_invscaljs.h" #include "bli_neg2s.h" +#include "bli_negs.h" #include "bli_rands.h" #include "bli_randnp2s.h" @@ -180,6 +182,7 @@ #include "bli_set1s.h" +#include "bli_setr0s.h" #include "bli_seti0s.h" #include "bli_sqrt2s.h" diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index df0b2a4252..c1942718a2 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -273,7 +273,7 @@ typedef void (*free_ft) ( void* p ); #define BLIS_PACK_REV_IF_UPPER_NUM_BITS 1 #define BLIS_PACK_REV_IF_LOWER_NUM_BITS 1 #define BLIS_PACK_BUFFER_NUM_BITS 2 -#define BLIS_STRUC_NUM_BITS 2 +#define BLIS_STRUC_NUM_BITS 3 // @@ -307,6 +307,10 @@ typedef void (*free_ft) ( void* p ); // This is the total number of bits, which should always be <= 32 #define BLIS_INFO_NUM_BITS ( BLIS_SCALAR_DT_SHIFT + BLIS_DATATYPE_NUM_BITS ) +#if BLIS_INFO_NUM_BITS > 32 +#error "Too many info bits" +#endif + // // -- BLIS info bit field masks ------------------------------------------------ // @@ -384,6 +388,8 @@ typedef void (*free_ft) ( void* p ); #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) +#define BLIS_BITVAL_SKEW_HERMITIAN ( 0x4 << BLIS_STRUC_SHIFT ) +#define BLIS_BITVAL_SKEW_SYMMETRIC ( 0x5 << BLIS_STRUC_SHIFT ) // @@ -437,7 +443,9 @@ typedef enum BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, - BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR + BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR, + BLIS_SKEW_HERMITIAN = BLIS_BITVAL_SKEW_HERMITIAN, + BLIS_SKEW_SYMMETRIC = BLIS_BITVAL_SKEW_SYMMETRIC, } struc_t; @@ -863,11 +871,15 @@ typedef enum BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, + BLIS_SHMM, BLIS_HERK, BLIS_HER2K, + BLIS_SHR2K, BLIS_SYMM, + BLIS_SKMM, BLIS_SYRK, BLIS_SYR2K, + BLIS_SKR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, @@ -1550,6 +1562,8 @@ typedef enum BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), + BLIS_EXPECTED_SKEW_HERMITIAN_OBJECT = ( -64), + BLIS_EXPECTED_SKEW_SYMMETRIC_OBJECT = ( -65), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), diff --git a/frame/include/level0/bli_negs.h b/frame/include/level0/bli_negs.h new file mode 100644 index 0000000000..a1f3b9c6da --- /dev/null +++ b/frame/include/level0/bli_negs.h @@ -0,0 +1,60 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Southern Methodist University + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_NEGS_H +#define BLIS_NEGS_H + +// negs + +// Notes: +// - The first char encodes the type of x. + +#define bli_snegs( x ) bli_snegris( bli_sreal(x), bli_simag(x) ) +#define bli_dnegs( x ) bli_dnegris( bli_dreal(x), bli_dimag(x) ) + +#ifndef BLIS_ENABLE_C99_COMPLEX + +#define bli_cnegs( x ) bli_cnegris( bli_creal(x), bli_cimag(x) ) +#define bli_znegs( x ) bli_znegris( bli_zreal(x), bli_zimag(x) ) + +#else // ifdef BLIS_ENABLE_C99_COMPLEX + +#define bli_cnegs( x ) { (x) = -(x); } +#define bli_znegs( x ) { (x) = -(x); } + +#endif // BLIS_ENABLE_C99_COMPLEX + + +#endif + diff --git a/frame/include/level0/bli_setr0s.h b/frame/include/level0/bli_setr0s.h new file mode 100644 index 0000000000..c87147c271 --- /dev/null +++ b/frame/include/level0/bli_setr0s.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SETR0S_H +#define BLIS_SETR0S_H + +#define bli_ssetr0s( a ) bli_ssetrs( 0.0F, (a) ) +#define bli_dsetr0s( a ) bli_dsetrs( 0.0 , (a) ) +#define bli_csetr0s( a ) bli_csetrs( 0.0F, (a) ) +#define bli_zsetr0s( a ) bli_zsetrs( 0.0 , (a) ) + +#endif + diff --git a/frame/include/level0/bli_sets.h b/frame/include/level0/bli_sets.h index 758fc29d6d..4d9cb6b842 100644 --- a/frame/include/level0/bli_sets.h +++ b/frame/include/level0/bli_sets.h @@ -41,19 +41,19 @@ // - The first char encodes the type of x. // - The second char encodes the type of y. -#define bli_sssets( xr, xi, y ) { (y) = (xr); } -#define bli_dssets( xr, xi, y ) { (y) = (xr); } -#define bli_cssets( xr, xi, y ) { (y) = (xr); } -#define bli_zssets( xr, xi, y ) { (y) = (xr); } -#define bli_issets( xr, xi, y ) { (y) = (xr); } +#define bli_sssets( xr, xi, y ) { (y) = (xr); (void)(xi); } +#define bli_dssets( xr, xi, y ) { (y) = (xr); (void)(xi); } +#define bli_cssets( xr, xi, y ) { (y) = (xr); (void)(xi); } +#define bli_zssets( xr, xi, y ) { (y) = (xr); (void)(xi); } +#define bli_issets( xr, xi, y ) { (y) = (xr); (void)(xi); } -#define bli_sdsets( xr, xi, y ) { (y) = (xr); } -#define bli_ddsets( xr, xi, y ) { (y) = (xr); } -#define bli_cdsets( xr, xi, y ) { (y) = (xr); } -#define bli_zdsets( xr, xi, y ) { (y) = (xr); } -#define bli_idsets( xr, xi, y ) { (y) = (xr); } +#define bli_sdsets( xr, xi, y ) { (y) = (xr); (void)(xi); } +#define bli_ddsets( xr, xi, y ) { (y) = (xr); (void)(xi); } +#define bli_cdsets( xr, xi, y ) { (y) = (xr); (void)(xi); } +#define bli_zdsets( xr, xi, y ) { (y) = (xr); (void)(xi); } +#define bli_idsets( xr, xi, y ) { (y) = (xr); (void)(xi); } -#ifndef BLIS_ENABLE_C99_COMPLEX +#ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } @@ -81,11 +81,11 @@ #endif // BLIS_ENABLE_C99_COMPLEX -#define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } -#define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } -#define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } -#define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } -#define bli_iisets( xr, xi, y ) { (y) = (xr); } +#define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); (void)(xi); } +#define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); (void)(xi); } +#define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); (void)(xi); } +#define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); (void)(xi); } +#define bli_iisets( xr, xi, y ) { (y) = (xr); (void)(xi); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) diff --git a/frame/include/level0/ri/bli_copycjris.h b/frame/include/level0/ri/bli_copycjris.h index c832323701..35712c35b2 100644 --- a/frame/include/level0/ri/bli_copycjris.h +++ b/frame/include/level0/ri/bli_copycjris.h @@ -39,11 +39,13 @@ #define bli_scopycjris( conj, xr, xi, yr, yi ) \ { \ + (void)conj; \ bli_scopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_dcopycjris( conj, xr, xi, yr, yi ) \ { \ + (void)conj; \ bli_dcopyris( (xr), (xi), (yr), (yi) ); \ } @@ -63,6 +65,7 @@ #define bli_icopycjris( conj, xr, xi, yr, yi ) \ { \ + (void)conj; \ bli_icopyris( (xr), (xi), (yr), (yi) ); \ } diff --git a/frame/include/level0/ri/bli_negris.h b/frame/include/level0/ri/bli_negris.h new file mode 100644 index 0000000000..1f9c28eeb9 --- /dev/null +++ b/frame/include/level0/ri/bli_negris.h @@ -0,0 +1,63 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_NEGRIS_H +#define BLIS_NEGRIS_H + +// negris + +#define bli_snegris( ar, ai ) \ +{ \ + (ar) = -(ar); \ +} + +#define bli_dnegris( ar, ai ) \ +{ \ + (ar) = -(ar); \ +} + +#define bli_cnegris( ar, ai ) \ +{ \ + (ar) = -(ar); \ + (ai) = -(ai); \ +} + +#define bli_znegris( ar, ai ) \ +{ \ + (ar) = -(ar); \ + (ai) = -(ai); \ +} + +#endif + diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c index 3fafb4e50c..88114023d5 100644 --- a/frame/util/bli_util_check.c +++ b/frame/util/bli_util_check.c @@ -66,6 +66,8 @@ void PASTEMAC(opname,_check) \ GENFRONT( mkherm ) GENFRONT( mksymm ) +GENFRONT( mkskewherm ) +GENFRONT( mkskewsymm ) GENFRONT( mktrim ) diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h index 26986b52cc..90089bc515 100644 --- a/frame/util/bli_util_check.h +++ b/frame/util/bli_util_check.h @@ -59,6 +59,8 @@ void PASTEMAC(opname,_check) \ GENPROT( mkherm ) GENPROT( mksymm ) +GENPROT( mkskewherm ) +GENPROT( mkskewsymm ) GENPROT( mktrim ) diff --git a/frame/util/bli_util_fpa.c b/frame/util/bli_util_fpa.c index bbba052c62..b63d29852d 100644 --- a/frame/util/bli_util_fpa.c +++ b/frame/util/bli_util_fpa.c @@ -53,6 +53,8 @@ PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \ GENFRONT( asumv ) GENFRONT( mkherm ) GENFRONT( mksymm ) +GENFRONT( mkskewherm ) +GENFRONT( mkskewsymm ) GENFRONT( mktrim ) GENFRONT( norm1v ) GENFRONT( normfv ) diff --git a/frame/util/bli_util_fpa.h b/frame/util/bli_util_fpa.h index 5ee0f4adb3..f7cd276806 100644 --- a/frame/util/bli_util_fpa.h +++ b/frame/util/bli_util_fpa.h @@ -45,6 +45,8 @@ PASTEMAC(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( asumv ) GENPROT( mkherm ) GENPROT( mksymm ) +GENPROT( mkskewherm ) +GENPROT( mkskewsymm ) GENPROT( mktrim ) GENPROT( norm1v ) GENPROT( normfv ) diff --git a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h index 2bb1943d76..616ff68d59 100644 --- a/frame/util/bli_util_ft.h +++ b/frame/util/bli_util_ft.h @@ -67,6 +67,8 @@ typedef void (*PASTECH(ch,opname,EX_SUF,tsuf)) \ INSERT_GENTDEF( mkherm ) INSERT_GENTDEF( mksymm ) +INSERT_GENTDEF( mkskewherm ) +INSERT_GENTDEF( mkskewsymm ) INSERT_GENTDEF( mktrim ) // norm1v, normfv, normiv diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c index 4810b6f001..42efbcdac8 100644 --- a/frame/util/bli_util_oapi.c +++ b/frame/util/bli_util_oapi.c @@ -124,6 +124,8 @@ void PASTEMAC(opname,EX_SUF) \ GENFRONT( mkherm ) GENFRONT( mksymm ) +GENFRONT( mkskewherm ) +GENFRONT( mkskewsymm ) GENFRONT( mktrim ) diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h index 2a1d700d8e..df02502510 100644 --- a/frame/util/bli_util_oapi.h +++ b/frame/util/bli_util_oapi.h @@ -61,6 +61,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ GENPROT( mkherm ) GENPROT( mksymm ) +GENPROT( mkskewherm ) +GENPROT( mkskewsymm ) GENPROT( mktrim ) diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index c3521f244c..9adc2d24c1 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -116,6 +116,8 @@ void PASTEMAC(ch,opname,EX_SUF) \ INSERT_GENTFUNC_BASIC( mkherm ) INSERT_GENTFUNC_BASIC( mksymm ) +INSERT_GENTFUNC_BASIC( mkskewherm ) +INSERT_GENTFUNC_BASIC( mkskewsymm ) INSERT_GENTFUNC_BASIC( mktrim ) diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h index 715b22a26c..78e50c4770 100644 --- a/frame/util/bli_util_tapi.h +++ b/frame/util/bli_util_tapi.h @@ -64,6 +64,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,EX_SUF) \ INSERT_GENTPROT_BASIC( mkherm ) INSERT_GENTPROT_BASIC( mksymm ) +INSERT_GENTPROT_BASIC( mkskewherm ) +INSERT_GENTPROT_BASIC( mkskewsymm ) INSERT_GENTPROT_BASIC( mktrim ) diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index b3767e6a8b..bb67be3ca4 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -182,6 +182,122 @@ void PASTEMAC(ch,varname) \ INSERT_GENTFUNC_BASIC( mksymm_unb_var1 ) +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ) \ +{ \ + ctype_r* zeror = PASTEMAC(chr,0); \ + ctype* minus_one = PASTEMAC(ch,m1); \ + doff_t diagoffa; \ +\ + /* If the dimension is zero, return early. */ \ + if ( bli_zero_dim1( m ) ) return; \ +\ + /* In order to avoid the main diagonal, we must nudge the diagonal either + up or down by one, depending on which triangle is currently stored. */ \ + if ( bli_is_upper( uploa ) ) diagoffa = 1; \ + else /*if ( bli_is_lower( uploa ) )*/ diagoffa = -1; \ +\ + /* We will be reflecting the stored region over the diagonal into the + unstored region, so a transposition is necessary. Furthermore, since + we are creating a Hermitian matrix, we must also conjugate. */ \ + PASTEMAC(ch,scal2m,BLIS_TAPI_EX_SUF) \ + ( \ + diagoffa, \ + BLIS_NONUNIT_DIAG, \ + uploa, \ + BLIS_CONJ_TRANSPOSE, \ + m, \ + m, \ + minus_one, \ + a, rs_a, cs_a, \ + a, rs_a, cs_a, \ + cntx, \ + rntm \ + ); \ +\ + /* Set the real parts of the diagonal elements to zero. */ \ + PASTEMAC(ch,setrd,BLIS_TAPI_EX_SUF) \ + ( \ + 0, \ + m, \ + m, \ + zeror, \ + a, rs_a, cs_a, \ + cntx, \ + rntm \ + ); \ +} + +INSERT_GENTFUNCR_BASIC( mkskewherm_unb_var1 ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + cntx_t* cntx, \ + rntm_t* rntm \ + ) \ +{ \ + doff_t diagoffa; \ + ctype* zero = PASTEMAC(ch,0); \ + ctype* minus_one = PASTEMAC(ch,m1); \ +\ + /* If the dimension is zero, return early. */ \ + if ( bli_zero_dim1( m ) ) return; \ +\ + /* In order to avoid the main diagonal, we must nudge the diagonal either + up or down by one, depending on which triangle is currently stored. */ \ + if ( bli_is_upper( uploa ) ) diagoffa = 1; \ + else /*if ( bli_is_lower( uploa ) )*/ diagoffa = -1; \ +\ + /* We will be reflecting the stored region over the diagonal into the + unstored region, so a transposition is necessary. */ \ + PASTEMAC(ch,scal2m,BLIS_TAPI_EX_SUF) \ + ( \ + diagoffa, \ + BLIS_NONUNIT_DIAG, \ + uploa, \ + BLIS_TRANSPOSE, \ + m, \ + m, \ + minus_one, \ + a, rs_a, cs_a, \ + a, rs_a, cs_a, \ + cntx, \ + rntm \ + ); \ +\ + /* Set the diagonal elements to zero. */ \ + PASTEMAC(ch,setd,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + m, \ + m, \ + zero, \ + a, rs_a, cs_a, \ + cntx, \ + rntm \ + ); \ +} + +INSERT_GENTFUNC_BASIC( mkskewsymm_unb_var1 ) + + #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ diff --git a/frame/util/bli_util_unb_var1.h b/frame/util/bli_util_unb_var1.h index 978183f965..94721ae00e 100644 --- a/frame/util/bli_util_unb_var1.h +++ b/frame/util/bli_util_unb_var1.h @@ -66,6 +66,8 @@ void PASTEMAC(ch,varname) \ INSERT_GENTPROT_BASIC( mkherm_unb_var1 ) INSERT_GENTPROT_BASIC( mksymm_unb_var1 ) +INSERT_GENTPROT_BASIC( mkskewherm_unb_var1 ) +INSERT_GENTPROT_BASIC( mkskewsymm_unb_var1 ) INSERT_GENTPROT_BASIC( mktrim_unb_var1 ) diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c index 0148d3f924..a0af822bc5 100644 --- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c @@ -51,7 +51,8 @@ void bli_ddotxaxpyf_penryn_int conj_t conjx, dim_t m, dim_t b_n, - const void* alpha, + const void* alphaw, + const void* alphax, const void* a, inc_t inca, inc_t lda, const void* w, inc_t incw, const void* x, inc_t incx, @@ -61,13 +62,14 @@ void bli_ddotxaxpyf_penryn_int const cntx_t* cntx ) { - const double* restrict alpha_cast = alpha; - const double* restrict a_cast = a; - const double* restrict w_cast = w; - const double* restrict x_cast = x; - const double* restrict beta_cast = beta; - double* restrict y_cast = y; - double* restrict z_cast = z; + const double* restrict alphaw_cast = alphaw; + const double* restrict alphax_cast = alphax; + const double* restrict a_cast = a; + const double* restrict w_cast = w; + const double* restrict x_cast = x; + const double* restrict beta_cast = beta; + double* restrict y_cast = y; + double* restrict z_cast = z; const dim_t n_elem_per_reg = 2; const dim_t n_iter_unroll = 2; @@ -152,7 +154,8 @@ void bli_ddotxaxpyf_penryn_int conjx, m, b_n, - alpha_cast, + alphaw_cast, + alphax_cast, a_cast, inca, lda, w_cast, incw, x_cast, incx, @@ -182,10 +185,10 @@ void bli_ddotxaxpyf_penryn_int chi2 = *(x_cast + 2*incx); chi3 = *(x_cast + 3*incx); - PASTEMAC(d,d,scals)( *alpha_cast, chi0 ); - PASTEMAC(d,d,scals)( *alpha_cast, chi1 ); - PASTEMAC(d,d,scals)( *alpha_cast, chi2 ); - PASTEMAC(d,d,scals)( *alpha_cast, chi3 ); + PASTEMAC(d,d,scals)( *alphax_cast, chi0 ); + PASTEMAC(d,d,scals)( *alphax_cast, chi1 ); + PASTEMAC(d,d,scals)( *alphax_cast, chi2 ); + PASTEMAC(d,d,scals)( *alphax_cast, chi3 ); PASTEMAC(d,set0s)( rho0 ); PASTEMAC(d,set0s)( rho1 ); @@ -341,7 +344,7 @@ void bli_ddotxaxpyf_penryn_int rho1v.d[1] = rho3; betav.v = _mm_loaddup_pd( ( double* ) beta_cast ); - alphav.v = _mm_loaddup_pd( ( double* ) alpha_cast ); + alphav.v = _mm_loaddup_pd( ( double* ) alphaw_cast ); psi0v.v = _mm_load_pd( ( double* )(y_cast + 0*n_elem_per_reg ) ); psi1v.v = _mm_load_pd( ( double* )(y_cast + 1*n_elem_per_reg ) ); diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c index dc1bca9f6a..710eb5a392 100644 --- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c +++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c @@ -47,7 +47,8 @@ conj_t conjx, \ dim_t m, \ dim_t b, \ - const T* restrict alpha_, \ + const T* restrict alphaw_, \ + const T* restrict alphax_, \ const T* restrict a_, inc_t inca, inc_t lda, \ const T* restrict w_, inc_t incw, \ const T* restrict x_, inc_t incx, \ diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c index d8a984064d..d4c740fccd 100644 --- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c @@ -126,9 +126,9 @@ RVV_TYPE_F(PREC, m1) y##i##_r, y##i##_i; \ if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { \ if (bli_is_conj(conjatw)) \ - VCMUL_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1); \ + VCMUL_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alphaw->real, alphaw->imag, 1); \ else \ - VCMUL_VF(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1); \ + VCMUL_VF(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alphaw->real, alphaw->imag, 1); \ y[i * incy].real = VFMV_F_S(PREC)(y##i##_r); \ y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i); \ } \ @@ -137,9 +137,9 @@ y##i##_r = VFMV_S_F(PREC, m1)(y[i * incy].real, 1); \ y##i##_i = VFMV_S_F(PREC, m1)(y[i * incy].imag, 1); \ if (bli_is_conj(conjatw)) \ - VCMACC_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \ + VCMACC_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, alphaw->real, alphaw->imag, dot##i##_r, dot##i##_i, 1); \ else \ - VCMACC_VF(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \ + VCMACC_VF(PREC, m1, y##i##_r, y##i##_i, alphaw->real, alphaw->imag, dot##i##_r, dot##i##_i, 1); \ y[i * incy].real = VFMV_F_S(PREC)(y##i##_r); \ y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i); \ } \ @@ -147,10 +147,12 @@ DOTXAXPYF(PRECISION_CHAR, void) { - // Computes y := beta * y + alpha * conjat(A^T) * conjx(x) - + // Computes y := beta * y + alphaw * conjat(A^T) * conjx(x) + // z := z + alphax * conja(A) * conjx(x) + (void) cntx; // Suppress unused parameter warnings - const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict alphaw = alphaw_; + const DATATYPE* restrict alphax = alphax_; const DATATYPE* restrict a = a_; const DATATYPE* restrict w = w_; const DATATYPE* restrict x = x_; @@ -159,7 +161,7 @@ DOTXAXPYF(PRECISION_CHAR, void) DATATYPE* restrict z = z_; if (b == 0) return; - if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) { + if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alphaw)) { if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); else @@ -262,7 +264,7 @@ DOTXAXPYF(PRECISION_CHAR, void) } } } - + RVV_TYPE_FX(PREC, LMUL, 2) zvec; if (incz == 1) zvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, vl); @@ -271,9 +273,9 @@ DOTXAXPYF(PRECISION_CHAR, void) RVV_TYPE_F(PREC, LMUL) zvec_r = VGET_V_F(PREC, LMUL, 2)(zvec, 0); RVV_TYPE_F(PREC, LMUL) zvec_i = VGET_V_F(PREC, LMUL, 2)(zvec, 1); if (bli_is_conj(conjax)) - VCMACC_VF_CONJ(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + VCMACC_VF_CONJ(PREC, LMUL, zvec_r, zvec_i, alphax->real, alphax->imag, zacc_r, zacc_i, vl); else - VCMACC_VF(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + VCMACC_VF(PREC, LMUL, zvec_r, zvec_i, alphax->real, alphax->imag, zacc_r, zacc_i, vl); zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_r); zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_i); if (incz == 1) @@ -379,7 +381,7 @@ DOTXAXPYF(PRECISION_CHAR, void) } } } - + RVV_TYPE_FX(PREC, LMUL, 2) zvec; if (incz == 1) zvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, vl); @@ -388,9 +390,9 @@ DOTXAXPYF(PRECISION_CHAR, void) RVV_TYPE_F(PREC, LMUL) zvec_r = VGET_V_F(PREC, LMUL, 2)(zvec, 0); RVV_TYPE_F(PREC, LMUL) zvec_i = VGET_V_F(PREC, LMUL, 2)(zvec, 1); if (bli_is_conj(conjax)) - VCMACC_VF_CONJ(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + VCMACC_VF_CONJ(PREC, LMUL, zvec_r, zvec_i, alphax->real, alphax->imag, zacc_r, zacc_i, vl); else - VCMACC_VF(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + VCMACC_VF(PREC, LMUL, zvec_r, zvec_i, alphax->real, alphax->imag, zacc_r, zacc_i, vl); zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_r); zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_i); if (incz == 1) diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c index 57ef4f7447..272de7dc5b 100644 --- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c +++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c @@ -118,28 +118,29 @@ RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1); \ dot##i = VF_REDUSUM_VS(PREC, LMUL)(yacc##i, dot##i, m); \ if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { \ - dot##i = VFMUL_VF(PREC, m1)(dot##i, *alpha, 1); \ + dot##i = VFMUL_VF(PREC, m1)(dot##i, *alphaw, 1); \ y[i * incy] = VFMV_F_S(PREC)(dot##i); \ } \ else { \ y[i * incy] *= *beta; \ RVV_TYPE_F(PREC, m1) y##i = VFMV_S_F(PREC, m1)(y[i * incy], 1); \ - y##i = VFMACC_VF(PREC, m1)(y##i, *alpha, dot##i, 1); \ + y##i = VFMACC_VF(PREC, m1)(y##i, *alphaw, dot##i, 1); \ y[i * incy] = VFMV_F_S(PREC)(y##i); \ } \ } while (0) DOTXAXPYF(PRECISION_CHAR, void) { - // Computes y := beta * y + alpha * conjat(A^T) * conjw(w) - // z := z + alpha * conja(A) * conjx(x) - + // Computes y := beta * y + alphaw * conjat(A^T) * conjw(w) + // z := z + alphax * conja(A) * conjx(x) + (void) conjat; // Suppress unused parameter warnings (void) conja; (void) conjw; (void) conjx; (void) cntx; - const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict alphaw = alphaw_; + const DATATYPE* restrict alphax = alphax_; const DATATYPE* restrict a = a_; const DATATYPE* restrict w = w_; const DATATYPE* restrict x = x_; @@ -148,7 +149,7 @@ DOTXAXPYF(PRECISION_CHAR, void) DATATYPE* restrict z = z_; if (b == 0) return; - if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) { + if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alphaw)) { if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); else @@ -191,12 +192,12 @@ DOTXAXPYF(PRECISION_CHAR, void) zvec = VLE_V_F(PREC, LMUL)(z_tmp, vl); else zvec = VLSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, vl); - zvec = VFMACC_VF(PREC, LMUL)(zvec, *alpha, zacc, vl); + zvec = VFMACC_VF(PREC, LMUL)(zvec, *alphax, zacc, vl); if (incz == 1) VSE_V_F(PREC, LMUL)(z_tmp, zvec, vl); else VSSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, zvec, vl); - + a_tmp += vl * inca; w_tmp += vl * incw; z_tmp += vl * incz; @@ -248,12 +249,12 @@ DOTXAXPYF(PRECISION_CHAR, void) zvec = VLE_V_F(PREC, LMUL)(z_tmp, vl); else zvec = VLSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, vl); - zvec = VFMACC_VF(PREC, LMUL)(zvec, *alpha, zacc, vl); + zvec = VFMACC_VF(PREC, LMUL)(zvec, *alphax, zacc, vl); if (incz == 1) VSE_V_F(PREC, LMUL)(z_tmp, zvec, vl); else VSSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, zvec, vl); - + a_tmp += vl * inca; w_tmp += vl * incw; z_tmp += vl * incz; diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c index 6cfa5168c5..c3cb1aa412 100644 --- a/ref_kernels/1f/bli_dotxaxpyf_ref.c +++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c @@ -46,7 +46,8 @@ void PASTEMAC(ch,opname,arch,suf) \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ - const void* alpha0, \ + const void* alphaw0, \ + const void* alphax0, \ const void* a0, inc_t inca, inc_t lda, \ const void* w0, inc_t incw, \ const void* x0, inc_t incx, \ @@ -57,16 +58,17 @@ void PASTEMAC(ch,opname,arch,suf) \ ) \ { \ /* A is m x n. */ \ - /* y = beta * y + alpha * A^T w; */ \ - /* z = z + alpha * A x; */ \ + /* y = beta * y + alphaw * A^T w; */ \ + /* z = z + alphax * A x; */ \ \ - const ctype* restrict alpha = alpha0; \ - const ctype* restrict a = a0; \ - const ctype* restrict w = w0; \ - const ctype* restrict x = x0; \ - const ctype* restrict beta = beta0; \ - ctype* restrict y = y0; \ - ctype* restrict z = z0; \ + const ctype* restrict alphaw = alphaw0; \ + const ctype* restrict alphax = alphax0; \ + const ctype* restrict a = a0; \ + const ctype* restrict w = w0; \ + const ctype* restrict x = x0; \ + const ctype* restrict beta = beta0; \ + ctype* restrict y = y0; \ + ctype* restrict z = z0; \ \ if ( 1 && inca == 1 && incw == 1 && incx == 1 && \ incy == 1 && incz == 1 && b_n == ff ) \ @@ -83,9 +85,6 @@ void PASTEMAC(ch,opname,arch,suf) \ { \ for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,scals)( *beta, y[i] ); \ } \ -\ - /* If the vectors are empty or if alpha is zero, return early. */ \ - if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize r vector to 0. */ \ for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( r[i] ); \ @@ -95,13 +94,13 @@ void PASTEMAC(ch,opname,arch,suf) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < ff; ++i ) \ - PASTEMAC(ch,scal2js)( *alpha, x[i], ax[i] ); \ + PASTEMAC(ch,scal2js)( *alphax, x[i], ax[i] ); \ } \ else \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < ff; ++i ) \ - PASTEMAC(ch,scal2s)( *alpha, x[i], ax[i] ); \ + PASTEMAC(ch,scal2s)( *alphax, x[i], ax[i] ); \ } \ \ /* If a must be conjugated, we do so indirectly by first toggling the @@ -164,7 +163,7 @@ void PASTEMAC(ch,opname,arch,suf) \ \ for ( dim_t i = 0; i < ff; ++i ) \ { \ - PASTEMAC(ch,axpys)( *alpha, r[i], y[i] ); \ + PASTEMAC(ch,axpys)( *alphaw, r[i], y[i] ); \ } \ } \ else \ @@ -180,7 +179,7 @@ void PASTEMAC(ch,opname,arch,suf) \ conjw, \ m, \ b_n, \ - alpha, \ + alphaw, \ a, inca, lda, \ w, incw, \ beta, \ @@ -194,7 +193,7 @@ void PASTEMAC(ch,opname,arch,suf) \ conjx, \ m, \ b_n, \ - alpha, \ + alphax, \ a, inca, lda, \ x, incx, \ z, incz, \ diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c index fad987c4bf..e81d307379 100644 --- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c @@ -51,12 +51,28 @@ do { \ } while (0) +#define PACKM_SET0_1E( chp_r, mnk ) \ +do { \ + PASTEMAC(chp_r,set0s)( *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \ + PASTEMAC(chp_r,set0s)( *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \ + PASTEMAC(chp_r,set0s)( *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \ + PASTEMAC(chp_r,set0s)( *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \ +} while (0) + + +#define PACKM_SET0_1R( chp_r, mnk ) \ +do { \ + PASTEMAC(chp_r,set0s)( *(pi1_r + mnk*cdim_bcast + d + mnk*ldp2) ); \ + PASTEMAC(chp_r,set0s)( *(pi1_i + mnk*cdim_bcast + d + mnk*ldp2) ); \ +} while (0) + + #define PACKM_SCAL_1E( ctypep_r, cha, chp, mn, k, op ) \ do { \ ctypep_r alpha_r, alpha_i, ka_r, ka_i; \ PASTEMAC(cha,chp,copyris)( *(alpha1 + mn *inca2 + 0 + k*lda2), \ - *(alpha1 + mn *inca2 + 1 + k*lda2), \ - alpha_r, alpha_i ); \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ + alpha_r, alpha_i ); \ PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \ PASTEMAC(chp,copyris)( ka_r, ka_i, *(pi1_ri + (mn*2 + 0)*cdim_bcast + d + k*ldp2), \ *(pi1_ri + (mn*2 + 1)*cdim_bcast + d + k*ldp2) ); \ @@ -69,8 +85,8 @@ do { \ do { \ ctypep_r alpha_r, alpha_i, ka_r, ka_i; \ PASTEMAC(cha,chp,copyris)( *(alpha1 + mn *inca2 + 0 + k*lda2), \ - *(alpha1 + mn *inca2 + 1 + k*lda2), \ - alpha_r, alpha_i ); \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ + alpha_r, alpha_i ); \ PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \ PASTEMAC(chp,copyris)( ka_r, ka_i, *(pi1_r + mn*cdim_bcast + d + k*ldp2), \ *(pi1_i + mn*cdim_bcast + d + k*ldp2) ); \ @@ -145,8 +161,6 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ const inc_t lda2 = 2 * lda; \ const inc_t ldp2 = 2 * ldp; \ \ - ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ - ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ ctypep_r one = *PASTEMAC(chp_r,1); \ ctypep_r zero = *PASTEMAC(chp_r,0); \ const ctypea_r* restrict alpha1 = ( const ctypea_r* )a; \ @@ -165,17 +179,23 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ ctypep_r* restrict pi1_ir = ( ctypep_r* )p + ldp; \ \ /* write the strictly lower part if it exists */ \ - if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) || bli_is_skew_herm_or_symm( struca ) ) \ { \ - dim_t inca_l2 = inca2; \ - dim_t lda_l2 = lda2; \ - conj_t conja_l = conja; \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ + ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ + ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ \ if ( bli_is_upper( uploa ) ) \ { \ bli_swap_incs( &inca_l2, &lda_l2 ); \ - if ( bli_is_hermitian( struca ) ) \ +\ + if ( bli_is_hermitian( struca ) || bli_is_skew_hermitian( struca ) ) \ bli_toggle_conj( &conja_l ); \ +\ + if ( bli_is_skew_symmetric( struca ) || bli_is_skew_hermitian( struca ) ) \ + PASTEMAC(chp,negris)( kappa_r, kappa_i ); \ } \ \ if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1E_L( ctypep_r, cha, chp, scal2jris ); \ @@ -184,22 +204,31 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ \ /* write the strictly upper part if it exists */ \ /* assume either symmetric, hermitian, or triangular */ \ - if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) || bli_is_skew_herm_or_symm( struca ) ) \ { \ - dim_t inca_u2 = inca2; \ - dim_t lda_u2 = lda2; \ - conj_t conja_u = conja; \ + dim_t inca_u2 = inca2; \ + dim_t lda_u2 = lda2; \ + conj_t conja_u = conja; \ + ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ + ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ \ if ( bli_is_lower( uploa ) ) \ { \ bli_swap_incs( &inca_u2, &lda_u2 ); \ - if ( bli_is_hermitian( struca ) ) \ +\ + if ( bli_is_hermitian( struca ) || bli_is_skew_hermitian( struca ) ) \ bli_toggle_conj( &conja_u ); \ +\ + if ( bli_is_skew_symmetric( struca ) || bli_is_skew_hermitian( struca ) ) \ + PASTEMAC(chp,negris)( kappa_r, kappa_i ); \ } \ \ if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1E_U( ctypep_r, cha, chp, scal2jris ); \ else PACKM_DIAG_BODY_1E_U( ctypep_r, cha, chp, scal2ris ); \ } \ +\ + ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ + ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ \ /* write the diagonal */ \ if ( bli_is_unit_diag( diaga ) ) \ @@ -221,6 +250,41 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ PASTEMAC(chp_r,scal2s)( kappa_r, alpha_r, *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \ } \ } \ + else if ( bli_is_skew_hermitian( struca ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + { \ + ctypep_r alpha_i; \ + PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2) + 1), alpha_i ); \ + PASTEMAC(chp_r,scal2s)( kappa_i, alpha_i, *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \ + PASTEMAC(chp_r,scal2s)( -kappa_r, alpha_i, *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \ + PASTEMAC(chp_r,scal2s)( kappa_r, alpha_i, *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \ + PASTEMAC(chp_r,scal2s)( kappa_i, alpha_i, *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \ + } \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + { \ + ctypep_r alpha_i; \ + PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2) + 1), alpha_i ); \ + PASTEMAC(chp_r,scal2s)( -kappa_i, alpha_i, *(pi1_ri + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \ + PASTEMAC(chp_r,scal2s)( kappa_r, alpha_i, *(pi1_ri + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \ + PASTEMAC(chp_r,scal2s)( -kappa_r, alpha_i, *(pi1_ir + (mnk*2 + 0)*cdim_bcast + d + mnk*ldp2) ); \ + PASTEMAC(chp_r,scal2s)( -kappa_i, alpha_i, *(pi1_ir + (mnk*2 + 1)*cdim_bcast + d + mnk*ldp2) ); \ + } \ + } \ + } \ + else if ( bli_is_skew_symmetric( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + PACKM_SET0_1E( chp_r, mnk ); \ + } \ else if ( bli_is_conj( conja )) \ { \ for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ @@ -268,17 +332,23 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ ctypep_r* restrict pi1_i = ( ctypep_r* )p + ldp; \ \ /* write the strictly lower part if it exists */ \ - if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) || bli_is_skew_herm_or_symm( struca ) ) \ { \ - dim_t inca_l2 = inca2; \ - dim_t lda_l2 = lda2; \ - conj_t conja_l = conja; \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ + ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ + ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ \ if ( bli_is_upper( uploa ) ) \ { \ bli_swap_incs( &inca_l2, &lda_l2 ); \ - if ( bli_is_hermitian( struca ) ) \ +\ + if ( bli_is_hermitian( struca ) || bli_is_skew_hermitian( struca ) ) \ bli_toggle_conj( &conja_l ); \ +\ + if ( bli_is_skew_symmetric( struca ) || bli_is_skew_hermitian( struca ) ) \ + PASTEMAC(chp,negris)( kappa_r, kappa_i ); \ } \ \ if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1R_L( ctypep_r, cha, chp, scal2jris ); \ @@ -287,22 +357,31 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ \ /* write the strictly upper part if it exists */ \ /* assume either symmetric, hermitian, or triangular */ \ - if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) || bli_is_skew_herm_or_symm( struca ) ) \ { \ - dim_t inca_u2 = inca2; \ - dim_t lda_u2 = lda2; \ - conj_t conja_u = conja; \ + dim_t inca_u2 = inca2; \ + dim_t lda_u2 = lda2; \ + conj_t conja_u = conja; \ + ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ + ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ \ if ( bli_is_lower( uploa ) ) \ { \ bli_swap_incs( &inca_u2, &lda_u2 ); \ - if ( bli_is_hermitian( struca ) ) \ +\ + if ( bli_is_hermitian( struca ) || bli_is_skew_hermitian( struca ) ) \ bli_toggle_conj( &conja_u ); \ +\ + if ( bli_is_skew_symmetric( struca ) || bli_is_skew_hermitian( struca ) ) \ + PASTEMAC(chp,negris)( kappa_r, kappa_i ); \ } \ \ if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1R_U( ctypep_r, cha, chp, scal2jris ); \ else PACKM_DIAG_BODY_1R_U( ctypep_r, cha, chp, scal2ris ); \ } \ +\ + ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ + ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ \ /* write the diagonal */ \ if ( bli_is_unit_diag( diaga ) ) \ @@ -322,6 +401,37 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ PASTEMAC(chp_r,scal2s)( kappa_i, alpha_r, *(pi1_i + mnk*(cdim_bcast + ldp2) + d) ); \ } \ } \ + else if ( bli_is_skew_hermitian( struca ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + { \ + ctypep_r alpha_i; \ + PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2) + 1), alpha_i ); \ + PASTEMAC(chp_r,scal2s)( kappa_i, alpha_i, *(pi1_r + mnk*(cdim_bcast + ldp2) + d) ); \ + PASTEMAC(chp_r,scal2s)( -kappa_r, alpha_i, *(pi1_i + mnk*(cdim_bcast + ldp2) + d) ); \ + } \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + { \ + ctypep_r alpha_i; \ + PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2) + 1), alpha_i ); \ + PASTEMAC(chp_r,scal2s)( -kappa_i, alpha_i, *(pi1_r + mnk*(cdim_bcast + ldp2) + d) ); \ + PASTEMAC(chp_r,scal2s)( kappa_r, alpha_i, *(pi1_i + mnk*(cdim_bcast + ldp2) + d) ); \ + } \ + } \ + } \ + else if ( bli_is_skew_symmetric( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + PACKM_SET0_1R( chp_r, mnk ); \ + } \ else if ( bli_is_conj( conja ) ) \ { \ for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c index 635bb9900b..670bef02db 100644 --- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c +++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c @@ -35,7 +35,7 @@ #include "blis.h" -#define PACKM_DIAG_BODY( ctypea, ctypep, cha, chp, mn_min, mn_max, dfac, inca, lda, op ) \ +#define PACKM_DIAG_BODY( ctypea, ctypep, cha, chp, mn_min, mn_max, cdim_bcast, inca, lda, op ) \ \ do \ { \ @@ -44,9 +44,9 @@ do \ { \ ctypep alpha_cast, kappa_alpha; \ PASTEMAC(cha,chp,copys)( *(alpha1 + mn*inca + k*lda), alpha_cast ); \ - PASTEMAC(chp,op)( kappa_cast, alpha_cast, kappa_alpha ); \ - for ( dim_t d = 0; d < dfac; d++ ) \ - PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mn*dfac + d + k*ldp) ); \ + PASTEMAC(chp,op)( kappa_use, alpha_cast, kappa_alpha ); \ + for ( dim_t d = 0; d < cdim_bcast; d++ ) \ + PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mn*cdim_bcast + d + k*ldp) ); \ } \ } while(0) @@ -93,17 +93,24 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ ctypep* restrict pi1 = p; \ \ /* write the strictly lower part if it exists */ \ - if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) || bli_is_skew_herm_or_symm( struca ) ) \ { \ dim_t inca_l = inca; \ dim_t lda_l = lda; \ conj_t conja_l = conja; \ + ctypep kappa_use; \ +\ + PASTEMAC(chp,copys)( kappa_cast, kappa_use ); \ \ if ( bli_is_upper( uploa ) ) \ { \ bli_swap_incs( &inca_l, &lda_l ); \ - if ( bli_is_hermitian( struca ) ) \ +\ + if ( bli_is_hermitian( struca ) || bli_is_skew_hermitian( struca ) ) \ bli_toggle_conj( &conja_l ); \ +\ + if ( bli_is_skew_symmetric( struca ) || bli_is_skew_hermitian( struca ) ) \ + PASTEMAC(chp,neg2s)( kappa_cast, kappa_use ); \ } \ \ if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_L( ctypea, ctypep, cha, chp, scal2js ); \ @@ -112,17 +119,26 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ \ /* write the strictly upper part if it exists */ \ /* assume either symmetric, hermitian, or triangular */ \ - if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) || bli_is_skew_herm_or_symm( struca ) ) \ { \ - dim_t inca_u = inca; \ - dim_t lda_u = lda; \ - conj_t conja_u = conja; \ + dim_t inca_u = inca; \ + dim_t lda_u = lda; \ + conj_t conja_u = conja; \ + ctypep kappa_use; \ +\ + PASTEMAC(chp,copys)( kappa_cast, kappa_use ); \ \ if ( bli_is_lower( uploa ) ) \ { \ bli_swap_incs( &inca_u, &lda_u ); \ - if ( bli_is_hermitian( struca ) ) \ +\ + if ( bli_is_hermitian( struca ) || \ + bli_is_skew_hermitian( struca ) ) \ bli_toggle_conj( &conja_u ); \ +\ + if ( bli_is_skew_symmetric( struca ) || \ + bli_is_skew_hermitian( struca ) ) \ + PASTEMAC(chp,neg2s)( kappa_cast, kappa_use ); \ } \ \ if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_U( ctypea, ctypep, cha, chp, scal2js ); \ @@ -148,7 +164,38 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \ } \ } \ - else if ( bli_is_conj( conja )) \ + else if ( bli_is_skew_hermitian( struca ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + { \ + ctypep alpha_cast; \ + PASTEMAC(cha,chp,copys)( *(alpha1 + mnk*(inca + lda)), alpha_cast ); \ + PASTEMAC(chp,setr0s)( alpha_cast ); \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + PASTEMAC(chp,scal2js)( kappa_cast, alpha_cast, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \ + } \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + { \ + ctypep alpha_cast; \ + PASTEMAC(cha,chp,copys)( *(alpha1 + mnk*(inca + lda)), alpha_cast ); \ + PASTEMAC(chp,setr0s)( alpha_cast ); \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + PASTEMAC(chp,scal2s)( kappa_cast, alpha_cast, *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \ + } \ + } \ + } \ + else if ( bli_is_skew_symmetric( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + PASTEMAC(chp,set0s)( *(pi1 + mnk*(cdim_bcast + ldp) + d) ); \ + } \ + else if ( bli_is_conj( conja ) ) \ { \ for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ { \ diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c index bb6fe939e6..dc33f44ba4 100644 --- a/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c +++ b/ref_kernels/1m/bli_packm_cxc_diag_ro_ref.c @@ -45,8 +45,8 @@ do { \ do { \ ctypep_r alpha_r, alpha_i, ka_r, ka_i; (void)ka_i; \ PASTEMAC(cha,chp,copyris)( *(alpha1 + mn *inca2 + 0 + k*lda2), \ - *(alpha1 + mn *inca2 + 1 + k*lda2), \ - alpha_r, alpha_i ); \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ + alpha_r, alpha_i ); \ PASTEMAC(chp,op)( kappa_r, kappa_i, alpha_r, alpha_i, ka_r, ka_i ); \ PASTEMAC(chp_r,copys)( ka_r, *(pi1_r + mn*cdim_bcast + d + k*ldp) ); \ } while (0) @@ -98,8 +98,6 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ \ - ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ - ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ ctypep_r one = *PASTEMAC(chp_r,1); \ const ctypea_r* restrict alpha1 = ( const ctypea_r* )a; \ \ @@ -114,17 +112,23 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ ctypep_r* restrict pi1_r = ( ctypep_r* )p; \ \ /* write the strictly lower part if it exists */ \ - if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) || bli_is_skew_herm_or_symm( struca ) ) \ { \ - dim_t inca_l2 = inca2; \ - dim_t lda_l2 = lda2; \ - conj_t conja_l = conja; \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ + ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ + ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ \ if ( bli_is_upper( uploa ) ) \ { \ bli_swap_incs( &inca_l2, &lda_l2 ); \ - if ( bli_is_hermitian( struca ) ) \ +\ + if ( bli_is_hermitian( struca ) || bli_is_skew_hermitian( struca ) ) \ bli_toggle_conj( &conja_l ); \ +\ + if ( bli_is_skew_symmetric( struca ) || bli_is_skew_hermitian( struca ) ) \ + PASTEMAC(chp,negris)( kappa_r, kappa_i ); \ } \ \ if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_RO_L( ctypep_r, cha, chp, chp_r, scal2jris ); \ @@ -133,22 +137,31 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ \ /* write the strictly upper part if it exists */ \ /* assume either symmetric, hermitian, or triangular */ \ - if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) || bli_is_skew_herm_or_symm( struca ) ) \ { \ - dim_t inca_u2 = inca2; \ - dim_t lda_u2 = lda2; \ - conj_t conja_u = conja; \ + dim_t inca_u2 = inca2; \ + dim_t lda_u2 = lda2; \ + conj_t conja_u = conja; \ + ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ + ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ \ if ( bli_is_lower( uploa ) ) \ { \ bli_swap_incs( &inca_u2, &lda_u2 ); \ - if ( bli_is_hermitian( struca ) ) \ +\ + if ( bli_is_hermitian( struca ) || bli_is_skew_hermitian( struca ) ) \ bli_toggle_conj( &conja_u ); \ +\ + if ( bli_is_skew_symmetric( struca ) || bli_is_skew_hermitian( struca ) ) \ + PASTEMAC(chp,negris)( kappa_r, kappa_i ); \ } \ \ if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_RO_U( ctypep_r, cha, chp, chp_r, scal2jris ); \ else PACKM_DIAG_BODY_RO_U( ctypep_r, cha, chp, chp_r, scal2ris ); \ } \ +\ + ctypep_r kappa_r = ( ( ctypep_r* )kappa )[0]; \ + ctypep_r kappa_i = ( ( ctypep_r* )kappa )[1]; \ \ /* write the diagonal */ \ if ( bli_is_unit_diag( diaga ) ) \ @@ -167,6 +180,22 @@ void PASTEMAC(cha,chp,opname,arch,suf) \ PASTEMAC(chp_r,scal2s)( kappa_r, alpha_r, *(pi1_r + mnk*(cdim_bcast + ldp) + d) ); \ } \ } \ + else if ( bli_is_skew_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + { \ + ctypep_r alpha_i; \ + PASTEMAC(cha_r,chp_r,copys)( *(alpha1 + mnk*(inca2 + lda2 + 1)), alpha_i ); \ + PASTEMAC(chp_r,scal2s)( -kappa_i, alpha_i, *(pi1_r + mnk*(cdim_bcast + ldp) + d) ); \ + } \ + } \ + else if ( bli_is_skew_symmetric( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < cdim_bcast; ++d ) \ + PASTEMAC(chp_r,set0s)( *(pi1_r + mnk*(cdim_bcast + ldp) + d) ); \ + } \ else if ( bli_is_conj( conja ) ) \ { \ for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ diff --git a/testsuite/input.operations b/testsuite/input.operations index e6c39e631e..0116d050ed 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -240,6 +240,10 @@ -1 # dimensions: m ??? # parameters: uploa conja conjx +1 # shmv +-1 # dimensions: m +??? # parameters: uploa conja conjx + 1 # her -1 # dimensions: m ?? # parameters: uploc conjx @@ -248,10 +252,18 @@ -1 # dimensions: m ??? # parameters: uploc conjx conjy +1 # shr2 +-1 # dimensions: m +??? # parameters: uploc conjx conjy + 1 # symv -1 # dimensions: m ??? # parameters: uploa conja conjx +1 # skmv +-1 # dimensions: m +??? # parameters: uploa conja conjx + 1 # syr -1 # dimensions: m ?? # parameters: uploc conjx @@ -260,6 +272,10 @@ -1 # dimensions: m ??? # parameters: uploc conjx conjy +1 # skr2 +-1 # dimensions: m +??? # parameters: uploc conjx conjy + 1 # trmv -1 # dimensions: m ??? # parameters: uploa transa diaga @@ -296,6 +312,10 @@ -1 -1 # dimensions: m n ???? # parameters: side uploa conja transb +1 # shmm +-1 -1 # dimensions: m n +???? # parameters: side uploa conja transb + 1 # herk -1 -1 # dimensions: m k ?? # parameters: uploc transa @@ -304,10 +324,18 @@ -1 -1 # dimensions: m k ??? # parameters: uploc transa transb +1 # shr2k +-1 -1 # dimensions: m k +??? # parameters: uploc transa transb + 1 # symm -1 -1 # dimensions: m n ???? # parameters: side uploa conja transb +1 # skmm +-1 -1 # dimensions: m n +???? # parameters: side uploa conja transb + 1 # syrk -1 -1 # dimensions: m k ?? # parameters: uploc transa @@ -316,6 +344,10 @@ -1 -1 # dimensions: m k ??? # parameters: uploc transa transb +1 # skr2k +-1 -1 # dimensions: m k +??? # parameters: uploc transa transb + 1 # trmm -1 -1 # dimensions: m n ???? # parameters: side uploa transa diaga diff --git a/testsuite/input.operations.fast b/testsuite/input.operations.fast index ecd526aaaf..a78c489990 100644 --- a/testsuite/input.operations.fast +++ b/testsuite/input.operations.fast @@ -240,6 +240,10 @@ -1 # dimensions: m ??? # parameters: uploa conja conjx +1 # shmv +-1 # dimensions: m +??? # parameters: uploa conja conjx + 1 # her -1 # dimensions: m ?? # parameters: uploc conjx @@ -248,10 +252,18 @@ -1 # dimensions: m ??? # parameters: uploc conjx conjy +1 # shr2 +-1 # dimensions: m +??? # parameters: uploc conjx conjy + 1 # symv -1 # dimensions: m ??? # parameters: uploa conja conjx +1 # skmv +-1 # dimensions: m +??? # parameters: uploa conja conjx + 1 # syr -1 # dimensions: m ?? # parameters: uploc conjx @@ -260,6 +272,10 @@ -1 # dimensions: m ??? # parameters: uploc conjx conjy +1 # skr2 +-1 # dimensions: m +??? # parameters: uploc conjx conjy + 1 # trmv -1 # dimensions: m ??? # parameters: uploa transa diaga @@ -296,6 +312,10 @@ nn # parameters: transa transb -1 -1 # dimensions: m n ??nn # parameters: side uploa conja transb +1 # shmm +-1 -1 # dimensions: m n +??nn # parameters: side uploa conja transb + 1 # herk -1 -1 # dimensions: m k ?n # parameters: uploc transa @@ -304,10 +324,18 @@ nn # parameters: transa transb -1 -1 # dimensions: m k ?nn # parameters: uploc transa transb +1 # shr2k +-1 -1 # dimensions: m k +?nn # parameters: uploc transa transb + 1 # symm -1 -1 # dimensions: m n ??nn # parameters: side uploa conja transb +1 # skmm +-1 -1 # dimensions: m n +??nn # parameters: side uploa conja transb + 1 # syrk -1 -1 # dimensions: m k ?n # parameters: uploc transa @@ -316,6 +344,10 @@ nn # parameters: transa transb -1 -1 # dimensions: m k ?nn # parameters: uploc transa transb +1 # skr2k +-1 -1 # dimensions: m k +?nn # parameters: uploc transa transb + 1 # trmm -1 -1 # dimensions: m n ??n? # parameters: side uploa transa diaga diff --git a/testsuite/input.operations.mixed b/testsuite/input.operations.mixed index baab1ea479..447b7dd02e 100644 --- a/testsuite/input.operations.mixed +++ b/testsuite/input.operations.mixed @@ -240,6 +240,10 @@ -1 # dimensions: m ??? # parameters: uploa conja conjx +1 # shmv +-1 # dimensions: m +??? # parameters: uploa conja conjx + 1 # her -1 # dimensions: m ?? # parameters: uploc conjx @@ -248,10 +252,18 @@ -1 # dimensions: m ??? # parameters: uploc conjx conjy +1 # shr2 +-1 # dimensions: m +??? # parameters: uploc conjx conjy + 1 # symv -1 # dimensions: m ??? # parameters: uploa conja conjx +1 # skmv +-1 # dimensions: m +??? # parameters: uploa conja conjx + 1 # syr -1 # dimensions: m ?? # parameters: uploc conjx @@ -260,6 +272,10 @@ -1 # dimensions: m ??? # parameters: uploc conjx conjy +1 # skr2 +-1 # dimensions: m +??? # parameters: uploc conjx conjy + 1 # trmv -1 # dimensions: m ??? # parameters: uploa transa diaga @@ -296,6 +312,10 @@ nn # parameters: transa transb -1 -1 # dimensions: m n ??nn # parameters: side uploa conja transb +1 # shmm +-1 -1 # dimensions: m n +??nn # parameters: side uploa conja transb + 1 # herk -1 -1 # dimensions: m k ?n # parameters: uploc transa @@ -304,10 +324,18 @@ nn # parameters: transa transb -1 -1 # dimensions: m k ?nn # parameters: uploc transa transb +1 # shr2k +-1 -1 # dimensions: m k +?nn # parameters: uploc transa transb + 1 # symm -1 -1 # dimensions: m n ??nn # parameters: side uploa conja transb +1 # skmm +-1 -1 # dimensions: m n +??nn # parameters: side uploa conja transb + 1 # syrk -1 -1 # dimensions: m k ?n # parameters: uploc transa @@ -316,6 +344,10 @@ nn # parameters: transa transb -1 -1 # dimensions: m k ?nn # parameters: uploc transa transb +1 # skr2k +-1 -1 # dimensions: m k +?nn # parameters: uploc transa transb + 1 # trmm -1 -1 # dimensions: m n ??n? # parameters: side uploa transa diaga diff --git a/testsuite/input.operations.salt b/testsuite/input.operations.salt index ecd526aaaf..a78c489990 100644 --- a/testsuite/input.operations.salt +++ b/testsuite/input.operations.salt @@ -240,6 +240,10 @@ -1 # dimensions: m ??? # parameters: uploa conja conjx +1 # shmv +-1 # dimensions: m +??? # parameters: uploa conja conjx + 1 # her -1 # dimensions: m ?? # parameters: uploc conjx @@ -248,10 +252,18 @@ -1 # dimensions: m ??? # parameters: uploc conjx conjy +1 # shr2 +-1 # dimensions: m +??? # parameters: uploc conjx conjy + 1 # symv -1 # dimensions: m ??? # parameters: uploa conja conjx +1 # skmv +-1 # dimensions: m +??? # parameters: uploa conja conjx + 1 # syr -1 # dimensions: m ?? # parameters: uploc conjx @@ -260,6 +272,10 @@ -1 # dimensions: m ??? # parameters: uploc conjx conjy +1 # skr2 +-1 # dimensions: m +??? # parameters: uploc conjx conjy + 1 # trmv -1 # dimensions: m ??? # parameters: uploa transa diaga @@ -296,6 +312,10 @@ nn # parameters: transa transb -1 -1 # dimensions: m n ??nn # parameters: side uploa conja transb +1 # shmm +-1 -1 # dimensions: m n +??nn # parameters: side uploa conja transb + 1 # herk -1 -1 # dimensions: m k ?n # parameters: uploc transa @@ -304,10 +324,18 @@ nn # parameters: transa transb -1 -1 # dimensions: m k ?nn # parameters: uploc transa transb +1 # shr2k +-1 -1 # dimensions: m k +?nn # parameters: uploc transa transb + 1 # symm -1 -1 # dimensions: m n ??nn # parameters: side uploa conja transb +1 # skmm +-1 -1 # dimensions: m n +??nn # parameters: side uploa conja transb + 1 # syrk -1 -1 # dimensions: m k ?n # parameters: uploc transa @@ -316,6 +344,10 @@ nn # parameters: transa transb -1 -1 # dimensions: m k ?nn # parameters: uploc transa transb +1 # skr2k +-1 -1 # dimensions: m k +?nn # parameters: uploc transa transb + 1 # trmm -1 -1 # dimensions: m n ??n? # parameters: side uploa transa diaga diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index ec519de51e..d5f7037268 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -70,7 +70,8 @@ void libblis_test_dotxaxpyf_experiment void libblis_test_dotxaxpyf_impl ( iface_t iface, - obj_t* alpha, + obj_t* alphaw, + obj_t* alphax, obj_t* at, obj_t* a, obj_t* w, @@ -84,7 +85,8 @@ void libblis_test_dotxaxpyf_impl void libblis_test_dotxaxpyf_check ( test_params_t* params, - obj_t* alpha, + obj_t* alphaw, + obj_t* alphax, obj_t* at, obj_t* a, obj_t* w, @@ -177,7 +179,7 @@ void libblis_test_dotxaxpyf_experiment conj_t conjat, conja, conjw, conjx; - obj_t alpha, at, a, w, x, beta, y, z; + obj_t alphaw, alphax, at, a, w, x, beta, y, z; obj_t y_save, z_save; cntx_t* cntx; @@ -206,7 +208,8 @@ void libblis_test_dotxaxpyf_experiment bli_param_map_char_to_blis_conj( pc_str[3], &conjx ); // Create test scalars. - bli_obj_scalar_init_detached( datatype, &alpha ); + bli_obj_scalar_init_detached( datatype, &alphaw ); + bli_obj_scalar_init_detached( datatype, &alphax ); bli_obj_scalar_init_detached( datatype, &beta ); // Create test operands (vectors and/or matrices). @@ -222,12 +225,14 @@ void libblis_test_dotxaxpyf_experiment // Set alpha. if ( bli_obj_is_real( &y ) ) { - bli_setsc( 1.2, 0.0, &alpha ); + bli_setsc( 1.2, 0.0, &alphaw ); + bli_setsc( 0.8, 0.0, &alphax ); bli_setsc( -1.0, 0.0, &beta ); } else { - bli_setsc( 1.2, 0.1, &alpha ); + bli_setsc( 1.2, 0.1, &alphaw ); + bli_setsc( 0.8, 0.1, &alphax ); bli_setsc( -1.0, -0.1, &beta ); } @@ -260,7 +265,7 @@ void libblis_test_dotxaxpyf_experiment time = bli_clock(); libblis_test_dotxaxpyf_impl( iface, - &alpha, &at, &a, &w, &x, &beta, &y, &z, + &alphaw, &alphax, &at, &a, &w, &x, &beta, &y, &z, cntx ); time_min = bli_clock_min_diff( time_min, time ); @@ -271,7 +276,7 @@ void libblis_test_dotxaxpyf_experiment if ( bli_obj_is_complex( &y ) ) *perf *= 4.0; // Perform checks. - libblis_test_dotxaxpyf_check( params, &alpha, &at, &a, &w, &x, &beta, &y, &z, &y_save, &z_save, resid ); + libblis_test_dotxaxpyf_check( params, &alphaw, &alphax, &at, &a, &w, &x, &beta, &y, &z, &y_save, &z_save, resid ); // Zero out performance and residual if either output vector is empty. libblis_test_check_empty_problem( &y, perf, resid ); @@ -292,7 +297,8 @@ void libblis_test_dotxaxpyf_experiment void libblis_test_dotxaxpyf_impl ( iface_t iface, - obj_t* alpha, + obj_t* alphaw, + obj_t* alphax, obj_t* at, obj_t* a, obj_t* w, @@ -306,7 +312,7 @@ void libblis_test_dotxaxpyf_impl switch ( iface ) { case BLIS_TEST_SEQ_FRONT_END: - bli_dotxaxpyf_ex( alpha, at, a, w, x, beta, y, z, cntx, NULL ); + bli_dotxaxpyf_ex( alphaw, alphax, at, a, w, x, beta, y, z, cntx, NULL ); break; default: @@ -319,7 +325,8 @@ void libblis_test_dotxaxpyf_impl void libblis_test_dotxaxpyf_check ( test_params_t* params, - obj_t* alpha, + obj_t* alphaw, + obj_t* alphax, obj_t* at, obj_t* a, obj_t* w, @@ -356,13 +363,13 @@ void libblis_test_dotxaxpyf_check // - z is randomized. // - at is an alias to a. // Note: - // - alpha and beta should have a non-zero imaginary component in the + // - alphaw, alphax, and beta should have a non-zero imaginary component in the // complex cases in order to more fully exercise the implementation. // // Under these conditions, we assume that the implementation for // - // y := beta * y_orig + alpha * conjat(A^T) * conjw(w) - // z := z_orig + alpha * conja(A) * conjx(x) + // y := beta * y_orig + alphaw * conjat(A^T) * conjw(w) + // z := z_orig + alphax * conja(A) * conjx(x) // // is functioning correctly if // @@ -391,7 +398,7 @@ void libblis_test_dotxaxpyf_check bli_acquire_mpart_l2r( BLIS_SUBPART1, i, 1, at, &a1 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, i, 1, &v, &psi1 ); - bli_dotxv( alpha, &a1, w, beta, &psi1 ); + bli_dotxv( alphaw, &a1, w, beta, &psi1 ); } // q := q + alpha * conja(a) * conjx(x) @@ -401,7 +408,7 @@ void libblis_test_dotxaxpyf_check bli_acquire_vpart_f2b( BLIS_SUBPART1, i, 1, x, &chi1 ); bli_copysc( &chi1, &alpha_chi1 ); - bli_mulsc( alpha, &alpha_chi1 ); + bli_mulsc( alphax, &alpha_chi1 ); bli_axpyv( &alpha_chi1, &a1, &q ); } diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index aed0cd8178..3d629964a3 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -331,11 +331,15 @@ void libblis_test_level2_ops( thread_data_t* tdata, test_params_t* params, test_ libblis_test_gemv( tdata, params, &(ops->gemv) ); libblis_test_ger( tdata, params, &(ops->ger) ); libblis_test_hemv( tdata, params, &(ops->hemv) ); + libblis_test_shmv( tdata, params, &(ops->shmv) ); libblis_test_her( tdata, params, &(ops->her) ); libblis_test_her2( tdata, params, &(ops->her2) ); + libblis_test_shr2( tdata, params, &(ops->shr2) ); libblis_test_symv( tdata, params, &(ops->symv) ); + libblis_test_skmv( tdata, params, &(ops->skmv) ); libblis_test_syr( tdata, params, &(ops->syr) ); libblis_test_syr2( tdata, params, &(ops->syr2) ); + libblis_test_skr2( tdata, params, &(ops->skr2) ); libblis_test_trmv( tdata, params, &(ops->trmv) ); libblis_test_trsv( tdata, params, &(ops->trsv) ); } @@ -356,11 +360,15 @@ void libblis_test_level3_ops( thread_data_t* tdata, test_params_t* params, test_ libblis_test_gemm( tdata, params, &(ops->gemm) ); libblis_test_gemmt( tdata, params, &(ops->gemmt) ); libblis_test_hemm( tdata, params, &(ops->hemm) ); + libblis_test_shmm( tdata, params, &(ops->shmm) ); libblis_test_herk( tdata, params, &(ops->herk) ); libblis_test_her2k( tdata, params, &(ops->her2k) ); + libblis_test_shr2k( tdata, params, &(ops->shr2k) ); libblis_test_symm( tdata, params, &(ops->symm) ); + libblis_test_skmm( tdata, params, &(ops->skmm) ); libblis_test_syrk( tdata, params, &(ops->syrk) ); libblis_test_syr2k( tdata, params, &(ops->syr2k) ); + libblis_test_skr2k( tdata, params, &(ops->skr2k) ); libblis_test_trmm( tdata, params, &(ops->trmm) ); libblis_test_trmm3( tdata, params, &(ops->trmm3) ); libblis_test_trsm( tdata, params, &(ops->trsm) ); @@ -436,11 +444,15 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops ) libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 2, &(ops->gemv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 2, &(ops->ger) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 3, &(ops->hemv) ); + libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 3, &(ops->shmv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 2, &(ops->her) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 3, &(ops->her2) ); + libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 3, &(ops->shr2) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 3, &(ops->symv) ); + libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 3, &(ops->skmv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 2, &(ops->syr) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 3, &(ops->syr2) ); + libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 3, &(ops->skr2) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 3, &(ops->trmv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 3, &(ops->trsv) ); @@ -453,11 +465,15 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops ) libblis_test_read_op_info( ops, input_stream, BLIS_GEMM, BLIS_TEST_DIMS_MNK, 2, &(ops->gemm) ); libblis_test_read_op_info( ops, input_stream, BLIS_GEMMT, BLIS_TEST_DIMS_MK, 3, &(ops->gemmt) ); libblis_test_read_op_info( ops, input_stream, BLIS_HEMM, BLIS_TEST_DIMS_MN, 4, &(ops->hemm) ); + libblis_test_read_op_info( ops, input_stream, BLIS_SHMM, BLIS_TEST_DIMS_MN, 4, &(ops->shmm) ); libblis_test_read_op_info( ops, input_stream, BLIS_HERK, BLIS_TEST_DIMS_MK, 2, &(ops->herk) ); libblis_test_read_op_info( ops, input_stream, BLIS_HER2K, BLIS_TEST_DIMS_MK, 3, &(ops->her2k) ); + libblis_test_read_op_info( ops, input_stream, BLIS_SHR2K, BLIS_TEST_DIMS_MK, 3, &(ops->shr2k) ); libblis_test_read_op_info( ops, input_stream, BLIS_SYMM, BLIS_TEST_DIMS_MN, 4, &(ops->symm) ); + libblis_test_read_op_info( ops, input_stream, BLIS_SKMM, BLIS_TEST_DIMS_MN, 4, &(ops->skmm) ); libblis_test_read_op_info( ops, input_stream, BLIS_SYRK, BLIS_TEST_DIMS_MK, 2, &(ops->syrk) ); libblis_test_read_op_info( ops, input_stream, BLIS_SYR2K, BLIS_TEST_DIMS_MK, 3, &(ops->syr2k) ); + libblis_test_read_op_info( ops, input_stream, BLIS_SKR2K, BLIS_TEST_DIMS_MK, 3, &(ops->skr2k) ); libblis_test_read_op_info( ops, input_stream, BLIS_TRMM, BLIS_TEST_DIMS_MN, 4, &(ops->trmm) ); libblis_test_read_op_info( ops, input_stream, BLIS_TRMM3, BLIS_TEST_DIMS_MN, 5, &(ops->trmm3) ); libblis_test_read_op_info( ops, input_stream, BLIS_TRSM, BLIS_TEST_DIMS_MN, 4, &(ops->trsm) ); @@ -1032,6 +1048,26 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_info_get_syr2k_impl_string( BLIS_DOUBLE ), bli_info_get_syr2k_impl_string( BLIS_SCOMPLEX ), bli_info_get_syr2k_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " shmm %7s %7s %7s %7s\n", + bli_info_get_shmm_impl_string( BLIS_FLOAT ), + bli_info_get_shmm_impl_string( BLIS_DOUBLE ), + bli_info_get_shmm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_shmm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " shr2k %7s %7s %7s %7s\n", + bli_info_get_shr2k_impl_string( BLIS_FLOAT ), + bli_info_get_shr2k_impl_string( BLIS_DOUBLE ), + bli_info_get_shr2k_impl_string( BLIS_SCOMPLEX ), + bli_info_get_shr2k_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " skmm %7s %7s %7s %7s\n", + bli_info_get_skmm_impl_string( BLIS_FLOAT ), + bli_info_get_skmm_impl_string( BLIS_DOUBLE ), + bli_info_get_skmm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_skmm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " skr2k %7s %7s %7s %7s\n", + bli_info_get_skr2k_impl_string( BLIS_FLOAT ), + bli_info_get_skr2k_impl_string( BLIS_DOUBLE ), + bli_info_get_skr2k_impl_string( BLIS_SCOMPLEX ), + bli_info_get_skr2k_impl_string( BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, " trmm %7s %7s %7s %7s\n", bli_info_get_trmm_impl_string( BLIS_FLOAT ), bli_info_get_trmm_impl_string( BLIS_DOUBLE ), diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 7c1e52805d..b611f0a8f1 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -260,11 +260,15 @@ typedef struct test_ops_s test_op_t gemv; test_op_t ger; test_op_t hemv; + test_op_t shmv; test_op_t her; test_op_t her2; + test_op_t shr2; test_op_t symv; + test_op_t skmv; test_op_t syr; test_op_t syr2; + test_op_t skr2; test_op_t trmv; test_op_t trsv; @@ -277,11 +281,15 @@ typedef struct test_ops_s test_op_t gemm; test_op_t gemmt; test_op_t hemm; + test_op_t shmm; test_op_t herk; test_op_t her2k; + test_op_t shr2k; test_op_t symm; + test_op_t skmm; test_op_t syrk; test_op_t syr2k; + test_op_t skr2k; test_op_t trmm; test_op_t trmm3; test_op_t trsm; @@ -538,11 +546,15 @@ char libblis_test_proj_dtchar_to_precchar( char dt_char ); #include "test_gemv.h" #include "test_ger.h" #include "test_hemv.h" +#include "test_shmv.h" #include "test_her.h" #include "test_her2.h" +#include "test_shr2.h" #include "test_symv.h" +#include "test_skmv.h" #include "test_syr.h" #include "test_syr2.h" +#include "test_skr2.h" #include "test_trmv.h" #include "test_trsv.h" @@ -555,11 +567,15 @@ char libblis_test_proj_dtchar_to_precchar( char dt_char ); #include "test_gemm.h" #include "test_gemmt.h" #include "test_hemm.h" +#include "test_shmm.h" #include "test_herk.h" #include "test_her2k.h" +#include "test_shr2k.h" #include "test_symm.h" +#include "test_skmm.h" #include "test_syrk.h" #include "test_syr2k.h" +#include "test_skr2k.h" #include "test_trmm.h" #include "test_trmm3.h" #include "test_trsm.h" diff --git a/testsuite/src/test_shmm.c b/testsuite/src/test_shmm.c new file mode 100644 index 0000000000..4ec0436aa1 --- /dev/null +++ b/testsuite/src/test_shmm.c @@ -0,0 +1,398 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "shmm"; +static char* o_types = "mmm"; // a b c +static char* p_types = "such"; // side uploa conja transb +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. +void libblis_test_shmm_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_shmm_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_shmm_impl + ( + iface_t iface, + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +void libblis_test_shmm_check + ( + test_params_t* params, + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + obj_t* c_orig, + double* resid + ); + + + +void libblis_test_shmm_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( tdata, params, &(op->ops->randv) ); + libblis_test_randm( tdata, params, &(op->ops->randm) ); + libblis_test_setv( tdata, params, &(op->ops->setv) ); + libblis_test_normfv( tdata, params, &(op->ops->normfv) ); + libblis_test_subv( tdata, params, &(op->ops->subv) ); + libblis_test_scalv( tdata, params, &(op->ops->scalv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); + libblis_test_scalm( tdata, params, &(op->ops->scalm) ); + libblis_test_gemv( tdata, params, &(op->ops->gemv) ); + libblis_test_shmv( tdata, params, &(op->ops->shmv) ); +} + + + +void libblis_test_shmm + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l3_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_shmm_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested. + //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_shmm_experiment ); + } +} + + + +void libblis_test_shmm_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m, n; + dim_t mn_side; + + side_t side; + uplo_t uploa; + conj_t conja; + trans_t transb; + + obj_t alpha, a, b, beta, c; + obj_t c_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to actual dimensions. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_side( pc_str[0], &side ); + bli_param_map_char_to_blis_uplo( pc_str[1], &uploa ); + bli_param_map_char_to_blis_conj( pc_str[2], &conja ); + bli_param_map_char_to_blis_trans( pc_str[3], &transb ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &alpha ); + bli_obj_scalar_init_detached( datatype, &beta ); + + // Create test operands (vectors and/or matrices). + bli_set_dim_with_side( side, m, n, &mn_side ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[1], mn_side, mn_side, &a ); + libblis_test_mobj_create( params, datatype, transb, + sc_str[2], m, n, &b ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, n, &c ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, n, &c_save ); + + // Set alpha and beta. + if ( bli_obj_is_real( &c ) ) + { + bli_setsc( 1.2, 0.0, &alpha ); + bli_setsc( -1.0, 0.0, &beta ); + } + else + { + bli_setsc( 1.2, 0.8, &alpha ); + bli_setsc( -1.0, 1.0, &beta ); + } + + // Set the structure and uplo properties of A. + bli_obj_set_struc( BLIS_SKEW_HERMITIAN, &a ); + bli_obj_set_uplo( uploa, &a ); + + // Randomize A, make it densely skew-Hermitian, and zero the unstored triangle + // to ensure the implementation reads only from the stored region. + libblis_test_mobj_randomize( params, TRUE, &a ); + bli_mkskewherm( &a ); + bli_mktrim( &a ); + + // Randomize B and C, and save C. + libblis_test_mobj_randomize( params, TRUE, &b ); + libblis_test_mobj_randomize( params, TRUE, &c ); + bli_copym( &c, &c_save ); + + // Apply the remaining parameters. + bli_obj_set_conj( conja, &a ); + bli_obj_set_conjtrans( transb, &b ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &c_save, &c ); + + time = bli_clock(); + + libblis_test_shmm_impl( iface, side, &alpha, &a, &b, &beta, &c ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 2.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &c ) ) *perf *= 4.0; + + // Perform checks. + libblis_test_shmm_check( params, side, &alpha, &a, &b, &beta, &c, &c_save, resid ); + + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + + // Free the test objects. + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); +} + + + +void libblis_test_shmm_impl + ( + iface_t iface, + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_shmm( side, alpha, a, b, beta, c ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_shmm_check + ( + test_params_t* params, + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + obj_t* c_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( c ); + num_t dt_real = bli_obj_dt_proj_to_real( c ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + + obj_t norm; + obj_t t, v, w, z; + + double junk; + + // + // Pre-conditions: + // - a is randomized and skew-Hermitian. + // - b is randomized. + // - c_orig is randomized. + // Note: + // - alpha and beta should have non-zero imaginary components in the + // complex cases in order to more fully exercise the implementation. + // + // Under these conditions, we assume that the implementation for + // + // C := beta * C_orig + alpha * conja(A) * transb(B) (side = left) + // C := beta * C_orig + alpha * transb(B) * conja(A) (side = right) + // + // is functioning correctly if + // + // normfv( v - z ) + // + // is negligible, where + // + // v = C * t + // + // z = ( beta * C_orig + alpha * conja(A) * transb(B) ) * t (side = left) + // = beta * C_orig * t + alpha * conja(A) * transb(B) * t + // = beta * C_orig * t + alpha * conja(A) * w + // = beta * C_orig * t + z + // + // z = ( beta * C_orig + alpha * transb(B) * conja(A) ) * t (side = right) + // = beta * C_orig * t + alpha * transb(B) * conja(A) * t + // = beta * C_orig * t + alpha * transb(B) * w + // = beta * C_orig * t + z + + bli_obj_scalar_init_detached( dt_real, &norm ); + + if ( bli_is_left( side ) ) + { + bli_obj_create( dt, n, 1, 0, 0, &t ); + bli_obj_create( dt, m, 1, 0, 0, &v ); + bli_obj_create( dt, m, 1, 0, 0, &w ); + bli_obj_create( dt, m, 1, 0, 0, &z ); + } + else // else if ( bli_is_right( side ) ) + { + bli_obj_create( dt, n, 1, 0, 0, &t ); + bli_obj_create( dt, m, 1, 0, 0, &v ); + bli_obj_create( dt, n, 1, 0, 0, &w ); + bli_obj_create( dt, m, 1, 0, 0, &z ); + } + + libblis_test_vobj_randomize( params, TRUE, &t ); + + bli_gemv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v ); + + if ( bli_is_left( side ) ) + { + bli_gemv( &BLIS_ONE, b, &t, &BLIS_ZERO, &w ); + bli_shmv( alpha, a, &w, &BLIS_ZERO, &z ); + } + else // else if ( bli_is_right( side ) ) + { + bli_shmv( &BLIS_ONE, a, &t, &BLIS_ZERO, &w ); + bli_gemv( alpha, b, &w, &BLIS_ZERO, &z ); + } + + bli_gemv( beta, c_orig, &t, &BLIS_ONE, &z ); + + bli_subv( &z, &v ); + bli_normfv( &v, &norm ); + bli_getsc( &norm, resid, &junk ); + + bli_obj_free( &t ); + bli_obj_free( &v ); + bli_obj_free( &w ); + bli_obj_free( &z ); +} + diff --git a/testsuite/src/test_shmm.h b/testsuite/src/test_shmm.h new file mode 100644 index 0000000000..57a75bcb1c --- /dev/null +++ b/testsuite/src/test_shmm.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_shmm + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_shmv.c b/testsuite/src/test_shmv.c new file mode 100644 index 0000000000..5da44db0ac --- /dev/null +++ b/testsuite/src/test_shmv.c @@ -0,0 +1,351 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "shmv"; +static char* o_types = "mvv"; // a x y +static char* p_types = "ucc"; // uploa conja conjx +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. +void libblis_test_shmv_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_shmv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_shmv_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y + ); + +void libblis_test_shmv_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y, + obj_t* y_orig, + double* resid + ); + + + +void libblis_test_shmv_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( tdata, params, &(op->ops->randv) ); + libblis_test_randm( tdata, params, &(op->ops->randm) ); + libblis_test_normfv( tdata, params, &(op->ops->normfv) ); + libblis_test_subv( tdata, params, &(op->ops->subv) ); + libblis_test_copyv( tdata, params, &(op->ops->copyv) ); + libblis_test_scalv( tdata, params, &(op->ops->scalv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); + libblis_test_gemv( tdata, params, &(op->ops->gemv) ); +} + + + +void libblis_test_shmv + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l2_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_shmv_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested. + //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_shmv_experiment ); + } +} + + + +void libblis_test_shmv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m; + + uplo_t uploa; + conj_t conja; + conj_t conjx; + + obj_t alpha, a, x, beta, y; + obj_t y_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to an actual dimension. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_uplo( pc_str[0], &uploa ); + bli_param_map_char_to_blis_conj( pc_str[1], &conja ); + bli_param_map_char_to_blis_conj( pc_str[2], &conjx ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &alpha ); + bli_obj_scalar_init_detached( datatype, &beta ); + + // Create test operands (vectors and/or matrices). + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, m, &a ); + libblis_test_vobj_create( params, datatype, + sc_str[1], m, &x ); + libblis_test_vobj_create( params, datatype, + sc_str[2], m, &y ); + libblis_test_vobj_create( params, datatype, + sc_str[2], m, &y_save ); + + // Set alpha and beta. + if ( bli_obj_is_real( &y ) ) + { + bli_setsc( 1.0, 0.0, &alpha ); + bli_setsc( -1.0, 0.0, &beta ); + } + else + { + bli_setsc( 0.5, 0.5, &alpha ); + bli_setsc( -0.5, 0.5, &beta ); + } + + // Set the structure and uplo properties of A. + bli_obj_set_struc( BLIS_SKEW_HERMITIAN, &a ); + bli_obj_set_uplo( uploa, &a ); + + // Randomize A, make it densely Hermitian, and zero the unstored triangle + // to ensure the implementation reads only from the stored region. + libblis_test_mobj_randomize( params, TRUE, &a ); + bli_mkskewherm( &a ); + bli_mktrim( &a ); + + // Randomize x and y, and save y. + libblis_test_vobj_randomize( params, TRUE, &x ); + libblis_test_vobj_randomize( params, TRUE, &y ); + bli_copyv( &y, &y_save ); + + // Apply the remaining parameters. + bli_obj_set_conj( conja, &a ); + bli_obj_set_conj( conjx, &x ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &y_save, &y ); + + time = bli_clock(); + + libblis_test_shmv_impl( iface, &alpha, &a, &x, &beta, &y ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &y ) ) *perf *= 4.0; + + // Perform checks. + libblis_test_shmv_check( params, &alpha, &a, &x, &beta, &y, &y_save, resid ); + + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + + // Free the test objects. + bli_obj_free( &a ); + bli_obj_free( &x ); + bli_obj_free( &y ); + bli_obj_free( &y_save ); +} + + + +void libblis_test_shmv_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_shmv( alpha, a, x, beta, y ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_shmv_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y, + obj_t* y_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( y ); + num_t dt_real = bli_obj_dt_proj_to_real( y ); + + dim_t m = bli_obj_vector_dim( y ); + + obj_t v; + obj_t norm; + + double junk; + + // + // Pre-conditions: + // - a is randomized and skew-Hermitian. + // - x is randomized. + // - y_orig is randomized. + // Note: + // - alpha and beta should have non-zero imaginary components in the + // complex cases in order to more fully exercise the implementation. + // + // Under these conditions, we assume that the implementation for + // + // y := beta * y_orig + alpha * conja(A) * conjx(x) + // + // is functioning correctly if + // + // normfv( y - v ) + // + // is negligible, where + // + // v = beta * y_orig + alpha * conja(A_dense) * x + // + + bli_obj_scalar_init_detached( dt_real, &norm ); + + bli_obj_create( dt, m, 1, 0, 0, &v ); + + bli_copyv( y_orig, &v ); + + bli_mkskewherm( a ); + bli_obj_set_struc( BLIS_GENERAL, a ); + bli_obj_set_uplo( BLIS_DENSE, a ); + + bli_gemv( alpha, a, x, beta, &v ); + + bli_subv( &v, y ); + bli_normfv( y, &norm ); + bli_getsc( &norm, resid, &junk ); + + bli_obj_free( &v ); +} + diff --git a/testsuite/src/test_shmv.h b/testsuite/src/test_shmv.h new file mode 100644 index 0000000000..fbe2cf4f6e --- /dev/null +++ b/testsuite/src/test_shmv.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_shmv + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_shr2.c b/testsuite/src/test_shr2.c new file mode 100644 index 0000000000..d15b12423e --- /dev/null +++ b/testsuite/src/test_shr2.c @@ -0,0 +1,372 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "shr2"; +static char* o_types = "vvm"; // x y a +static char* p_types = "ucc"; // uploa conjx conjy +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. +void libblis_test_shr2_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_shr2_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_shr2_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* a + ); + +void libblis_test_shr2_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* a, + obj_t* a_orig, + double* resid + ); + + + +void libblis_test_shr2_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( tdata, params, &(op->ops->randv) ); + libblis_test_randm( tdata, params, &(op->ops->randm) ); + libblis_test_normfv( tdata, params, &(op->ops->normfv) ); + libblis_test_subv( tdata, params, &(op->ops->subv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); + libblis_test_scal2v( tdata, params, &(op->ops->scal2v) ); + libblis_test_dotv( tdata, params, &(op->ops->dotv) ); + libblis_test_gemv( tdata, params, &(op->ops->gemv) ); +} + + + +void libblis_test_shr2 + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l2_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_shr2_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested. + //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_shr2_experiment ); + } +} + + + +void libblis_test_shr2_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m; + + uplo_t uploa; + conj_t conjx, conjy; + + obj_t alpha, x, y, a; + obj_t a_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to an actual dimension. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_uplo( pc_str[0], &uploa ); + bli_param_map_char_to_blis_conj( pc_str[1], &conjx ); + bli_param_map_char_to_blis_conj( pc_str[2], &conjy ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &alpha ); + + // Create test operands (vectors and/or matrices). + libblis_test_vobj_create( params, datatype, + sc_str[0], m, &x ); + libblis_test_vobj_create( params, datatype, + sc_str[1], m, &y ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[2], m, m, &a ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[2], m, m, &a_save ); + + // Set alpha. + //bli_copysc( &BLIS_MINUS_ONE, &alpha ); + bli_setsc( -1.0, 1.0, &alpha ); + + // Randomize x and y. + libblis_test_vobj_randomize( params, TRUE, &x ); + libblis_test_vobj_randomize( params, TRUE, &y ); + + // Set the structure and uplo properties of A. + bli_obj_set_struc( BLIS_SKEW_HERMITIAN, &a ); + bli_obj_set_uplo( uploa, &a ); + + // Randomize A, make it densely Hermitian, and zero the unstored triangle + // to ensure the implementation is reads only from the stored region. + libblis_test_mobj_randomize( params, TRUE, &a ); + bli_mkskewherm( &a ); + bli_mktrim( &a ); + + // Save A and set its structure and uplo properties. + bli_obj_set_struc( BLIS_SKEW_HERMITIAN, &a_save ); + bli_obj_set_uplo( uploa, &a_save ); + bli_copym( &a, &a_save ); + + // Apply the remaining parameters. + bli_obj_set_conj( conjx, &x ); + bli_obj_set_conj( conjy, &y ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &a_save, &a ); + + time = bli_clock(); + + libblis_test_shr2_impl( iface, &alpha, &x, &y, &a ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 2.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &a ) ) *perf *= 4.0; + + // Perform checks. + libblis_test_shr2_check( params, &alpha, &x, &y, &a, &a_save, resid ); + + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &a, perf, resid ); + + // Free the test objects. + bli_obj_free( &x ); + bli_obj_free( &y ); + bli_obj_free( &a ); + bli_obj_free( &a_save ); +} + + + +void libblis_test_shr2_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* a + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_shr2( alpha, x, y, a ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_shr2_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* a, + obj_t* a_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( a ); + num_t dt_real = bli_obj_dt_proj_to_real( a ); + + dim_t m_a = bli_obj_length( a ); + + obj_t xh, yh, alphac; + obj_t t, v, w1, w2; + obj_t rho, norm; + + double junk; + + // + // Pre-conditions: + // - x is randomized. + // - y is randomized. + // - a is randomized and skew-Hermitian. + // + // Under these conditions, we assume that the implementation for + // + // A := A_orig + alpha * conjx(x) * conjy(y)^H - conj(alpha) * conjy(y) * conjx(x)^H + // + // is functioning correctly if + // + // normfv( v - w ) + // + // is negligible, where + // + // v = A * t + // w = ( A_orig + alpha * conjx(x) * conjy(y)^H - conj(alpha) * conjy(y) * conjx(x)^H ) * t + // = A_orig * t + alpha * conjx(x) * conjy(y)^H * t - conj(alpha) * conjy(y) * conjx(x)^H * t + // = A_orig * t + alpha * conjx(x) * conjy(y)^H * t - conj(alpha) * conjy(y) * rho + // = A_orig * t + alpha * conjx(x) * conjy(y)^H * t - w1 + // = A_orig * t + alpha * conjx(x) * rho - w1 + // = A_orig * t + w2 - w1 + // + + bli_mkskewherm( a ); + bli_mkskewherm( a_orig ); + bli_obj_set_struc( BLIS_GENERAL, a ); + bli_obj_set_struc( BLIS_GENERAL, a_orig ); + bli_obj_set_uplo( BLIS_DENSE, a ); + bli_obj_set_uplo( BLIS_DENSE, a_orig ); + + bli_obj_scalar_init_detached( dt, &rho ); + bli_obj_scalar_init_detached( dt, &alphac ); + bli_obj_scalar_init_detached( dt_real, &norm ); + + bli_obj_create( dt, m_a, 1, 0, 0, &t ); + bli_obj_create( dt, m_a, 1, 0, 0, &v ); + bli_obj_create( dt, m_a, 1, 0, 0, &w1 ); + bli_obj_create( dt, m_a, 1, 0, 0, &w2 ); + + bli_obj_alias_with_conj( BLIS_CONJUGATE, x, &xh ); + bli_obj_alias_with_conj( BLIS_CONJUGATE, y, &yh ); + bli_obj_alias_with_conj( BLIS_CONJUGATE, alpha, &alphac ); + + libblis_test_vobj_randomize( params, TRUE, &t ); + + bli_gemv( &BLIS_ONE, a, &t, &BLIS_ZERO, &v ); + + bli_dotv( &xh, &t, &rho ); + bli_mulsc( &alphac, &rho ); + bli_scal2v( &rho, y, &w1 ); + + bli_dotv( &yh, &t, &rho ); + bli_mulsc( alpha, &rho ); + bli_scal2v( &rho, x, &w2 ); + + bli_xpbyv( &w2, &BLIS_MINUS_ONE, &w1 ); + + bli_gemv( &BLIS_ONE, a_orig, &t, &BLIS_ONE, &w1 ); + + bli_subv( &w1, &v ); + bli_normfv( &v, &norm ); + bli_getsc( &norm, resid, &junk ); + + bli_obj_free( &t ); + bli_obj_free( &v ); + bli_obj_free( &w1 ); + bli_obj_free( &w2 ); +} + diff --git a/testsuite/src/test_shr2.h b/testsuite/src/test_shr2.h new file mode 100644 index 0000000000..2cef423927 --- /dev/null +++ b/testsuite/src/test_shr2.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_shr2 + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_shr2k.c b/testsuite/src/test_shr2k.c new file mode 100644 index 0000000000..e0a56c2697 --- /dev/null +++ b/testsuite/src/test_shr2k.c @@ -0,0 +1,382 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "shr2k"; +static char* o_types = "mmm"; // a b c +static char* p_types = "uhh"; // uploc transa transb +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. +void libblis_test_shr2k_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_shr2k_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_shr2k_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +void libblis_test_shr2k_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + obj_t* c_orig, + double* resid + ); + + + +void libblis_test_shr2k_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( tdata, params, &(op->ops->randv) ); + libblis_test_randm( tdata, params, &(op->ops->randm) ); + libblis_test_setv( tdata, params, &(op->ops->setv) ); + libblis_test_normfv( tdata, params, &(op->ops->normfv) ); + libblis_test_subv( tdata, params, &(op->ops->subv) ); + libblis_test_scalv( tdata, params, &(op->ops->scalv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); + libblis_test_scalm( tdata, params, &(op->ops->scalm) ); + libblis_test_gemv( tdata, params, &(op->ops->gemv) ); + libblis_test_shmv( tdata, params, &(op->ops->shmv) ); +} + + + +void libblis_test_shr2k + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l3_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_shr2k_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested. + //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_shr2k_experiment ); + } +} + + + +void libblis_test_shr2k_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m, k; + + uplo_t uploc; + trans_t transa, transb; + + obj_t alpha, a, b, beta, c; + obj_t c_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to actual dimensions. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_uplo( pc_str[0], &uploc ); + bli_param_map_char_to_blis_trans( pc_str[1], &transa ); + bli_param_map_char_to_blis_trans( pc_str[2], &transb ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &alpha ); + bli_obj_scalar_init_detached( datatype, &beta ); + + // Create test operands (vectors and/or matrices). + libblis_test_mobj_create( params, datatype, transa, + sc_str[1], m, k, &a ); + libblis_test_mobj_create( params, datatype, transb, + sc_str[2], m, k, &b ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, m, &c ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, m, &c_save ); + + // Set alpha and beta. + if ( bli_obj_is_real( &c ) ) + { + bli_setsc( 0.8, 0.0, &alpha ); + bli_setsc( -1.0, 0.0, &beta ); + } + else + { + // For shr2k, alpha may be complex, but beta must be real-valued + // (in order to preserve the Hermitian structure of C). + bli_setsc( 0.8, 0.5, &alpha ); + bli_setsc( -1.0, 0.0, &beta ); + } + + // Randomize A and B. + libblis_test_mobj_randomize( params, TRUE, &a ); + libblis_test_mobj_randomize( params, TRUE, &b ); + + // Set the structure and uplo properties of C. + bli_obj_set_struc( BLIS_SKEW_HERMITIAN, &c ); + bli_obj_set_uplo( uploc, &c ); + + // Randomize A, make it densely Hermitian, and zero the unstored triangle + // to ensure the implementation is reads only from the stored region. + libblis_test_mobj_randomize( params, TRUE, &c ); + bli_mkskewherm( &c ); + bli_mktrim( &c ); + + // Save C and set its structure and uplo properties. + bli_obj_set_struc( BLIS_SKEW_HERMITIAN, &c_save ); + bli_obj_set_uplo( uploc, &c_save ); + bli_copym( &c, &c_save ); + + // Apply the remaining parameters. + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_conjtrans( transb, &b ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &c_save, &c ); + + time = bli_clock(); + + libblis_test_shr2k_impl( iface, &alpha, &a, &b, &beta, &c ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 2.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &c ) ) *perf *= 4.0; + + // Perform checks. + libblis_test_shr2k_check( params, &alpha, &a, &b, &beta, &c, &c_save, resid ); + + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + + // Free the test objects. + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); +} + + + +void libblis_test_shr2k_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_shr2k( alpha, a, b, beta, c ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_shr2k_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + obj_t* c_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( c ); + num_t dt_real = bli_obj_dt_proj_to_real( c ); + + dim_t m = bli_obj_length( c ); + dim_t k = bli_obj_width_after_trans( a ); + + obj_t alphac, ah, bh; + obj_t norm; + obj_t t, v, w1, w2, z; + + double junk; + + // + // Pre-conditions: + // - a is randomized. + // - b is randomized. + // - c_orig is randomized and Hermitian. + // Note: + // - alpha should have a non-zero imaginary component in the + // complex cases in order to more fully exercise the implementation. + // - beta must be real-valued. + // + // Under these conditions, we assume that the implementation for + // + // C := beta * C_orig + alpha * transa(A) * transb(B)^H + conj(alpha) * transb(B) * transa(A)^H + // + // is functioning correctly if + // + // normfv( v - z ) + // + // is negligible, where + // + // v = C * t + // z = ( beta * C_orig + alpha * transa(A) * transb(B)^H - conj(alpha) * transb(B) * transa(A)^H ) * t + // = beta * C_orig * t + alpha * transa(A) * transb(B)^H * t - conj(alpha) * transb(B) * transa(A)^H * t + // = beta * C_orig * t + alpha * transa(A) * transb(B)^H * t - conj(alpha) * transb(B) * w2 + // = beta * C_orig * t + alpha * transa(A) * w1 - conj(alpha) * transb(B) * w2 + // = beta * C_orig * t + alpha * transa(A) * w1 - z + // = beta * C_orig * t + z + // + + bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah ); + bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, b, &bh ); + + bli_obj_scalar_init_detached( dt_real, &norm ); + bli_obj_scalar_init_detached_copy_of( dt, BLIS_CONJUGATE, alpha, &alphac ); + + bli_obj_create( dt, m, 1, 0, 0, &t ); + bli_obj_create( dt, m, 1, 0, 0, &v ); + bli_obj_create( dt, k, 1, 0, 0, &w1 ); + bli_obj_create( dt, k, 1, 0, 0, &w2 ); + bli_obj_create( dt, m, 1, 0, 0, &z ); + + libblis_test_vobj_randomize( params, TRUE, &t ); + + bli_shmv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v ); + + bli_gemv( &BLIS_ONE, &ah, &t, &BLIS_ZERO, &w2 ); + bli_gemv( &BLIS_ONE, &bh, &t, &BLIS_ZERO, &w1 ); + bli_gemv( &alphac, b, &w2, &BLIS_ZERO, &z ); + bli_gemv( alpha, a, &w1, &BLIS_MINUS_ONE, &z ); + bli_shmv( beta, c_orig, &t, &BLIS_ONE, &z ); + + bli_subv( &z, &v ); + bli_normfv( &v, &norm ); + bli_getsc( &norm, resid, &junk ); + + bli_obj_free( &t ); + bli_obj_free( &v ); + bli_obj_free( &w1 ); + bli_obj_free( &w2 ); + bli_obj_free( &z ); +} + diff --git a/testsuite/src/test_shr2k.h b/testsuite/src/test_shr2k.h new file mode 100644 index 0000000000..61f943f569 --- /dev/null +++ b/testsuite/src/test_shr2k.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_shr2k + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_skmm.c b/testsuite/src/test_skmm.c new file mode 100644 index 0000000000..c411b874ab --- /dev/null +++ b/testsuite/src/test_skmm.c @@ -0,0 +1,398 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "skmm"; +static char* o_types = "mmm"; // a b c +static char* p_types = "such"; // side uploa conja transb +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. +void libblis_test_skmm_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_skmm_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_skmm_impl + ( + iface_t iface, + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +void libblis_test_skmm_check + ( + test_params_t* params, + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + obj_t* c_orig, + double* resid + ); + + + +void libblis_test_skmm_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( tdata, params, &(op->ops->randv) ); + libblis_test_randm( tdata, params, &(op->ops->randm) ); + libblis_test_setv( tdata, params, &(op->ops->setv) ); + libblis_test_normfv( tdata, params, &(op->ops->normfv) ); + libblis_test_subv( tdata, params, &(op->ops->subv) ); + libblis_test_scalv( tdata, params, &(op->ops->scalv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); + libblis_test_scalm( tdata, params, &(op->ops->scalm) ); + libblis_test_gemv( tdata, params, &(op->ops->gemv) ); + libblis_test_skmv( tdata, params, &(op->ops->skmv) ); +} + + + +void libblis_test_skmm + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l3_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_skmm_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested. + //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_skmm_experiment ); + } +} + + + +void libblis_test_skmm_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m, n; + dim_t mn_side; + + side_t side; + uplo_t uploa; + conj_t conja; + trans_t transb; + + obj_t alpha, a, b, beta, c; + obj_t c_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to actual dimensions. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_side( pc_str[0], &side ); + bli_param_map_char_to_blis_uplo( pc_str[1], &uploa ); + bli_param_map_char_to_blis_conj( pc_str[2], &conja ); + bli_param_map_char_to_blis_trans( pc_str[3], &transb ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &alpha ); + bli_obj_scalar_init_detached( datatype, &beta ); + + // Create test operands (vectors and/or matrices). + bli_set_dim_with_side( side, m, n, &mn_side ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[1], mn_side, mn_side, &a ); + libblis_test_mobj_create( params, datatype, transb, + sc_str[2], m, n, &b ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, n, &c ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, n, &c_save ); + + // Set alpha and beta. + if ( bli_obj_is_real( &c ) ) + { + bli_setsc( 0.8, 0.0, &alpha ); + bli_setsc( -1.0, 0.0, &beta ); + } + else + { + bli_setsc( 0.8, 0.6, &alpha ); + bli_setsc( -1.0, 1.0, &beta ); + } + + // Set the structure and uplo properties of A. + bli_obj_set_struc( BLIS_SKEW_SYMMETRIC, &a ); + bli_obj_set_uplo( uploa, &a ); + + // Randomize A, make it densely skew-symmetric, and zero the unstored triangle + // to ensure the implementation reads only from the stored region. + libblis_test_mobj_randomize( params, TRUE, &a ); + bli_mkskewsymm( &a ); + bli_mktrim( &a ); + + // Randomize B and C, and save C. + libblis_test_mobj_randomize( params, TRUE, &b ); + libblis_test_mobj_randomize( params, TRUE, &c ); + bli_copym( &c, &c_save ); + + // Apply the remaining parameters. + bli_obj_set_conj( conja, &a ); + bli_obj_set_conjtrans( transb, &b ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &c_save, &c ); + + time = bli_clock(); + + libblis_test_skmm_impl( iface, side, &alpha, &a, &b, &beta, &c ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 2.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &c ) ) *perf *= 4.0; + + // Perform checks. + libblis_test_skmm_check( params, side, &alpha, &a, &b, &beta, &c, &c_save, resid ); + + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + + // Free the test objects. + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); +} + + + +void libblis_test_skmm_impl + ( + iface_t iface, + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_skmm( side, alpha, a, b, beta, c ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_skmm_check + ( + test_params_t* params, + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + obj_t* c_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( c ); + num_t dt_real = bli_obj_dt_proj_to_real( c ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + + obj_t norm; + obj_t t, v, w, z; + + double junk; + + // + // Pre-conditions: + // - a is randomized and skew-symmetric. + // - b is randomized. + // - c_orig is randomized. + // Note: + // - alpha and beta should have non-zero imaginary components in the + // complex cases in order to more fully exercise the implementation. + // + // Under these conditions, we assume that the implementation for + // + // C := beta * C_orig + alpha * conja(A) * transb(B) (side = left) + // C := beta * C_orig + alpha * transb(B) * conja(A) (side = right) + // + // is functioning correctly if + // + // normfv( v - z ) + // + // is negligible, where + // + // v = C * t + // + // z = ( beta * C_orig + alpha * conja(A) * transb(B) ) * t (side = left) + // = beta * C_orig * t + alpha * conja(A) * transb(B) * t + // = beta * C_orig * t + alpha * conja(A) * w + // = beta * C_orig * t + z + // + // z = ( beta * C_orig + alpha * transb(B) * conja(A) ) * t (side = right) + // = beta * C_orig * t + alpha * transb(B) * conja(A) * t + // = beta * C_orig * t + alpha * transb(B) * w + // = beta * C_orig * t + z + + bli_obj_scalar_init_detached( dt_real, &norm ); + + if ( bli_is_left( side ) ) + { + bli_obj_create( dt, n, 1, 0, 0, &t ); + bli_obj_create( dt, m, 1, 0, 0, &v ); + bli_obj_create( dt, m, 1, 0, 0, &w ); + bli_obj_create( dt, m, 1, 0, 0, &z ); + } + else // else if ( bli_is_left( side ) ) + { + bli_obj_create( dt, n, 1, 0, 0, &t ); + bli_obj_create( dt, m, 1, 0, 0, &v ); + bli_obj_create( dt, n, 1, 0, 0, &w ); + bli_obj_create( dt, m, 1, 0, 0, &z ); + } + + libblis_test_vobj_randomize( params, TRUE, &t ); + + bli_gemv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v ); + + if ( bli_is_left( side ) ) + { + bli_gemv( &BLIS_ONE, b, &t, &BLIS_ZERO, &w ); + bli_skmv( alpha, a, &w, &BLIS_ZERO, &z ); + } + else + { + bli_skmv( &BLIS_ONE, a, &t, &BLIS_ZERO, &w ); + bli_gemv( alpha, b, &w, &BLIS_ZERO, &z ); + } + + bli_gemv( beta, c_orig, &t, &BLIS_ONE, &z ); + + bli_subv( &z, &v ); + bli_normfv( &v, &norm ); + bli_getsc( &norm, resid, &junk ); + + bli_obj_free( &t ); + bli_obj_free( &v ); + bli_obj_free( &w ); + bli_obj_free( &z ); +} + diff --git a/testsuite/src/test_skmm.h b/testsuite/src/test_skmm.h new file mode 100644 index 0000000000..e052428ba3 --- /dev/null +++ b/testsuite/src/test_skmm.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Southern Methodist University + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_skmm + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_skmv.c b/testsuite/src/test_skmv.c new file mode 100644 index 0000000000..090c147ba7 --- /dev/null +++ b/testsuite/src/test_skmv.c @@ -0,0 +1,350 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "skmv"; +static char* o_types = "mvv"; // a x y +static char* p_types = "ucc"; // uploa conja conjx +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. +void libblis_test_skmv_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_skmv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_skmv_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y + ); + +void libblis_test_skmv_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y, + obj_t* y_orig, + double* resid + ); + + + +void libblis_test_skmv_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( tdata, params, &(op->ops->randv) ); + libblis_test_randm( tdata, params, &(op->ops->randm) ); + libblis_test_normfv( tdata, params, &(op->ops->normfv) ); + libblis_test_subv( tdata, params, &(op->ops->subv) ); + libblis_test_copyv( tdata, params, &(op->ops->copyv) ); + libblis_test_scalv( tdata, params, &(op->ops->scalv) ); + libblis_test_gemv( tdata, params, &(op->ops->gemv) ); +} + + + +void libblis_test_skmv + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l2_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_skmv_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested. + //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_skmv_experiment ); + } +} + + + +void libblis_test_skmv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m; + + uplo_t uploa; + conj_t conja; + conj_t conjx; + + obj_t alpha, a, x, beta, y; + obj_t y_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to an actual dimension. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_uplo( pc_str[0], &uploa ); + bli_param_map_char_to_blis_conj( pc_str[1], &conja ); + bli_param_map_char_to_blis_conj( pc_str[2], &conjx ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &alpha ); + bli_obj_scalar_init_detached( datatype, &beta ); + + // Create test operands (vectors and/or matrices). + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, m, &a ); + libblis_test_vobj_create( params, datatype, + sc_str[1], m, &x ); + libblis_test_vobj_create( params, datatype, + sc_str[2], m, &y ); + libblis_test_vobj_create( params, datatype, + sc_str[2], m, &y_save ); + + // Set alpha and beta. + if ( bli_obj_is_real( &y ) ) + { + bli_setsc( 1.0, 0.0, &alpha ); + bli_setsc( -1.0, 0.0, &beta ); + } + else + { + bli_setsc( 0.5, 0.5, &alpha ); + bli_setsc( -0.5, 0.5, &beta ); + } + + // Set the structure and uplo properties of A. + bli_obj_set_struc( BLIS_SKEW_SYMMETRIC, &a ); + bli_obj_set_uplo( uploa, &a ); + + // Randomize A, make it densely symmetric, and zero the unstored triangle + // to ensure the implementation reads only from the stored region. + libblis_test_mobj_randomize( params, TRUE, &a ); + bli_mkskewsymm( &a ); + bli_mktrim( &a ); + + // Randomize x and y, and save y. + libblis_test_vobj_randomize( params, TRUE, &x ); + libblis_test_vobj_randomize( params, TRUE, &y ); + bli_copyv( &y, &y_save ); + + // Apply the remaining parameters. + bli_obj_set_conj( conja, &a ); + bli_obj_set_conj( conjx, &x ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &y_save, &y ); + + time = bli_clock(); + + libblis_test_skmv_impl( iface, &alpha, &a, &x, &beta, &y ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 1.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &y ) ) *perf *= 4.0; + + // Perform checks. + libblis_test_skmv_check( params, &alpha, &a, &x, &beta, &y, &y_save, resid ); + + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + + // Free the test objects. + bli_obj_free( &a ); + bli_obj_free( &x ); + bli_obj_free( &y ); + bli_obj_free( &y_save ); +} + + + +void libblis_test_skmv_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_skmv( alpha, a, x, beta, y ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_skmv_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y, + obj_t* y_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( y ); + num_t dt_real = bli_obj_dt_proj_to_real( y ); + + dim_t m = bli_obj_vector_dim( y ); + + obj_t v; + obj_t norm; + + double junk; + + // + // Pre-conditions: + // - a is randomized and skew-symmetric. + // - x is randomized. + // - y_orig is randomized. + // Note: + // - alpha and beta should have non-zero imaginary components in the + // complex cases in order to more fully exercise the implementation. + // + // Under these conditions, we assume that the implementation for + // + // y := beta * y_orig + alpha * conja(A) * conjx(x) + // + // is functioning correctly if + // + // normfv( y - v ) + // + // is negligible, where + // + // v = beta * y_orig + alpha * conja(A_dense) * x + // + + bli_obj_scalar_init_detached( dt_real, &norm ); + + bli_obj_create( dt, m, 1, 0, 0, &v ); + + bli_copyv( y_orig, &v ); + + bli_mkskewsymm( a ); + bli_obj_set_struc( BLIS_GENERAL, a ); + bli_obj_set_uplo( BLIS_DENSE, a ); + + bli_gemv( alpha, a, x, beta, &v ); + + bli_subv( &v, y ); + bli_normfv( y, &norm ); + bli_getsc( &norm, resid, &junk ); + + bli_obj_free( &v ); +} + diff --git a/testsuite/src/test_skmv.h b/testsuite/src/test_skmv.h new file mode 100644 index 0000000000..d9688610f1 --- /dev/null +++ b/testsuite/src/test_skmv.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_skmv + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_skr2.c b/testsuite/src/test_skr2.c new file mode 100644 index 0000000000..7f324e8406 --- /dev/null +++ b/testsuite/src/test_skr2.c @@ -0,0 +1,372 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "skr2"; +static char* o_types = "vvm"; // x y a +static char* p_types = "ucc"; // uploa conjx conjy +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. +void libblis_test_skr2_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_skr2_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_skr2_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* a + ); + +void libblis_test_skr2_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* a, + obj_t* a_orig, + double* resid + ); + + + +void libblis_test_skr2_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( tdata, params, &(op->ops->randv) ); + libblis_test_randm( tdata, params, &(op->ops->randm) ); + libblis_test_normfv( tdata, params, &(op->ops->normfv) ); + libblis_test_subv( tdata, params, &(op->ops->subv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); + libblis_test_scal2v( tdata, params, &(op->ops->scal2v) ); + libblis_test_dotv( tdata, params, &(op->ops->dotv) ); + libblis_test_gemv( tdata, params, &(op->ops->gemv) ); +} + + + +void libblis_test_skr2 + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l2_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_skr2_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested. + //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_skr2_experiment ); + } +} + + + +void libblis_test_skr2_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m; + + uplo_t uploa; + conj_t conjx, conjy; + + obj_t alpha, x, y, a; + obj_t a_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to an actual dimension. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_uplo( pc_str[0], &uploa ); + bli_param_map_char_to_blis_conj( pc_str[1], &conjx ); + bli_param_map_char_to_blis_conj( pc_str[2], &conjy ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &alpha ); + + // Create test operands (vectors and/or matrices). + libblis_test_vobj_create( params, datatype, + sc_str[0], m, &x ); + libblis_test_vobj_create( params, datatype, + sc_str[1], m, &y ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[2], m, m, &a ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[2], m, m, &a_save ); + + // Set alpha. + //bli_copysc( &BLIS_MINUS_ONE, &alpha ); + bli_setsc( -1.0, 1.0, &alpha ); + + // Randomize x and y. + libblis_test_vobj_randomize( params, TRUE, &x ); + libblis_test_vobj_randomize( params, TRUE, &y ); + + // Set the structure and uplo properties of A. + bli_obj_set_struc( BLIS_SKEW_SYMMETRIC, &a ); + bli_obj_set_uplo( uploa, &a ); + + // Randomize A, make it densely symmetric, and zero the unstored triangle + // to ensure the implementation is reads only from the stored region. + libblis_test_mobj_randomize( params, TRUE, &a ); + bli_mkskewsymm( &a ); + bli_mktrim( &a ); + + bli_obj_set_struc( BLIS_SKEW_SYMMETRIC, &a_save ); + bli_obj_set_uplo( uploa, &a_save ); + bli_copym( &a, &a_save ); + + // Apply the remaining parameters. + bli_obj_set_conj( conjx, &x ); + bli_obj_set_conj( conjy, &y ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &a_save, &a ); + + time = bli_clock(); + + libblis_test_skr2_impl( iface, &alpha, &x, &y, &a ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 2.0 * m * m ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &a ) ) *perf *= 4.0; + + // Perform checks. + libblis_test_skr2_check( params, &alpha, &x, &y, &a, &a_save, resid ); + + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &a, perf, resid ); + + // Free the test objects. + bli_obj_free( &x ); + bli_obj_free( &y ); + bli_obj_free( &a ); + bli_obj_free( &a_save ); +} + + + +void libblis_test_skr2_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* a + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_skr2( alpha, x, y, a ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_skr2_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* a, + obj_t* a_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( a ); + num_t dt_real = bli_obj_dt_proj_to_real( a ); + + dim_t m_a = bli_obj_length( a ); + + obj_t xt, yt; + obj_t t, v, w1, w2; + obj_t rho, norm; + + double junk; + + // + // Pre-conditions: + // - x is randomized. + // - y is randomized. + // - a is randomized and skew-symmetric. + // Note: + // - alpha should have a non-zero imaginary component in the + // complex cases in order to more fully exercise the implementation. + // + // Under these conditions, we assume that the implementation for + // + // A := A_orig + alpha * conjx(x) * conjy(y)^T - alpha * conjy(y) * conjx(x)^T + // + // is functioning correctly if + // + // normfv( v - w ) + // + // is negligible, where + // + // v = A * t + // w = ( A_orig + alpha * conjx(x) * conjy(y)^T - alpha * conjy(y) * conjx(x)^T ) * t + // = A_orig * t + alpha * conjx(x) * conjy(y)^T * t - alpha * conjy(y) * conjx(x)^T * t + // = A_orig * t + alpha * conjx(x) * conjy(y)^T * t - alpha * conjy(y) * rho + // = A_orig * t + alpha * conjx(x) * conjy(y)^T * t - w1 + // = A_orig * t + alpha * conjx(x) * rho - w1 + // = A_orig * t + w2 - w1 + // + + bli_mkskewsymm( a ); + bli_mkskewsymm( a_orig ); + bli_obj_set_struc( BLIS_GENERAL, a ); + bli_obj_set_struc( BLIS_GENERAL, a_orig ); + bli_obj_set_uplo( BLIS_DENSE, a ); + bli_obj_set_uplo( BLIS_DENSE, a_orig ); + + bli_obj_scalar_init_detached( dt, &rho ); + bli_obj_scalar_init_detached( dt_real, &norm ); + + bli_obj_create( dt, m_a, 1, 0, 0, &t ); + bli_obj_create( dt, m_a, 1, 0, 0, &v ); + bli_obj_create( dt, m_a, 1, 0, 0, &w1 ); + bli_obj_create( dt, m_a, 1, 0, 0, &w2 ); + + bli_obj_alias_to( x, &xt ); + bli_obj_alias_to( y, &yt ); + + libblis_test_vobj_randomize( params, TRUE, &t ); + + bli_gemv( &BLIS_ONE, a, &t, &BLIS_ZERO, &v ); + + bli_dotv( &xt, &t, &rho ); + bli_mulsc( alpha, &rho ); + bli_scal2v( &rho, y, &w1 ); + + bli_dotv( &yt, &t, &rho ); + bli_mulsc( alpha, &rho ); + bli_scal2v( &rho, x, &w2 ); + + bli_xpbyv( &w2, &BLIS_MINUS_ONE, &w1 ); + + bli_gemv( &BLIS_ONE, a_orig, &t, &BLIS_ONE, &w1 ); + + bli_subv( &w1, &v ); + bli_normfv( &v, &norm ); + bli_getsc( &norm, resid, &junk ); + + bli_obj_free( &t ); + bli_obj_free( &v ); + bli_obj_free( &w1 ); + bli_obj_free( &w2 ); +} + diff --git a/testsuite/src/test_skr2.h b/testsuite/src/test_skr2.h new file mode 100644 index 0000000000..fa7db47d7d --- /dev/null +++ b/testsuite/src/test_skr2.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_skr2 + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_skr2k.c b/testsuite/src/test_skr2k.c new file mode 100644 index 0000000000..15ddb028b6 --- /dev/null +++ b/testsuite/src/test_skr2k.c @@ -0,0 +1,380 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "skr2k"; +static char* o_types = "mmm"; // a b c +static char* p_types = "uhh"; // uploc transa transb +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. +void libblis_test_skr2k_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_skr2k_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_skr2k_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +void libblis_test_skr2k_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + obj_t* c_orig, + double* resid + ); + + + +void libblis_test_skr2k_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( tdata, params, &(op->ops->randv) ); + libblis_test_randm( tdata, params, &(op->ops->randm) ); + libblis_test_setv( tdata, params, &(op->ops->setv) ); + libblis_test_normfv( tdata, params, &(op->ops->normfv) ); + libblis_test_subv( tdata, params, &(op->ops->subv) ); + libblis_test_scalv( tdata, params, &(op->ops->scalv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); + libblis_test_scalm( tdata, params, &(op->ops->scalm) ); + libblis_test_gemv( tdata, params, &(op->ops->gemv) ); + libblis_test_skmv( tdata, params, &(op->ops->skmv) ); +} + + + +void libblis_test_skr2k + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l3_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_skr2k_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested. + //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_skr2k_experiment ); + } +} + + + +void libblis_test_skr2k_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m, k; + + uplo_t uploc; + trans_t transa, transb; + + obj_t alpha, a, b, beta, c; + obj_t c_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to actual dimensions. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_uplo( pc_str[0], &uploc ); + bli_param_map_char_to_blis_trans( pc_str[1], &transa ); + bli_param_map_char_to_blis_trans( pc_str[2], &transb ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &alpha ); + bli_obj_scalar_init_detached( datatype, &beta ); + + // Create test operands (vectors and/or matrices). + libblis_test_mobj_create( params, datatype, transa, + sc_str[1], m, k, &a ); + libblis_test_mobj_create( params, datatype, transb, + sc_str[2], m, k, &b ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, m, &c ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, m, &c_save ); + + // Set alpha and beta. + if ( bli_obj_is_real( &c ) ) + { + bli_setsc( 0.8, 0.0, &alpha ); + bli_setsc( -1.0, 0.0, &beta ); + } + else + { + // For syr2k, both alpha and beta may be complex since, unlike her2k, + // C is symmetric in both the real and complex cases. + bli_setsc( 0.8, 0.5, &alpha ); + bli_setsc( -1.0, 0.5, &beta ); + } + + // Randomize A and B. + libblis_test_mobj_randomize( params, TRUE, &a ); + libblis_test_mobj_randomize( params, TRUE, &b ); + + // Set the structure and uplo properties of C. + bli_obj_set_struc( BLIS_SKEW_SYMMETRIC, &c ); + bli_obj_set_uplo( uploc, &c ); + + // Randomize A, make it densely symmetric, and zero the unstored triangle + // to ensure the implementation is reads only from the stored region. + libblis_test_mobj_randomize( params, TRUE, &c ); + bli_mkskewsymm( &c ); + bli_mktrim( &c ); + + // Save C and set its structure and uplo properties. + bli_obj_set_struc( BLIS_SKEW_SYMMETRIC, &c_save ); + bli_obj_set_uplo( uploc, &c_save ); + bli_copym( &c, &c_save ); + + // Apply the remaining parameters. + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_conjtrans( transb, &b ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &c_save, &c ); + + time = bli_clock(); + + libblis_test_skr2k_impl( iface, &alpha, &a, &b, &beta, &c ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 2.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &c ) ) *perf *= 4.0; + + // Perform checks. + libblis_test_skr2k_check( params, &alpha, &a, &b, &beta, &c, &c_save, resid ); + + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &c, perf, resid ); + + // Free the test objects. + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); +} + + + +void libblis_test_skr2k_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_skr2k( alpha, a, b, beta, c ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_skr2k_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + obj_t* c_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( c ); + num_t dt_real = bli_obj_dt_proj_to_real( c ); + + dim_t m = bli_obj_length( c ); + dim_t k = bli_obj_width_after_trans( a ); + + obj_t at, bt; + obj_t norm; + obj_t t, v, w1, w2, z; + + double junk; + + // + // Pre-conditions: + // - a is randomized. + // - b is randomized. + // - c_orig is randomized and skew-symmetric. + // Note: + // - alpha and beta should have non-zero imaginary components in the + // complex cases in order to more fully exercise the implementation. + // + // Under these conditions, we assume that the implementation for + // + // C := beta * C_orig + alpha * transa(A) * transb(B)^T - alpha * transb(B) * transa(A)^T + // + // is functioning correctly if + // + // normfv( v - z ) + // + // is negligible, where + // + // v = C * t + // z = ( beta * C_orig + alpha * transa(A) * transb(B)^T - alpha * transb(B) * transa(A)^T ) * t + // = beta * C_orig * t + alpha * transa(A) * transb(B)^T * t - alpha * transb(B) * transa(A)^T * t + // = beta * C_orig * t + alpha * transa(A) * transb(B)^T * t - alpha * transb(B) * w2 + // = beta * C_orig * t + alpha * transa(A) * w1 - alpha * transb(B) * w2 + // = beta * C_orig * t + alpha * transa(A) * w1 - z + // = beta * C_orig * t + z + // + + bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at ); + bli_obj_alias_with_trans( BLIS_TRANSPOSE, b, &bt ); + + bli_obj_scalar_init_detached( dt_real, &norm ); + + bli_obj_create( dt, m, 1, 0, 0, &t ); + bli_obj_create( dt, m, 1, 0, 0, &v ); + bli_obj_create( dt, k, 1, 0, 0, &w1 ); + bli_obj_create( dt, k, 1, 0, 0, &w2 ); + bli_obj_create( dt, m, 1, 0, 0, &z ); + + libblis_test_vobj_randomize( params, TRUE, &t ); + + bli_skmv( &BLIS_ONE, c, &t, &BLIS_ZERO, &v ); + + bli_gemv( &BLIS_ONE, &at, &t, &BLIS_ZERO, &w2 ); + bli_gemv( &BLIS_ONE, &bt, &t, &BLIS_ZERO, &w1 ); + bli_gemv( alpha, b, &w2, &BLIS_ZERO, &z ); + bli_gemv( alpha, a, &w1, &BLIS_MINUS_ONE, &z ); + bli_skmv( beta, c_orig, &t, &BLIS_ONE, &z ); + + bli_subv( &z, &v ); + bli_normfv( &v, &norm ); + bli_getsc( &norm, resid, &junk ); + + bli_obj_free( &t ); + bli_obj_free( &v ); + bli_obj_free( &w1 ); + bli_obj_free( &w2 ); + bli_obj_free( &z ); +} + diff --git a/testsuite/src/test_skr2k.h b/testsuite/src/test_skr2k.h new file mode 100644 index 0000000000..949eaa599c --- /dev/null +++ b/testsuite/src/test_skr2k.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_skr2k + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); +