From 2643db0bc20c0da5fcfcfb5179599b713fb3ac97 Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Wed, 12 Feb 2020 12:32:36 +0530 Subject: [PATCH] Made framework changes to initialize specific cache block sizes for TRSM. Details: -This commit addresses the performance optimization(single-thread and multi-thread) for DTRSM on zen2. -This new optimization employs different MC, KC & NC values for TRSM than what is being used in other Level-3 routines like DGEMM. -Changed TRSM framework code to choose these blocksizes for TRSM on zen family configurations. -Added a new field called "trsm_blkszs" to cntx structure in order to store TRSM specific block sizes. -Implemented routines to initialize, set and query the TRSM-specific block sizes. -Defined a new macro "AOCL_BLIS_ZEN" in configure script. This macro is automatically defined for zen family architectures. It enables us to choose different cache block sizes for TRSM instead of common level-3 block sizes. Change-Id: Id8557b1c962a316b1edecca9cd582675eaf35fe6 Signed-off-by: Meghana Vankadari AMD-Internal: [CPUPL-656] --- build/bli_config.h.in | 6 ++ config/zen/bli_cntx_init_zen.c | 14 ++++ config/zen2/bli_cntx_init_zen2.c | 36 ++++++--- configure | 13 ++++ frame/3/bli_l3_blocksize.c | 6 +- frame/3/trsm/bli_trsm_blk_var1.c | 10 ++- frame/3/trsm/bli_trsm_blk_var2.c | 9 ++- frame/base/bli_blksz.c | 73 +++++++++++++++++ frame/base/bli_blksz.h | 33 ++++++++ frame/base/bli_cntx.c | 108 +++++++++++++++++++++++++- frame/base/bli_cntx.h | 17 +++- frame/include/bli_kernel_macro_defs.h | 6 ++ frame/include/bli_type_defs.h | 2 + 13 files changed, 314 insertions(+), 19 deletions(-) diff --git a/build/bli_config.h.in b/build/bli_config.h.in index fa6bbbe12e..df7759faed 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -51,6 +51,12 @@ #define BLIS_DISABLE_SYSTEM #endif +//This macro is enabled only for ZEN family configurations. 
+//This enables us to use different cache-blocking sizes for TRSM instead of common level-3 cache-block sizes. +#if @enable_aocl_zen@ +#define AOCL_BLIS_ZEN +#endif + #if @enable_openmp@ #define BLIS_ENABLE_OPENMP #endif diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index ed7287cee0..eb41d018ec 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -202,6 +202,20 @@ void bli_cntx_init_zen( cntx_t* cntx ) cntx ); + // Update the context with the current architecture's register and cache + // blocksizes for level-3 TRSM execution. + bli_cntx_set_trsm_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + // ------------------------------------------------------------------------- // Initialize sup thresholds with architecture-appropriate values. diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 0964ce463e..2d92702882 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -174,17 +174,31 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Initialize sup thresholds with architecture-appropriate values. - // s d c z -#if 1 - bli_blksz_init_easy( &thresh[ BLIS_MT ], 500, 249, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 500, 249, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 500, 249, -1, -1 ); -#else - bli_blksz_init_easy( &thresh[ BLIS_MT ], 100000, 100000, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 100000, 100000, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 100000, 100000, -1, -1 ); -#endif + //Initialize TRSM blocksize objects with architecture-specific values. + //Using different cache block sizes for TRSM instead of common level-3 block sizes. + //Tuning is done for double-precision only. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); + + // Update the context with the current architecture's register and cache + // blocksizes for level-3 TRSM problems. + bli_cntx_set_trsm_blkszs + ( + 5, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); + + // Initialize sup thresholds with architecture-appropriate values. s d c z + bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 100, -1, -1 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 120, -1, -1 ); // Initialize the context with the sup thresholds. bli_cntx_set_l3_sup_thresh diff --git a/configure b/configure index 3c865dad90..5288d0b815 100755 --- a/configure +++ b/configure @@ -3282,6 +3282,18 @@ main() uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]') config_name_define="#define BLIS_FAMILY_${uconf}\n" + # Create an AOCL-specific #define. + #This macro is enabled only for zen family configurations. + #This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes. + uconf=$(echo ${config_name} | grep -c 'zen' | cut -d. -f1) + if [[ $uconf == 1 ]]; then + enable_aocl_zen='yes' + enable_aocl_zen_01=1 + else + enable_aocl_zen='no' + enable_aocl_zen_01=0 + fi + # Create a list of #defines, one for each configuration in config_list.
config_list_defines="" for conf in ${config_list}; do @@ -3395,6 +3407,7 @@ main() | perl -pe "s/\@config_list_defines\@/${config_list_defines}/g" \ | perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \ | sed -e "s/@enable_system@/${enable_system_01}/g" \ + | sed -e "s/\@enable_aocl_zen\@/${enable_aocl_zen_01}/g" \ | sed -e "s/@enable_openmp@/${enable_openmp_01}/g" \ | sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \ | sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \ diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 58b658d1d8..b461e591a5 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -1,10 +1,11 @@ -/* + /* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -34,7 +35,6 @@ #include "blis.h" - dim_t bli_l3_determine_kc ( dir_t direct, @@ -311,7 +311,7 @@ dim_t PASTEMAC0(opname) \ /* Extract the execution datatype and use it to query the corresponding blocksize and blocksize maximum values from the blksz_t object. */ \ dt = bli_obj_exec_dt( a ); \ - bsize = bli_cntx_get_blksz( bszid, cntx ); \ + bsize = TRSM_BLKSZ_FUNC( bszid, cntx ); \ b_alg = bli_blksz_get_def( dt, bsize ); \ b_max = bli_blksz_get_max( dt, bsize ); \ \ diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 578c37c329..2554197f7c 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2020, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -80,9 +80,15 @@ void bli_trsm_blk_var1 { obj_t a11_1, c1_1; + //For zen architectures, TRSM uses different MC, KC and NC blocking sizes than other Level-3 routines. + //Hence calling a different function to query TRSM-specific block sizes for zen family. +#ifdef AOCL_BLIS_ZEN + b_alg = bli_determine_blocksize_trsm( direct, i, my_end, &a11, + bli_cntl_bszid( cntl ), cntx ); +#else b_alg = bli_determine_blocksize( direct, i, my_end, &a11, bli_cntl_bszid( cntl ), cntx ); - +#endif // Acquire partitions for A1 and C1. bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, &a11, &a11_1 ); diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 23fd3ed4ca..c0ccad433a 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -67,8 +67,15 @@ void bli_trsm_blk_var2 for ( dim_t i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. + //For zen family, TRSM uses different MC, KC and NC blocksizes than Level-3 routines. + //Hence calling a different function to query TRSM-specific block sizes for zen family. +#ifdef AOCL_BLIS_ZEN + b_alg = bli_determine_blocksize_trsm( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); +#else b_alg = bli_determine_blocksize( direct, i, my_end, b, bli_cntl_bszid( cntl ), cntx ); +#endif // Acquire partitions for B1 and C1. 
bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 524653d743..2050c665bb 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -301,6 +302,78 @@ dim_t bli_determine_blocksize_b return b_use; } +#ifdef AOCL_BLIS_ZEN + +dim_t bli_determine_blocksize_trsm + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) +{ + if ( direct == BLIS_FWD ) + return bli_determine_blocksize_trsm_f( i, dim, obj, bszid, cntx ); + else + return bli_determine_blocksize_trsm_b( i, dim, obj, bszid, cntx ); +} + +dim_t bli_determine_blocksize_trsm_f + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) +{ + num_t dt; + blksz_t* bsize; + dim_t b_alg, b_max; + dim_t b_use; + + // Extract the execution datatype and use it to query the corresponding + // blocksize and blocksize maximum values from the blksz_t object. + dt = bli_obj_exec_dt( obj ); + bsize = bli_cntx_get_trsm_blksz( bszid, cntx ); + b_alg = bli_blksz_get_def( dt, bsize ); + b_max = bli_blksz_get_max( dt, bsize ); + + b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); + + return b_use; +} + +dim_t bli_determine_blocksize_trsm_b + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) +{ + num_t dt; + blksz_t* bsize; + dim_t b_alg, b_max; + dim_t b_use; + + // Extract the execution datatype and use it to query the corresponding + // blocksize and blocksize maximum values from the blksz_t object. 
+ dt = bli_obj_exec_dt( obj ); + bsize = bli_cntx_get_trsm_blksz( bszid, cntx ); + b_alg = bli_blksz_get_def( dt, bsize ); + b_max = bli_blksz_get_max( dt, bsize ); + + b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); + + return b_use; +} + +#endif + dim_t bli_determine_blocksize_f_sub ( dim_t i, diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index 2e0fefeae9..dab3130f65 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -278,6 +279,38 @@ dim_t bli_determine_blocksize_b cntx_t* cntx ); +#ifdef AOCL_BLIS_ZEN + +dim_t bli_determine_blocksize_trsm + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); + +dim_t bli_determine_blocksize_trsm_f + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); + +dim_t bli_determine_blocksize_trsm_b + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); + +#endif + dim_t bli_determine_blocksize_f_sub ( dim_t i, diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 7c408ce8eb..141a92f054 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1271,6 +1271,112 @@ void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ) // ----------------------------------------------------------------------------- +#ifdef AOCL_BLIS_ZEN + +void bli_cntx_set_trsm_blkszs( dim_t n_bs, ... 
) +{ + // This function should be called from the bli_cntx_init_*() function for + // zen family architectures to set TRSM blocksizes. It should be called after + // bli_cntx_init_defaults() so that the context begins with default + // blocksizes across all datatypes. + + /* Example prototypes: + + void bli_cntx_set_trsm_blkszs + ( + dim_t n_bs, + bszid_t bs0_id, blksz_t* blksz0, + bszid_t bs1_id, blksz_t* blksz1, + bszid_t bs2_id, blksz_t* blksz2, + ... + cntx_t* cntx + ); + */ + + va_list args; + dim_t i; + err_t rval; + + // Allocate some temporary local arrays. + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_trsm_blkszs(): " ); + #endif + bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &rval ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_trsm_blkszs(): " ); + #endif + blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &rval ); + + // -- Begin variable argument section -- + + // Initialize variable argument environment. + va_start( args, n_bs ); + + // Process n_bs tuples. + for ( i = 0; i < n_bs; ++i ) + { + // Here, we query the variable argument list for: + // - the bszid_t of the blocksize we're about to process, + // - the address of the blksz_t object. + bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); + blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); + + // Store the values in our temporary arrays. + bszids[ i ] = bs_id; + blkszs[ i ] = blksz; + } + + // The last argument should be the context pointer. + cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); + + // Shutdown variable argument environment and clean up stack. + va_end( args ); + + // -- End variable argument section -- + + // Query the context for the addresses of: + // - the blocksize object array + blksz_t* cntx_l3_trsm_blkszs = bli_cntx_trsm_blkszs_buf( cntx ); + + // Now that we have the context address, we want to copy the values + // from the temporary buffers into the corresponding buffers in the + // context.
Notice that the blksz_t* pointers were saved, rather than + // the objects themselves, but we copy the contents of the objects + // when copying into the context. + + // Process each blocksize id tuple provided. + for ( i = 0; i < n_bs; ++i ) + { + // Read the current blocksize id and the blksz_t* pointer that + // were saved for this tuple. + bszid_t bs_id = bszids[ i ]; + blksz_t* blksz = blkszs[ i ]; + + blksz_t* cntx_l3_trsm_blksz = &cntx_l3_trsm_blkszs[ bs_id ]; + + // Copy the blksz_t object contents into the appropriate + // location within the context's blksz_t array. + //cntx_trsm_blkszs[ bs_id ] = *blksz; + //bli_blksz_copy( blksz, cntx_trsm_blksz ); + bli_blksz_copy_if_pos( blksz, cntx_l3_trsm_blksz ); + } + + // Free the temporary local arrays. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_trsm_blkszs(): " ); + #endif + bli_free_intl( blkszs ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_trsm_blkszs(): " ); + #endif + bli_free_intl( bszids ); +} +#endif +// ----------------------------------------------------------------------------- + void bli_cntx_set_l1f_kers( dim_t n_kers, ... ) { // This function can be called from the bli_cntx_init_*() function for diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 76350f6bcf..0a791eda7b 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -6,7 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -85,6 +85,10 @@ BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } +BLIS_INLINE blksz_t* bli_cntx_trsm_blkszs_buf( cntx_t* cntx ) +{ + return cntx->trsm_blkszs; +} BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; @@ -301,6 +305,16 @@ BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) return blksz; } +BLIS_INLINE blksz_t* bli_cntx_get_trsm_blksz( bszid_t bs_id, cntx_t* cntx ) +{ + blksz_t* blkszs = bli_cntx_trsm_blkszs_buf( cntx ); + blksz_t* blksz = &blkszs[ bs_id ]; + + // Return the address of the blksz_t identified by bs_id. + return blksz; + +} + BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); @@ -704,6 +718,7 @@ BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_trsm_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index d2487584e7..b244eb183f 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -246,6 +247,11 @@ #endif +#ifdef AOCL_BLIS_ZEN +#define TRSM_BLKSZ_FUNC bli_cntx_get_trsm_blksz +#else +#define TRSM_BLKSZ_FUNC bli_cntx_get_blksz +#endif #endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index fe030f193f..c02008d80c 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1506,6 +1506,8 @@ typedef struct cntx_s blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; + blksz_t trsm_blkszs[ BLIS_NUM_BLKSZS ]; + func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ];