diff --git a/frame/1m/bli_l1m_ker_ft.h b/frame/1m/bli_l1m_ker_ft.h index 6080d1ebe..0bb586fb2 100644 --- a/frame/1m/bli_l1m_ker_ft.h +++ b/frame/1m/bli_l1m_ker_ft.h @@ -51,6 +51,7 @@ typedef void (*PASTECH(opname,_ker_ft)) \ GENTDEF( packm ) GENTDEF( packm_cxk ) +GENTDEF( packmd_cxk ) GENTDEF( unpackm_cxk ) GENTDEF( packm_cxc_diag ) diff --git a/frame/1m/bli_l1m_ker_params.h b/frame/1m/bli_l1m_ker_params.h index ca89bc9db..20fb8eb82 100644 --- a/frame/1m/bli_l1m_ker_params.h +++ b/frame/1m/bli_l1m_ker_params.h @@ -81,6 +81,21 @@ void* p, inc_t ldp, \ const void* params \ +#define packmd_cxk_params \ +\ + conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ + dim_t cdim_max, \ + dim_t cdim_bcast, \ + dim_t n, \ + dim_t n_max, \ + const void* kappa, \ + const void* a, inc_t inca, inc_t lda, \ + const void* d, inc_t incd, \ + void* p, inc_t ldp, \ + const void* params \ + // unpackm_cxk kernel diff --git a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h index 8cc9a6934..dbe5d1424 100644 --- a/frame/1m/bli_l1m_ker_prot.h +++ b/frame/1m/bli_l1m_ker_prot.h @@ -62,6 +62,7 @@ void PASTEMAC(chx,chy,funcname) \ #define PACKM_DIAG_KER_PROT( ctype, ch, fn ) L1MTPROT( ctype, ch, fn, packm_cxc_diag ); #define PACKM_KER_PROT2( ctypex, ctypey, chx, chy, fn ) L1MTPROT2( ctypex, ctypey, chx, chy, fn, packm_cxk ); +#define PACKMD_KER_PROT2( ctypex, ctypey, chx, chy, fn ) L1MTPROT2( ctypex, ctypey, chx, chy, fn, packmd_cxk ); #define UNPACKM_KER_PROT2( ctypex, ctypey, chx, chy, fn ) L1MTPROT2( ctypex, ctypey, chx, chy, fn, unpackm_cxk ); #define PACKM_DIAG_KER_PROT2( ctypex, ctypey, chx, chy, fn ) L1MTPROT2( ctypex, ctypey, chx, chy, fn, packm_cxc_diag ); diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 87e38d5de..e0966597c 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -41,6 +41,7 @@ #include "bli_packm_scalar.h" #include "bli_packm_struc_cxk.h" +#include "bli_packmd_struc_cxk.h" #include "bli_packm_blk_var1.h" diff --git 
a/frame/1m/packm/bli_packmd_struc_cxk.c b/frame/1m/packm/bli_packmd_struc_cxk.c new file mode 100644 index 000000000..ecc6ed667 --- /dev/null +++ b/frame/1m/packm/bli_packmd_struc_cxk.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNC2RO +#define GENTFUNC2RO( ctypec_r, ctypec, ctypep_r, ctypep, chc_r, chc, chp_r, chp, varname ) \ +GENTFUNC2RO_( ctypec_r, ctypec_r, ctypep_r, ctypep_r, chc_r, chc_r, chp_r, chp_r, varname ) \ +GENTFUNC2RO_( ctypec_r, ctypec, ctypep_r, ctypep, chc_r, chc, chp_r, chp, varname ) + +#undef GENTFUNC2RO_ +#define GENTFUNC2RO_( ctypec_r, ctypec, ctypep_r, ctypep, chc_r, chc, chp_r, chp, varname ) \ +\ +void PASTEMAC(chc,chp,varname) \ + ( \ + struc_t strucc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ + dim_t panel_bcast, \ + const void* kappa, \ + const void* c, inc_t incc, inc_t ldc, \ + void* p, inc_t ldp, \ + const void* params_, \ + const cntx_t* cntx \ + ) \ +{ \ + num_t dt_c = PASTEMAC(chc,type); \ + num_t dt_p = PASTEMAC(chp,type); \ + dim_t dt_c_size = bli_dt_size( dt_c ); \ +\ + ukr_t cxk_ker_id = BLIS_PACKMD_KER; \ +\ + packmd_cxk_ker_ft f_cxk = bli_cntx_get_ukr2_dt( dt_c, dt_p, cxk_ker_id, cntx ); \ +\ + const gemmd_params* params = ( const gemmd_params* )params_; \ +\ + inc_t incd = params->incd; \ + const char* d = ( const char* )params->d + panel_len_off*incd*dt_c_size; \ +\ + /* For general matrices, pack and return early */ \ + if ( bli_is_general( strucc ) ) \ + { \ + f_cxk \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_dim_max, \ + panel_bcast, \ + panel_len, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + d, incd, \ + p, ldp, \ + params, \ + cntx \ + ); \ + return; \ + } \ +\ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +} + +INSERT_GENTFUNC2RO( packmd_struc_cxk ) +INSERT_GENTFUNC2RO_MIX_P( packmd_struc_cxk ) + diff --git a/frame/1m/packm/bli_packmd_struc_cxk.h b/frame/1m/packm/bli_packmd_struc_cxk.h new file mode 100644 index 000000000..037f8ef75 --- /dev/null +++ 
b/frame/1m/packm/bli_packmd_struc_cxk.h @@ -0,0 +1,70 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + +typedef struct gemmd_params +{ + const void* d; + inc_t incd; +} gemmd_params; + + +#undef GENTPROT2 +#define GENTPROT2( ctypec, ctypep, chc, chp, varname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(chc,chp,varname) \ + ( \ + struc_t strucc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ + dim_t panel_bcast, \ + const void* kappa, \ + const void* c, inc_t incc, inc_t ldc, \ + void* p, inc_t ldp, \ + const void* params, \ + const cntx_t* cntx \ + ); + +INSERT_GENTPROT2_BASIC( packmd_struc_cxk ) +INSERT_GENTPROT2_MIX_P( packmd_struc_cxk ) + diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 93146bc18..a5312ea2b 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -92,6 +92,110 @@ void bli_gemmt_check bli_check_error_code( e_val ); } +void bli_gemdm_check + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx + ) +{ + err_t e_val; + + // Check basic properties of the operation. + + bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); + + // Check object structure. 
+ + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( c ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( d ); + bli_check_error_code( e_val ); + + // Check vector properties + + e_val = bli_check_vector_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) ); + bli_check_error_code( e_val ); + + // Check data types + + e_val = bli_check_consistent_object_datatypes( a, b ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, d ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, c ); + bli_check_error_code( e_val ); +} + +void bli_gemdmt_check + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx + ) +{ + err_t e_val; + + // Check basic properties of the operation. + + bli_gemmt_basic_check( alpha, a, b, beta, c, cntx ); + + // Check matrix squareness. + + e_val = bli_check_square_object( c ); + bli_check_error_code( e_val ); + + // Check object structure. 
+ + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( d ); + bli_check_error_code( e_val ); + + // Check vector properties + + e_val = bli_check_vector_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) ); + bli_check_error_code( e_val ); + + // Check data types + + e_val = bli_check_consistent_object_datatypes( a, b ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, d ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, c ); + bli_check_error_code( e_val ); +} + void bli_hemm_check ( side_t side, @@ -210,6 +314,133 @@ void bli_her2k_check bli_check_error_code( e_val ); } +void bli_herkd_check + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx + ) +{ + err_t e_val; + obj_t ah; + + // Alias A to A^H so we can perform dimension checks. + bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah ); + + // Check basic properties of the operation. + + bli_herk_basic_check( alpha, a, &ah, beta, c, cntx ); + + // Check matrix squareness. + + e_val = bli_check_square_object( c ); + bli_check_error_code( e_val ); + + // Check matrix structure. + + e_val = bli_check_hermitian_object( c ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( d ); + bli_check_error_code( e_val ); + + // Check for real-valued alpha and beta. 
+ + e_val = bli_check_real_valued_object( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_real_valued_object( beta ); + bli_check_error_code( e_val ); + + // Check vector properties + + e_val = bli_check_vector_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) ); + bli_check_error_code( e_val ); + + // Check data types + + e_val = bli_check_consistent_object_datatypes( a, d ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, c ); + bli_check_error_code( e_val ); +} + +void bli_her2kd_check + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx + ) +{ + err_t e_val; + obj_t ah, bh; + + // Alias A and B to A^H and B^H so we can perform dimension checks. + bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah ); + bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, b, &bh ); + + // Check basic properties of the operation. + + bli_her2k_basic_check( alpha, a, &bh, b, &ah, beta, c, cntx ); + + // Check matrix squareness. + + e_val = bli_check_square_object( c ); + bli_check_error_code( e_val ); + + // Check matrix structure. + + e_val = bli_check_hermitian_object( c ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( d ); + bli_check_error_code( e_val ); + + // Check for real-valued beta. 
+ + e_val = bli_check_real_valued_object( beta ); + bli_check_error_code( e_val ); + + // Check vector properties + + e_val = bli_check_vector_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) ); + bli_check_error_code( e_val ); + + // Check data types + + e_val = bli_check_consistent_object_datatypes( a, b ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, d ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, c ); + bli_check_error_code( e_val ); +} + void bli_symm_check ( side_t side, @@ -315,6 +546,120 @@ void bli_syr2k_check bli_check_error_code( e_val ); } +void bli_syrkd_check + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx + ) +{ + err_t e_val; + obj_t at; + + // Alias A to A^T so we can perform dimension checks. + bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at ); + + // Check basic properties of the operation. + + bli_herk_basic_check( alpha, a, &at, beta, c, cntx ); + + // Check matrix squareness. + + e_val = bli_check_square_object( c ); + bli_check_error_code( e_val ); + + // Check matrix structure. 
+ + e_val = bli_check_symmetric_object( c ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( d ); + bli_check_error_code( e_val ); + + // Check vector properties + + e_val = bli_check_vector_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) ); + bli_check_error_code( e_val ); + + // Check data types + + e_val = bli_check_consistent_object_datatypes( a, d ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, c ); + bli_check_error_code( e_val ); +} + +void bli_syr2kd_check + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx + ) +{ + err_t e_val; + obj_t at, bt; + + // Alias A and B to A^T and B^T so we can perform dimension checks. + bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at ); + bli_obj_alias_with_trans( BLIS_TRANSPOSE, b, &bt ); + + // Check basic properties of the operation. + + bli_her2k_basic_check( alpha, a, &bt, b, &at, beta, c, cntx ); + + // Check matrix squareness. + + e_val = bli_check_square_object( c ); + bli_check_error_code( e_val ); + + // Check matrix structure. 
+ + e_val = bli_check_symmetric_object( c ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( d ); + bli_check_error_code( e_val ); + + // Check vector properties + + e_val = bli_check_vector_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) ); + bli_check_error_code( e_val ); + + // Check data types + + e_val = bli_check_consistent_object_datatypes( a, b ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, d ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, c ); + bli_check_error_code( e_val ); +} + void bli_trmm3_check ( side_t side, diff --git a/frame/3/bli_l3_check.h b/frame/3/bli_l3_check.h index ef59cb60f..3d22eaf36 100644 --- a/frame/3/bli_l3_check.h +++ b/frame/3/bli_l3_check.h @@ -56,6 +56,26 @@ GENPROT( her2k ) GENPROT( syr2k ) +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* d, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx \ + ); + +GENPROT( gemdm ) +GENPROT( gemdmt ) +GENPROT( her2kd ) +GENPROT( syr2kd ) + + #undef GENPROT #define GENPROT( opname ) \ \ @@ -91,6 +111,23 @@ GENPROT( herk ) GENPROT( syrk ) +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* d, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx \ + ); + +GENPROT( herkd ) +GENPROT( syrkd ) + + #undef GENPROT #define GENPROT( opname ) \ \ diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index 2344b9eb8..953b5ecd0 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -60,6 +60,29 @@ GENFRONT( gemmt ) GENFRONT( her2k 
) GENFRONT( syr2k ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname) \ + ( \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* d, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c \ + ) \ +{ \ + /* Invoke the expert interface and request default cntx_t and rntm_t + objects. */ \ + PASTEMAC(opname,_ex)( alpha, a, d, b, beta, c, NULL, NULL ); \ +} + +GENFRONT( gemdm ) +GENFRONT( gemdmt ) +GENFRONT( her2kd ) +GENFRONT( syr2kd ) + #undef GENFRONT #define GENFRONT( opname ) \ @@ -104,6 +127,27 @@ GENFRONT( herk ) GENFRONT( syrk ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname) \ + ( \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* d, \ + const obj_t* beta, \ + const obj_t* c \ + ) \ +{ \ + /* Invoke the expert interface and request default cntx_t and rntm_t + objects. */ \ + PASTEMAC(opname,_ex)( alpha, a, d, beta, c, NULL, NULL ); \ +} + +GENFRONT( herkd ) +GENFRONT( syrkd ) + + #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/3/bli_l3_oapi.h b/frame/3/bli_l3_oapi.h index 12fb68e97..23ac98d1c 100644 --- a/frame/3/bli_l3_oapi.h +++ b/frame/3/bli_l3_oapi.h @@ -56,6 +56,25 @@ GENPROT( her2k ) GENPROT( syr2k ) +#undef GENPROT +#define GENPROT( opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(opname) \ + ( \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* d, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c \ + ); + +GENPROT( gemdm ) +GENPROT( gemdmt ) +GENPROT( her2kd ) +GENPROT( syr2kd ) + + #undef GENPROT #define GENPROT( opname ) \ \ @@ -89,6 +108,22 @@ GENPROT( herk ) GENPROT( syrk ) +#undef GENPROT +#define GENPROT( opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(opname) \ + ( \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* d, \ + const obj_t* beta, \ + const obj_t* c \ + ); + +GENPROT( herkd ) +GENPROT( syrkd ) + + #undef GENPROT #define GENPROT( opname ) \ \ diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index 
fcb51bb45..afc6d559f 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -35,6 +35,9 @@ #include "blis.h" +static packm_ker_ft GENARRAY2_MIXP(packmd_struc_cxk,packmd_struc_cxk); + + // // Define object-based interfaces (expert). // @@ -147,6 +150,84 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) } +void PASTEMAC(gemdm,BLIS_OAPI_EX_SUF) + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + bli_init_once(); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_gemdm_check( alpha, a, d, b, beta, c, cntx ); + + // Check for zero dimensions, alpha == 0, or other conditions which + // mean that we don't actually have to perform a full l3 operation. + if ( bli_l3_return_early_if_trivial( alpha, a, b, beta, c ) == BLIS_SUCCESS ) + return; + + // Default to using native execution. + ind_t im = BLIS_NAT; + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + // Alias A, B, and C in case we need to apply transformations. 
+ obj_t a_local; + obj_t b_local; + obj_t c_local; + bli_obj_alias_submatrix( a, &a_local ); + bli_obj_alias_submatrix( b, &b_local ); + bli_obj_alias_submatrix( c, &c_local ); + + gemm_cntl_t cntl; + bli_gemm_cntl_init + ( + im, + BLIS_GEMM, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + &cntl + ); + + gemmd_params params; + params.d = bli_obj_buffer_at_off( d ); + params.incd = bli_obj_vector_inc( d ); + + func_t packmd; + bli_func_set_dt( packmd_struc_cxk[BLIS_FLOAT ][BLIS_FLOAT ], BLIS_FLOAT, &packmd ); + bli_func_set_dt( packmd_struc_cxk[BLIS_DOUBLE ][BLIS_DOUBLE ], BLIS_DOUBLE, &packmd ); + bli_func_set_dt( packmd_struc_cxk[BLIS_SCOMPLEX][BLIS_SCOMPLEX], BLIS_SCOMPLEX, &packmd ); + bli_func_set_dt( packmd_struc_cxk[BLIS_DCOMPLEX][BLIS_DCOMPLEX], BLIS_DCOMPLEX, &packmd ); + + bli_gemm_cntl_set_packb_ukr_simple( &packmd, &cntl ); + bli_gemm_cntl_set_packb_params( &params, &cntl ); + + // Invoke the internal back-end via the thread handler. + bli_l3_thread_decorator + ( + &a_local, + &b_local, + &c_local, + cntx, + ( cntl_t* )&cntl, + rntm + ); +} + + void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) ( const obj_t* alpha, @@ -225,6 +306,84 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) } +void PASTEMAC(gemdmt,BLIS_OAPI_EX_SUF) + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + bli_init_once(); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_gemdmt_check( alpha, a, d, b, beta, c, cntx ); + + // Check for zero dimensions, alpha == 0, or other conditions which + // mean that we don't actually have to perform a full l3 operation. + if ( bli_l3_return_early_if_trivial( alpha, a, b, beta, c ) == BLIS_SUCCESS ) + return; + + // Default to using native execution. + ind_t im = BLIS_NAT; + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. 
+ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + // Alias A, B, and C in case we need to apply transformations. + obj_t a_local; + obj_t b_local; + obj_t c_local; + bli_obj_alias_submatrix( a, &a_local ); + bli_obj_alias_submatrix( b, &b_local ); + bli_obj_alias_submatrix( c, &c_local ); + + gemm_cntl_t cntl; + bli_gemm_cntl_init + ( + im, + BLIS_GEMMT, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + &cntl + ); + + gemmd_params params; + params.d = bli_obj_buffer_at_off( d ); + params.incd = bli_obj_vector_inc( d ); + + func_t packmd; + bli_func_set_dt( packmd_struc_cxk[BLIS_FLOAT ][BLIS_FLOAT ], BLIS_FLOAT, &packmd ); + bli_func_set_dt( packmd_struc_cxk[BLIS_DOUBLE ][BLIS_DOUBLE ], BLIS_DOUBLE, &packmd ); + bli_func_set_dt( packmd_struc_cxk[BLIS_SCOMPLEX][BLIS_SCOMPLEX], BLIS_SCOMPLEX, &packmd ); + bli_func_set_dt( packmd_struc_cxk[BLIS_DCOMPLEX][BLIS_DCOMPLEX], BLIS_DCOMPLEX, &packmd ); + + bli_gemm_cntl_set_packb_ukr_simple( &packmd, &cntl ); + bli_gemm_cntl_set_packb_params( &params, &cntl ); + + // Invoke the internal back-end via the thread handler. + bli_l3_thread_decorator + ( + &a_local, + &b_local, + &c_local, + cntx, + ( cntl_t* )&cntl, + rntm + ); +} + + void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) ( const obj_t* alpha, @@ -345,6 +504,132 @@ void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) } +void PASTEMAC(her2kd,BLIS_OAPI_EX_SUF) + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + bli_init_once(); + + // Check parameters. 
+ if ( bli_error_checking_is_enabled() ) + bli_her2kd_check( alpha, a, d, b, beta, c, cntx ); + + obj_t alphah; + obj_t ah; + obj_t dh; + obj_t bh; + bli_obj_alias_with_conj( BLIS_CONJUGATE, alpha, &alphah ); + bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah ); + bli_obj_alias_with_conj( BLIS_CONJUGATE, d, &dh ); + bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, b, &bh ); + + // Invoke gemdmt twice, using beta only the first time. + PASTEMAC(gemdmt,BLIS_OAPI_EX_SUF)( alpha, a, d, &bh, beta, c, cntx, rntm ); + PASTEMAC(gemdmt,BLIS_OAPI_EX_SUF)( &alphah, b, &dh, &ah, &BLIS_ONE, c, cntx, rntm ); + + // The Hermitian rank-2k product was computed as alpha*A*B'+alpha'*B*A', even for + // the diagonal elements. Mathematically, the imaginary components of + // diagonal elements of a Hermitian rank-2k product should always be + // zero. However, in practice, they sometimes accumulate meaningless + // non-zero values. To prevent this, we explicitly set those values + // to zero before returning. + bli_setid( &BLIS_ZERO, c ); +} + + +void PASTEMAC(syr2kd,BLIS_OAPI_EX_SUF) + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + bli_init_once(); + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_syr2kd_check( alpha, a, d, b, beta, c, cntx ); + + obj_t at; + obj_t bt; + bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at ); + bli_obj_alias_with_trans( BLIS_TRANSPOSE, b, &bt ); + + // Invoke gemdmt twice, using beta only the first time. + PASTEMAC(gemdmt,BLIS_OAPI_EX_SUF)( alpha, a, d, &bt, beta, c, cntx, rntm ); + PASTEMAC(gemdmt,BLIS_OAPI_EX_SUF)( alpha, b, d, &at, &BLIS_ONE, c, cntx, rntm ); +} + + +void PASTEMAC(herkd,BLIS_OAPI_EX_SUF) + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + bli_init_once(); + + // Check parameters. 
+ if ( bli_error_checking_is_enabled() ) + bli_herkd_check( alpha, a, d, beta, c, cntx ); + + obj_t ah; + bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah ); + + PASTEMAC(gemdmt,BLIS_OAPI_EX_SUF)( alpha, a, d, &ah, beta, c, cntx, rntm ); + + // The Hermitian rank-k product was computed as Re(alpha)*A*A', even for the + // diagonal elements. Mathematically, the imaginary components of + // diagonal elements of a Hermitian rank-k product should always be + // zero. However, in practice, they sometimes accumulate meaningless + // non-zero values. To prevent this, we explicitly set those values + // to zero before returning. + bli_setid( &BLIS_ZERO, c ); +} + + +void PASTEMAC(syrkd,BLIS_OAPI_EX_SUF) + ( + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + bli_init_once(); + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_syrkd_check( alpha, a, d, beta, c, cntx ); + + obj_t at; + bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at ); + + PASTEMAC(gemdmt,BLIS_OAPI_EX_SUF)( alpha, a, d, &at, beta, c, cntx, rntm ); +} + + void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) ( side_t side, diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h index 09f7f4a5d..d97aa293e 100644 --- a/frame/3/bli_l3_oapi_ex.h +++ b/frame/3/bli_l3_oapi_ex.h @@ -61,6 +61,27 @@ GENPROT( gemm_def ) #endif +#undef GENPROT +#define GENPROT( opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ + ( \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* d, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ + ); + +GENPROT( gemdm ) +GENPROT( gemdmt ) +GENPROT( her2kd ) +GENPROT( syr2kd ) + + #undef GENPROT #define GENPROT( opname ) \ \ @@ -98,6 +119,24 @@ GENPROT( herk ) GENPROT( syrk ) +#undef GENPROT +#define GENPROT( opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ + ( \ + const obj_t* 
alpha, \ + const obj_t* a, \ + const obj_t* d, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ + ); + +GENPROT( herkd ) +GENPROT( syrkd ) + + #undef GENPROT #define GENPROT( opname ) \ \ diff --git a/frame/3/bli_l3_tapi.c b/frame/3/bli_l3_tapi.c index 164f4401e..a27c4d115 100644 --- a/frame/3/bli_l3_tapi.c +++ b/frame/3/bli_l3_tapi.c @@ -74,6 +74,44 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC( gemm ) +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* d, inc_t inc_d, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + /* Invoke the expert interface and request default cntx_t and rntm_t + objects. */ \ + PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \ + ( \ + transa, \ + transb, \ + m, n, k, \ + alpha, \ + a, rs_a, cs_a, \ + d, inc_d, \ + b, rs_b, cs_b, \ + beta, \ + c, rs_c, cs_c, \ + NULL, \ + NULL \ + ); \ +} + +INSERT_GENTFUNC_BASIC( gemdm ) + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ @@ -113,6 +151,46 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC( gemmt ) +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* d, inc_t inc_d, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + /* Invoke the expert interface and request default cntx_t and rntm_t + objects. 
*/ \
+    PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
+    ( \
+      uploc, \
+      transa, \
+      transb, \
+      m, k, \
+      alpha, \
+      a, rs_a, cs_a, \
+      d, inc_d, \
+      b, rs_b, cs_b, \
+      beta, \
+      c, rs_c, cs_c, \
+      NULL, \
+      NULL \
+    ); \
+}
+
+INSERT_GENTFUNC_BASIC( gemdmt )
+
+
 #undef GENTFUNC
 #define GENTFUNC( ctype, ch, opname, struca ) \
 \
@@ -298,6 +376,158 @@ void PASTEMAC(ch,opname) \
 INSERT_GENTFUNC_BASIC( syr2k )
 
+#undef GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
+/* herkd, basic typed API: alpha, d, and beta are ctype_r; a and c are ctype. */ \
+void PASTEMAC(ch,opname) \
+     ( \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype_r* alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype_r* d, inc_t inc_d, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c \
+     ) \
+{ \
+    /* Invoke the expert interface and request default cntx_t and rntm_t
+       objects. */ \
+    PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
+    ( \
+      uploc, \
+      transa, \
+      m, k, \
+      alpha, \
+      a, rs_a, cs_a, \
+      d, inc_d, \
+      beta, \
+      c, rs_c, cs_c, \
+      NULL, \
+      NULL \
+    ); \
+}
+
+INSERT_GENTFUNCR_BASIC( herkd )
+
+// her2kd, basic typed API: only beta is ctype_r; alpha, a, d, and b are ctype.
+#undef GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
+\
+void PASTEMAC(ch,opname) \
+     ( \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             trans_t  transb, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype*   alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype*   d, inc_t inc_d, \
+       const ctype*   b, inc_t rs_b, inc_t cs_b, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c \
+     ) \
+{ \
+    /* Invoke the expert interface and request default cntx_t and rntm_t
+       objects. */ \
+    PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
+    ( \
+      uploc, \
+      transa, \
+      transb, \
+      m, k, \
+      alpha, \
+      a, rs_a, cs_a, \
+      d, inc_d, \
+      b, rs_b, cs_b, \
+      beta, \
+      c, rs_c, cs_c, \
+      NULL, \
+      NULL \
+    ); \
+}
+
+INSERT_GENTFUNCR_BASIC( her2kd )
+
+// syrkd, basic typed API: herkd's argument list with alpha, d, and beta typed as ctype.
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname) \
+     ( \
+             uplo_t  uploc, \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c \
+     ) \
+{ \
+    /* Invoke the expert interface and request default cntx_t and rntm_t
+       objects. */ \
+    PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
+    ( \
+      uploc, \
+      transa, \
+      m, k, \
+      alpha, \
+      a, rs_a, cs_a, \
+      d, inc_d, \
+      beta, \
+      c, rs_c, cs_c, \
+      NULL, \
+      NULL \
+    ); \
+}
+
+INSERT_GENTFUNC_BASIC( syrkd )
+
+// syr2kd, basic typed API: her2kd's argument list with beta typed as ctype.
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname) \
+     ( \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c \
+     ) \
+{ \
+    /* Invoke the expert interface and request default cntx_t and rntm_t
+       objects. */ \
+    PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
+    ( \
+      uploc, \
+      transa, \
+      transb, \
+      m, k, \
+      alpha, \
+      a, rs_a, cs_a, \
+      d, inc_d, \
+      b, rs_b, cs_b, \
+      beta, \
+      c, rs_c, cs_c, \
+      NULL, \
+      NULL \
+    ); \
+}
+
+INSERT_GENTFUNC_BASIC( syr2kd )
+
+
 #undef GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
diff --git a/frame/3/bli_l3_tapi.h b/frame/3/bli_l3_tapi.h
index 81ddf6945..e0f4219d0 100644
--- a/frame/3/bli_l3_tapi.h
+++ b/frame/3/bli_l3_tapi.h
@@ -57,6 +57,26 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
 INSERT_GENTPROT_BASIC( gemm )
 
+#undef GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* gemdm, basic typed API prototype: the vector d (stride inc_d) sits between a and b. */ \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
+     ( \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c \
+     );
+
+INSERT_GENTPROT_BASIC( gemdm )
+
 #undef GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
@@ -156,6 +176,87 @@ INSERT_GENTPROT_BASIC( gemmt )
 INSERT_GENTPROT_BASIC( syr2k )
 
+#undef GENTPROTR
+#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
+/* herkd, basic typed API prototype: alpha, d, and beta are ctype_r. */ \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
+     ( \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype_r* alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype_r* d, inc_t inc_d, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c \
+     );
+
+INSERT_GENTPROTR_BASIC( herkd )
+
+// her2kd, basic typed API prototype: only beta is ctype_r.
+#undef GENTPROTR
+#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
+\
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
+     ( \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             trans_t  transb, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype*   alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype*   d, inc_t inc_d, \
+       const ctype*   b, inc_t rs_b, inc_t cs_b, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c \
+     );
+
+INSERT_GENTPROTR_BASIC( her2kd )
+
+// syrkd, basic typed API prototype: alpha, d, and beta are all ctype.
+#undef GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
+     ( \
+             uplo_t  uploc, \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c \
+     );
+
+INSERT_GENTPROT_BASIC( syrkd )
+
+// One prototype shape is instantiated for both gemdmt and syr2kd below.
+#undef GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
+     ( \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c \
+     );
+
+INSERT_GENTPROT_BASIC( gemdmt )
+INSERT_GENTPROT_BASIC( syr2kd )
+
+
 #undef GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c
index 04560c1ca..7375e8962 100644
--- a/frame/3/bli_l3_tapi_ex.c
+++ b/frame/3/bli_l3_tapi_ex.c
@@ -98,6 +98,69 @@ void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
 INSERT_GENTFUNC_BASIC( gemm )
 
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+/* gemdm, typed expert API: wraps each buffer in an obj_t, then calls PASTEMAC(opname,BLIS_OAPI_EX_SUF). */ \
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
+     ( \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+       const rntm_t* rntm \
+     ) \
+{ \
+    bli_init_once(); \
+\
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
+    obj_t ao = BLIS_OBJECT_INITIALIZER; \
+    obj_t dobj = BLIS_OBJECT_INITIALIZER; \
+    obj_t bo = BLIS_OBJECT_INITIALIZER; \
+    obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
+    obj_t co = BLIS_OBJECT_INITIALIZER; \
+\
+    dim_t m_a, n_a; \
+    dim_t m_b, n_b; \
+    /* a is shaped from (m, k) under transa and b from (k, n) under transb. */ \
+
bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ + bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \ +\ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ +\ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, k, 1, ( void* )d, inc_d, 1, &dobj ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_conjtrans( transa, &ao ); \ + bli_obj_set_conjtrans( transb, &bo ); \ +\ + PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ + ( \ + &alphao, \ + &ao, \ + &dobj, \ + &bo, \ + &betao, \ + &co, \ + cntx, \ + rntm \ + ); \ +} + +INSERT_GENTFUNC_BASIC( gemdm ) + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, struca ) \ \ @@ -463,6 +526,325 @@ void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \ INSERT_GENTFUNC_BASIC( gemmt ) +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ +\ +void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \ + ( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype_r* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype_r* d, inc_t inc_d, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ + ) \ +{ \ + bli_init_once(); \ +\ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ + obj_t ao = BLIS_OBJECT_INITIALIZER; \ + obj_t dobj = BLIS_OBJECT_INITIALIZER; \ + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ + obj_t co = BLIS_OBJECT_INITIALIZER; \ +\ + dim_t m_a, n_a; \ +\ + bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ +\ + bli_obj_init_finish_1x1( dt_r, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt_r, ( void* )beta, &betao ); \ +\ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, k, 1, ( void* )d, 
inc_d, 1, &dobj ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, &co ); \ + bli_obj_set_conjtrans( transa, &ao ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, &co ); \ +\ + PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ + ( \ + &alphao, \ + &ao, \ + &dobj, \ + &betao, \ + &co, \ + cntx, \ + rntm \ + ); \ +} + +INSERT_GENTFUNCR_BASIC( herkd ) + + +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ +\ +void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \ + ( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* d, inc_t inc_d, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ + ) \ +{ \ + bli_init_once(); \ +\ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ + obj_t ao = BLIS_OBJECT_INITIALIZER; \ + obj_t dobj = BLIS_OBJECT_INITIALIZER; \ + obj_t bo = BLIS_OBJECT_INITIALIZER; \ + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ + obj_t co = BLIS_OBJECT_INITIALIZER; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ + bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \ +\ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt_r, ( void* )beta, &betao ); \ +\ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, k, 1, ( void* )d, inc_d, 1, &dobj ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, &co ); \ + bli_obj_set_conjtrans( transa, &ao ); \ + bli_obj_set_conjtrans( transb, &bo ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, &co ); \ +\ + PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ + ( \ + 
&alphao, \
+      &ao, \
+      &dobj, \
+      &bo, \
+      &betao, \
+      &co, \
+      cntx, \
+      rntm \
+    ); \
+}
+
+INSERT_GENTFUNCR_BASIC( her2kd )
+
+// syrkd, typed expert API: c is an m x m object with uplo uploc and BLIS_SYMMETRIC structure.
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
+     ( \
+             uplo_t  uploc, \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+       const rntm_t* rntm \
+     ) \
+{ \
+    bli_init_once(); \
+\
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
+    obj_t ao = BLIS_OBJECT_INITIALIZER; \
+    obj_t dobj = BLIS_OBJECT_INITIALIZER; \
+    obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
+    obj_t co = BLIS_OBJECT_INITIALIZER; \
+\
+    dim_t m_a, n_a; \
+\
+    bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
+\
+    bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \
+    bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \
+    /* d is wrapped as a k x 1 object whose row stride is inc_d. */ \
+    bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \
+    bli_obj_init_finish( dt, k, 1, ( void* )d, inc_d, 1, &dobj ); \
+    bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \
+\
+    bli_obj_set_uplo( uploc, &co ); \
+    bli_obj_set_conjtrans( transa, &ao ); \
+\
+    bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \
+\
+    PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
+    ( \
+      &alphao, \
+      &ao, \
+      &dobj, \
+      &betao, \
+      &co, \
+      cntx, \
+      rntm \
+    ); \
+}
+
+INSERT_GENTFUNC_BASIC( syrkd )
+
+// syr2kd, typed expert API: as syrkd, plus a second operand b shaped from (m, k) under transb.
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
+     ( \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+       const rntm_t* rntm \
+     ) \
+{ \
+    bli_init_once(); \
+\
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
+    obj_t ao = BLIS_OBJECT_INITIALIZER; \
+    obj_t dobj = BLIS_OBJECT_INITIALIZER; \
+    obj_t bo = BLIS_OBJECT_INITIALIZER; \
+    obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
+    obj_t co = BLIS_OBJECT_INITIALIZER; \
+\
+    dim_t m_a, n_a; \
+    dim_t m_b, n_b; \
+\
+    bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
+    bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \
+\
+    bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \
+    bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \
+\
+    bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \
+    bli_obj_init_finish( dt, k, 1, ( void* )d, inc_d, 1, &dobj ); \
+    bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \
+    bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \
+\
+    bli_obj_set_uplo( uploc, &co ); \
+    bli_obj_set_conjtrans( transa, &ao ); \
+    bli_obj_set_conjtrans( transb, &bo ); \
+\
+    bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \
+\
+    PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
+    ( \
+      &alphao, \
+      &ao, \
+      &dobj, \
+      &bo, \
+      &betao, \
+      &co, \
+      cntx, \
+      rntm \
+    ); \
+}
+
+INSERT_GENTFUNC_BASIC( syr2kd )
+
+// gemdmt, typed expert API: b is shaped from (k, m) under transb; c is m x m with uplo set and no structure set.
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname,BLIS_OAPI_EX_SUF) \
+     ( \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+       const rntm_t* rntm \
+     ) \
+{ \
+    bli_init_once(); \
+\
+    const num_t dt = PASTEMAC(ch,type); \
+\
+    obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
+    obj_t ao = BLIS_OBJECT_INITIALIZER; \
+    obj_t dobj = BLIS_OBJECT_INITIALIZER; \
+    obj_t bo = BLIS_OBJECT_INITIALIZER; \
+    obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
+    obj_t co = BLIS_OBJECT_INITIALIZER; \
+\
+    dim_t m_a, n_a; \
+    dim_t m_b, n_b; \
+\
+    bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
+    bli_set_dims_with_trans( transb, k, m, &m_b, &n_b ); \
+\
+    bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \
+    bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \
+\
+    bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \
+    bli_obj_init_finish( dt, k, 1, ( void* )d, inc_d, 1, &dobj ); \
+    bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \
+    bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \
+\
+    bli_obj_set_uplo( uploc, &co ); \
+    bli_obj_set_conjtrans( transa, &ao ); \
+    bli_obj_set_conjtrans( transb, &bo ); \
+\
+    PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
+    ( \
+      &alphao, \
+      &ao, \
+      &dobj, \
+      &bo, \
+      &betao, \
+      &co, \
+      cntx, \
+      rntm \
+    ); \
+}
+
+INSERT_GENTFUNC_BASIC( gemdmt )
+
+
 #undef GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h
index 872cecfa7..e6aae75ac 100644
--- a/frame/3/bli_l3_tapi_ex.h
+++ b/frame/3/bli_l3_tapi_ex.h
@@ -59,6 +59,28 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
 INSERT_GENTPROT_BASIC( gemm )
 
+#undef GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+/* gemdm, typed expert API prototype: ends with the cntx and rntm arguments. */ \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
+     ( \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   n, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+       const rntm_t* rntm \
+     );
+
+INSERT_GENTPROT_BASIC( gemdm )
+
 #undef GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
@@ -168,6 +190,95 @@ INSERT_GENTPROT_BASIC( gemmt )
 INSERT_GENTPROT_BASIC( syr2k )
 
+#undef GENTPROTR
+#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
+/* herkd, typed expert API prototype: alpha, d, and beta are ctype_r. */ \
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
+     ( \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             dim_t    m, \
+             dim_t
k, \
+       const ctype_r* alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype_r* d, inc_t inc_d, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t*  cntx, \
+       const rntm_t*  rntm \
+     );
+
+INSERT_GENTPROTR_BASIC( herkd )
+
+// her2kd, typed expert API prototype: only beta is ctype_r.
+#undef GENTPROTR
+#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
+\
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
+     ( \
+             uplo_t   uploc, \
+             trans_t  transa, \
+             trans_t  transb, \
+             dim_t    m, \
+             dim_t    k, \
+       const ctype*   alpha, \
+       const ctype*   a, inc_t rs_a, inc_t cs_a, \
+       const ctype*   d, inc_t inc_d, \
+       const ctype*   b, inc_t rs_b, inc_t cs_b, \
+       const ctype_r* beta, \
+             ctype*   c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t*  cntx, \
+       const rntm_t*  rntm \
+     );
+
+INSERT_GENTPROTR_BASIC( her2kd )
+
+// syrkd, typed expert API prototype: alpha, d, and beta are all ctype.
+#undef GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
+     ( \
+             uplo_t  uploc, \
+             trans_t transa, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+       const rntm_t* rntm \
+     );
+
+INSERT_GENTPROT_BASIC( syrkd )
+
+// One expert prototype shape is instantiated for both gemdmt and syr2kd below.
+#undef GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+BLIS_EXPORT_BLIS void PASTEMAC(ch,opname,BLIS_TAPI_EX_SUF) \
+     ( \
+             uplo_t  uploc, \
+             trans_t transa, \
+             trans_t transb, \
+             dim_t   m, \
+             dim_t   k, \
+       const ctype*  alpha, \
+       const ctype*  a, inc_t rs_a, inc_t cs_a, \
+       const ctype*  d, inc_t inc_d, \
+       const ctype*  b, inc_t rs_b, inc_t cs_b, \
+       const ctype*  beta, \
+             ctype*  c, inc_t rs_c, inc_t cs_c, \
+       const cntx_t* cntx, \
+       const rntm_t* rntm \
+     );
+
+INSERT_GENTPROT_BASIC( gemdmt )
+INSERT_GENTPROT_BASIC( syr2kd )
+
+
 #undef GENTPROT
 #define GENTPROT( ctype, ch, opname ) \
 \
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 5bc96e8f2..d133cbb75 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -682,6 +682,7 @@ typedef enum
 
     // pack kernels
     BLIS_PACKM_KER = BLIS_2TYPE_KER,
+    BLIS_PACKMD_KER, // packm variant whose kernel also takes a vector d (see packmd_cxk_params)
     BLIS_PACKM_1ER_KER,
     BLIS_PACKM_RO_KER,
     BLIS_PACKM_DIAG_KER,
diff --git a/ref_kernels/1m/bli_packmd_cxk_ref.c b/ref_kernels/1m/bli_packmd_cxk_ref.c
new file mode 100644
index 000000000..fd2084aee
--- /dev/null
+++ b/ref_kernels/1m/bli_packmd_cxk_ref.c
@@ -0,0 +1,137 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +#define PACKM_BODY( ctypea, ctypep, cha, chp, pragma, cdim, dfac, inca, op ) \ +\ +do \ +{ \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + ctypep delta_cast, kappa_delta; \ + PASTEMAC(cha,chp,copys)( *delta1, delta_cast ); \ + PASTEMAC(chp,scal2s)( kappa_cast, delta_cast, kappa_delta ); \ +\ + pragma \ + for ( dim_t mn = 0; mn < cdim; mn++ ) \ + { \ + ctypep alpha_cast, kappa_alpha; \ + PASTEMAC(cha,chp,copys)( *(alpha1 + mn*inca), alpha_cast ); \ + PASTEMAC(chp,op)( kappa_delta, alpha_cast, kappa_alpha ); \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(chp,copys)( kappa_alpha, *(pi1 + mn*dfac + d) ); \ + } \ +\ + alpha1 += lda; \ + delta1 += incd; \ + pi1 += ldp; \ + } \ +} while(0) + + +#undef GENTFUNC2 +#define GENTFUNC2( ctypea, ctypep, cha, chp, opname, arch, suf ) \ +\ +void PASTEMAC(cha,chp,opname,arch,suf) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ + dim_t cdim_max, \ + dim_t cdim_bcast, \ + dim_t n, \ + dim_t n_max, \ + const void* kappa, \ + const void* a, inc_t inca, inc_t lda, \ + const void* d, inc_t incd, \ + void* p, inc_t ldp, \ + const void* params, \ + const cntx_t* cntx \ + ) \ +{ \ + const dim_t mr = PASTECH(BLIS_MR_, chp); \ + const dim_t nr = PASTECH(BLIS_NR_, chp); \ + const dim_t bbm = PASTECH(BLIS_BBM_, chp); \ + const dim_t bbn = PASTECH(BLIS_BBN_, chp); \ +\ + ctypep kappa_cast = *( ctypep* )kappa; \ + const ctypea* restrict alpha1 = a; \ + const ctypea* restrict 
delta1 = d; \ + ctypep* restrict pi1 = p; \ +\ + if ( cdim == mr && cdim_bcast == bbm && mr != -1 ) \ + { \ + if ( inca == 1 ) \ + { \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, mr, bbm, 1, scal2js ); \ + else PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, mr, bbm, 1, scal2s ); \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, mr, bbm, inca, scal2js ); \ + else PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, mr, bbm, inca, scal2s ); \ + } \ + } \ + else if ( cdim == nr && cdim_bcast == bbn && nr != -1 ) \ + { \ + if ( inca == 1 ) \ + { \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, nr, bbn, 1, scal2js ); \ + else PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, nr, bbn, 1, scal2s ); \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, nr, bbn, inca, scal2js ); \ + else PACKM_BODY( ctypea, ctypep, cha, chp, PRAGMA_SIMD, nr, bbn, inca, scal2s ); \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctypea, ctypep, cha, chp, , cdim, cdim_bcast, inca, scal2js ); \ + else PACKM_BODY( ctypea, ctypep, cha, chp, , cdim, cdim_bcast, inca, scal2s ); \ + } \ +\ + PASTEMAC(chp,set0s_edge) \ + ( \ + cdim*cdim_bcast, cdim_max*cdim_bcast, \ + n, n_max, \ + p, ldp \ + ); \ +} + +INSERT_GENTFUNC2_BASIC( packmd, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC2_MIX_P( packmd, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) + diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 9780f9162..57736d034 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -183,18 +183,20 @@ INSERT_PROTMAC_BASIC( GEMMSUP_KER_PROT, gemmsup_gx_ukr_name ) // -- Construct arch-specific names for reference packm kernels -- -#define packm_ker_name GENARNAME(packm) -#define packm_1er_ker_name GENARNAME(packm_1er) +#define packm_ker_name GENARNAME(packm) +#define 
packmd_ker_name GENARNAME(packmd)
+#define packm_1er_ker_name GENARNAME(packm_1er)
 #define packm_ro_ker_name GENARNAME(packm_ro)
-#define packm_diag_ker_name GENARNAME(packm_diag)
-#define packm_diag_1er_ker_name GENARNAME(packm_diag_1er)
+#define packm_diag_ker_name GENARNAME(packm_diag)
+#define packm_diag_1er_ker_name GENARNAME(packm_diag_1er)
 #define packm_diag_ro_ker_name GENARNAME(packm_diag_ro)
-#define unpackm_ker_name GENARNAME(unpackm)
+#define unpackm_ker_name GENARNAME(unpackm)
 
 // Instantiate prototypes for above functions using the pre-defined packm
 // kernel prototype-generating macros.
 INSERT_PROTMAC_MIX_P ( PACKM_KER_PROT2, packm_ker_name )
+INSERT_PROTMAC_MIX_P ( PACKMD_KER_PROT2, packmd_ker_name )
 INSERT_PROTMAC_MIX_CO( PACKM_KER_PROT2, packm_1er_ker_name )
 INSERT_PROTMAC_MIX_CO( PACKM_KER_PROT2, packm_ro_ker_name )
 INSERT_PROTMAC_MIX_P ( PACKM_DIAG_KER_PROT2, packm_diag_ker_name )
@@ -516,6 +518,7 @@ void GENBARNAME(cntx_init)
     // -- Set level-1m (packm/unpackm) kernels ---------------------------------
 
     gen_func_init_mix_p ( &func2s[ bli_ker_idx( BLIS_PACKM_KER ) ], packm_ker_name );
+    gen_func_init_mix_p ( &func2s[ bli_ker_idx( BLIS_PACKMD_KER ) ], packmd_ker_name ); // packmd uses the same mix_p initializer as packm
     gen_func_init_mix_co( &func2s[ bli_ker_idx( BLIS_PACKM_1ER_KER ) ], packm_1er_ker_name );
     gen_func_init_mix_co( &func2s[ bli_ker_idx( BLIS_PACKM_RO_KER ) ], packm_ro_ker_name );
     gen_func_init_mix_p ( &func2s[ bli_ker_idx( BLIS_PACKM_DIAG_KER ) ], packm_diag_ker_name );