diff --git a/main/acle.md b/main/acle.md index 68d2470d..b8b6916e 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9544,11 +9544,17 @@ Extract vector segment from each pair of quadword segments. Contiguous zero-extending load to quadword (single vector). ``` c - svuint32_t svld1uw_u128[_u32](svbool_t, const uint32_t *ptr); - svuint32_t svld1uw_u128[_u32](svbool_t, const uint32_t *ptr, int64_t vnum); + // Variants are also available for: + // _s8, _u16, _s16, _u32, _s32 + // _bf16, _f16, _f32 + svuint8_t svld1quw[_u8](svbool_t, const uint8_t *ptr); + svuint8_t svld1quw_vnum[_u8](svbool_t, const uint8_t *ptr, int64_t vnum); - svuint64_t svld1ud_u128[_u64](svbool_t, const uint64_t *ptr); - svuint64_t svld1ud_u128[_u64](svbool_t, const uint64_t *ptr, int64_t vnum); + // Variants are also available for: + // _s8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svuint8_t svld1qud[_u8](svbool_t, const uint8_t *ptr); + svuint8_t svld1qud_vnum[_u8](svbool_t, const uint8_t *ptr, int64_t vnum); ``` #### LD1B, LD1D, LD1H, LD1W @@ -9599,10 -9605,17 @@ Gather Load Quadword. 
``` c // Variants are also available for: - // _u64base_u8, _u64base_u16, _u64base_s16, _u64base_u32, _u64base_s32, - // _u64base _u64, _u64base_s64 - // _u64base_bf16, _u64base_f16, _u64base_f32, _u64base_f64 - svint8_t svld1q_gather[_u64base_s8](svbool_t pg, svint64_t zn, const void *rm); + // _u8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svint8_t svld1q_gather[_u64base]_s8(svbool_t pg, svuint64_t zn); + svint8_t svld1q_gather[_u64base]_offset_s8(svbool_t pg, svuint64_t zn, int64_t offset); + svint8_t svld1q_gather[_u64base]_index_s8(svbool_t pg, svuint64_t zn, int64_t index); + svint8_t svld1q_gather_[u64]offset[_s8](svbool_t pg, const int8_t *base, svuint64_t offset); + + // Variants are also available for: + // _u16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svint16_t svld1q_gather_[u64]index[_s16](svbool_t pg, const int16_t *base, svuint64_t index); ``` #### LD2Q, LD3Q, LD4Q @@ -9670,14 +9683,9 @@ Max/Min reduction of quadword vector segments. ``` c // Variants are also available for: // _s8, _u16, _s16, _u32, _s32, _u64, _s64 - // _bf16, _f16, _f32, _f64 - uint8x16_t svmaxqv[_u8]_m(svbool_t pg, svuint8_t zn); - uint8x16_t svmaxqv[_u8]_z(svbool_t pg, svuint8_t zn); - uint8x16_t svmaxqv[_u8]_x(svbool_t pg, svuint8_t zn); - - uint8x16_t svminqv[_u8]_m(svbool_t pg, svuint8_t zn); - uint8x16_t svminqv[_u8]_z(svbool_t pg, svuint8_t zn); - uint8x16_t svminqv[_u8]_x(svbool_t pg, svuint8_t zn); + // _f16, _f32, _f64 + uint8x16_t svmaxqv[_u8](svbool_t pg, svuint8_t zn); + uint8x16_t svminqv[_u8](svbool_t pg, svuint8_t zn); ``` #### FMAXNMQV, FMINNMQV @@ -9686,13 +9694,8 @@ Max/Min recursive reduction of quadword vector segments. 
``` c // Variants are also available for _f32, _f64 - float16x8_t svmaxnmqv[_f16]_m(svbool_t pg, svfloat16_t zn); - float16x8_t svmaxnmqv[_f16]_z(svbool_t pg, svfloat16_t zn); - float16x8_t svmaxnmqv[_f16]_x(svbool_t pg, svfloat16_t zn); - - float16x8_t svminnmqv[_f16]_m(svbool_t pg, svfloat16_t zn); - float16x8_t svminnmqv[_f16]_z(svbool_t pg, svfloat16_t zn); - float16x8_t svminnmqv[_f16]_x(svbool_t pg, svfloat16_t zn); + float16x8_t svmaxnmqv[_f16](svbool_t pg, svfloat16_t zn); + float16x8_t svminnmqv[_f16](svbool_t pg, svfloat16_t zn); ``` #### BFMLSLB, BFMLSLT @@ -9772,6 +9775,7 @@ Reverse doublewords in elements. // All the intrinsics below are [SVE2.1 or SME] // Variants are available for: // _s8, _s16, _u16, _s32, _u32, _s64, _u64 + // _bf16, _f16, _f32, _f64 svuint8_t svrevd[_u8]_m(svuint8_t zd, svbool_t pg, svuint8_t zn); svuint8_t svrevd[_u8]_z(svbool_t pg, svuint8_t zn); svuint8_t svrevd[_u8]_x(svbool_t pg, svuint8_t zn); @@ -9803,23 +9807,21 @@ Contiguous store of single vector operand. It is truncating store from quadword. 
``` c // Variants are also available for: - // s128_vnum[_s32] - void svst1w_u128_vnum[_u32](svbool_t pg, uint32_t *ptr, uint64_t vnum, - svuint32_t zt); + // _s8, _u16, _s16, _u32, _s32 + // _bf16, _f16, _f32 + void svst1quw[_u8](svbool_t pg, uint8_t *rn, svuint8_t zt); + void svst1quw_vnum[_u8](svbool_t pg, uint8_t *ptr, int64_t vnum, + svuint8_t zt); - // Variants are also available for: - // s128[_s32] - void svst1w_u128[_u32](svbool_t pg, uint32_t *rn, svuint32_t zt); // Variants are also available for: - // s128_vnum[_s64] - void svst1d_u128_vnum[_u64](svbool_t pg, uint64_t *ptr, int64_t vnum, - svuint64_t zt); - - // Variants are also available for: - // s128[_s64] - void svst1d_u128[_u64](svbool_t pg, uint64_t *rn, svuint64_t zt); + // _s8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + void svst1qud[_u8](svbool_t pg, uint8_t *rn, svuint8_t zt); + void svst1qud_vnum[_u8](svbool_t pg, uint8_t *ptr, int64_t vnum, + svuint8_t zt); ``` + #### ST1B, ST1D, ST1H, ST1W Contiguous store of multi-vector operand. @@ -9926,27 +9928,17 @@ Scatter store quadwords. ``` c // Variants are also available for: - // _u64base_u8, _u64base_u16, _u64base_s16, _u64base_u32, _u64base_s32, - // _u64base_u64, _u64base_s64 - // _u64base_bf16, _u64base_f16, _u64base_f32, _u64base_f64 - void svst1q_scatter[_u64base_s8](svbool_t pg, svint64_t zn, const void *rm, - svint8_t zt); - ``` - -#### ST2Q, ST3Q, ST4Q - -Contiguous store. 
+ // _u8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + void svst1q_scatter[_u64base]_s8(svbool_t pg, svuint64_t zn, svint8_t data); + void svst1q_scatter[_u64base]_offset_s8(svbool_t pg, svuint64_t zn, int64_t offset, svint8_t data); + void svst1q_scatter[_u64base]_index_s8(svbool_t pg, svuint64_t zn, int64_t index, svint8_t data); + void svst1q_scatter_[u64]offset[_s8](svbool_t pg, int8_t *base, svuint64_t offset, svint8_t data); - ``` c // Variants are also available for: - // _s8 _u16, _s16, _u32, _s32, _u64, _s64 + // _u16, _u32, _s32, _u64, _s64 // _bf16, _f16, _f32, _f64 - void svst2q[_u8](svbool_t pg, uint8_t *rn, svuint8x2_t zt); - void svst2q_vnum[_u8](svbool_t pg, uint8_t *rn, int64_t vnum, svuint8x2_t zt); - void svst3q[_u8](svbool_t pg, uint8_t *rn, svuint8x3_t zt); - void svst3q_vnum[_u8](svbool_t pg, uint8_t *rn, int64_t vnum, svuint8x3_t zt); - void svst4q[_u8](svbool_t pg, uint8_t *rn, svuint8x4_t zt); - void svst4q_vnum[_u8](svbool_t pg, uint8_t *rn, int64_t vnum, svuint8x4_t zt); + void svst1q_scatter_[u64]index[_s16](svbool_t pg, int16_t *base, svuint64_t index, svint16_t data); ``` #### ST2Q, ST3Q, ST4Q @@ -9971,9 +9963,9 @@ Programmable table lookup within each quadword vector segment (zeroing). ``` c // Variants are also available for: - // _s8, _u16, _s16, _u32, _s32, _u64, _s64 + // _u8, _u16, _s16, _u32, _s32, _u64, _s64 // _bf16, _f16, _f32, _f64 - svuint8_t svtblq[_u8](svuint8_t zn, svuint8_t zm); + svint8_t svtblq[_s8](svint8_t zn, svuint8_t zm); ``` #### TBXQ @@ -9982,9 +9974,9 @@ Programmable table lookup within each quadword vector segment (merging). ``` c // Variants are also available for: - // _s8, _u16, _s16, _u32, _s32, _u64, _s64 + // _u8, _u16, _s16, _u32, _s32, _u64, _s64 // _bf16, _f16, _f32, _f64 - svuint8_t svtbxq[_u8](svuint8_t zn, svuint8_t zm); + svint8_t svtbxq[_s8](svint8_t fallback, svint8_t zn, svuint8_t zm); ``` #### UZPQ1, UZPQ2