Merge branch 'main' into cde

ARM-software · Sep 12, 2024 · commit bdb60ae
2 parents: 9ef9e6b + 264f4cd

Showing 9 changed files with 863 additions and 101 deletions.
diff --git a/.all-contributorsrc b/.all-contributorsrc
@@ -324,6 +324,15 @@
       "contributions": [
         "code"
       ]
+    },
+    {
+      "login": "Lukacma",
+      "name": "Lukacma",
+      "avatar_url": "https://avatars.githubusercontent.com/u/46606997?v=4",
+      "profile": "https://github.com/Lukacma",
+      "contributions": [
+        "doc"
+      ]
     }
   ],
   "contributorsPerLine": 7,

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -10,25 +10,25 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4.1.7
     - name: check the correctness of the sources and generate the PDFs
       run: ./build_with_docker.sh
-    - uses: actions/upload-artifact@v2
+    - uses: actions/upload-artifact@v4.4.0
       with:
         name: pdfs
         path: pdfs
 
   build-github-pages:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4.1.7
     - name: generate the GitHub Pages locally in order to check for errors
       run: ./tools/build-github-pages.sh build
 
   markdown-link-check:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@master
+    - uses: actions/checkout@v4.1.7
     - uses: gaurav-nelson/github-action-markdown-link-check@v1
       with:
           config-file: '.github/workflows/markdown-link-check.json'
@@ -37,7 +37,7 @@ jobs:
     runs-on: ubuntu-latest
     if: github.base_ref == 'main' || github.ref == 'refs/heads/main'
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v4.1.7
       with:
         fetch-depth: 0
     - name: Check correctness of draftversion fields

diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 tmp
 pdfs
-tex2pdf*
+tex2pdf*
+.DS_Store
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 </div>
 
 <!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->
-[![All Contributors](https://img.shields.io/badge/all_contributors-34-orange.svg?style=flat-square)](#contributors-)
+[![All Contributors](https://img.shields.io/badge/all_contributors-35-orange.svg?style=flat-square)](#contributors-)
 <!-- ALL-CONTRIBUTORS-BADGE:END -->
 ![Continuous Integration](https://github.com/ARM-software/acle/actions/workflows/ci.yml/badge.svg)
 
@@ -130,6 +130,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/pratlucas"><img src="https://avatars.githubusercontent.com/u/7014318?v=4?s=100" width="100px;" alt="Lucas Duarte Prates"/><br /><sub><b>Lucas Duarte Prates</b></sub></a><br /><a href="https://github.com/ARM-software/acle/commits?author=pratlucas" title="Code">💻</a></td>
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/andrewcarlotti"><img src="https://avatars.githubusercontent.com/u/11681428?v=4?s=100" width="100px;" alt="Andrew Carlotti"/><br /><sub><b>Andrew Carlotti</b></sub></a><br /><a href="https://github.com/ARM-software/acle/pulls?q=is%3Apr+reviewed-by%3Aandrewcarlotti" title="Reviewed Pull Requests">👀</a></td>
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/labrinea"><img src="https://avatars.githubusercontent.com/u/9527365?v=4?s=100" width="100px;" alt="Alexandros Lamprineas"/><br /><sub><b>Alexandros Lamprineas</b></sub></a><br /><a href="https://github.com/ARM-software/acle/commits?author=labrinea" title="Code">💻</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/Lukacma"><img src="https://avatars.githubusercontent.com/u/46606997?v=4?s=100" width="100px;" alt="Lukacma"/><br /><sub><b>Lukacma</b></sub></a><br /><a href="https://github.com/ARM-software/acle/commits?author=Lukacma" title="Documentation">📖</a></td>
     </tr>
   </tbody>
 </table>

diff --git a/main/acle.md b/main/acle.md
diff --git a/neon_intrinsics/advsimd.md b/neon_intrinsics/advsimd.md
diff --git a/neon_intrinsics/advsimd.template.md b/neon_intrinsics/advsimd.template.md
@@ -12,7 +12,7 @@ toc: true
 ---
 
 <!--
-SPDX-FileCopyrightText: Copyright 2014-2023 Arm Limited and/or its affiliates <[email protected]>
+SPDX-FileCopyrightText: Copyright 2014-2024 Arm Limited and/or its affiliates <[email protected]>
 SPDX-FileCopyrightText: Copyright 2021 Matt P. Dziubinski <[email protected]>
 CC-BY-SA-4.0 AND Apache-Patent-License
 See LICENSE.md file for details
@@ -107,7 +107,7 @@ for more information about Arm’s trademarks.
 
 ## Copyright
 
-* Copyright 2014-2023 Arm Limited and/or its affiliates <[email protected]>
+* Copyright 2014-2024 Arm Limited and/or its affiliates <[email protected]>
 * Copyright 2021 Matt P. Dziubinski <[email protected]>
 
 ## Document history
@@ -149,6 +149,7 @@ for more information about Arm’s trademarks.
 ### Changes for next release
 
 * Textual improvements (non-functional changes).
+* Fixed the range of the ``lane`` immediate argument for ``vst2q_lane_f64``.
 
 <!---
 **** Do not remove! ****

diff --git a/tools/intrinsic_db/advsimd.csv b/tools/intrinsic_db/advsimd.csv
@@ -1,4 +1,4 @@
-<COMMENT>	SPDX-FileCopyrightText: Copyright 2014-2023 Arm Limited <[email protected]>
+<COMMENT>	SPDX-FileCopyrightText: Copyright 2014-2024 Arm Limited <[email protected]>
 <COMMENT>	SPDX-FileCopyrightText: Copyright 2021 Matt P. Dziubinski <[email protected]>
 <COMMENT>	SPDX-License-Identifier: Apache-2.0
 <COMMENT>	
@@ -2583,7 +2583,7 @@ void vst2q_lane_u64(uint64_t *ptr, uint64x2x2_t val, __builtin_constant_p(lane))
 void vst2_lane_p64(poly64_t *ptr, poly64x1x2_t val, __builtin_constant_p(lane))	val.val[1] -> Vt2.1D;val.val[0] -> Vt.1D;ptr -> Xn;0 <= lane <= 0	ST2 {Vt.d - Vt2.d}[lane],[Xn]		A64
 void vst2q_lane_p64(poly64_t *ptr, poly64x2x2_t val, __builtin_constant_p(lane))	val.val[1] -> Vt2.2D;val.val[0] -> Vt.2D;ptr -> Xn;0 <= lane <= 1	ST2 {Vt.d - Vt2.d}[lane],[Xn]		A64
 void vst2_lane_f64(float64_t *ptr, float64x1x2_t val, __builtin_constant_p(lane))	val.val[1] -> Vt2.1D;val.val[0] -> Vt.1D;ptr -> Xn;0 <= lane <= 0	ST2 {Vt.d - Vt2.d}[lane],[Xn]		A64
-void vst2q_lane_f64(float64_t *ptr, float64x2x2_t val, __builtin_constant_p(lane))	val.val[1] -> Vt2.2D;val.val[0] -> Vt.2D;ptr -> Xn;0 <= lane <= 2	ST2 {Vt.d - Vt2.d}[lane],[Xn]		A64
+void vst2q_lane_f64(float64_t *ptr, float64x2x2_t val, __builtin_constant_p(lane))	val.val[1] -> Vt2.2D;val.val[0] -> Vt.2D;ptr -> Xn;0 <= lane <= 1	ST2 {Vt.d - Vt2.d}[lane],[Xn]		A64
 void vst3_lane_s16(int16_t *ptr, int16x4x3_t val, __builtin_constant_p(lane))	val.val[2] -> Vt3.4H;val.val[1] -> Vt2.4H;val.val[0] -> Vt.4H;ptr -> Xn;0 <= lane <= 3	ST3 {Vt.h - Vt3.h}[lane],[Xn]		v7/A32/A64
 void vst3q_lane_s16(int16_t *ptr, int16x8x3_t val, __builtin_constant_p(lane))	val.val[2] -> Vt3.8H;val.val[1] -> Vt2.8H;val.val[0] -> Vt.8H;ptr -> Xn;0 <= lane <= 7	ST3 {Vt.h - Vt3.h}[lane],[Xn]		v7/A32/A64
 void vst3_lane_s32(int32_t *ptr, int32x2x3_t val, __builtin_constant_p(lane))	val.val[2] -> Vt3.2S;val.val[1] -> Vt2.2S;val.val[0] -> Vt.2S;ptr -> Xn;0 <= lane <= 1	ST3 {Vt.s - Vt3.s}[lane],[Xn]		v7/A32/A64
@@ -3730,6 +3730,83 @@ float64x2_t vreinterpretq_f64_p128(poly128_t a)	a -> Vd.1Q	NOP	Vd.2D -> result	A
 float16x8_t vreinterpretq_f16_p128(poly128_t a)	a -> Vd.1Q	NOP	Vd.8H -> result	A32/A64
 poly128_t vldrq_p128(poly128_t const *ptr)	ptr -> Xn	LDR Qd,[Xn]	Qd -> result	A32/A64
 void vstrq_p128(poly128_t *ptr, poly128_t val)	val -> Qt;ptr -> Xn	STR Qt,[Xn]		A32/A64
+
+uint8x16_t vluti2_lane_u8(uint8x8_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 1	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+uint8x16_t vluti2_laneq_u8(uint8x8_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 3	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+uint8x16_t vluti2q_lane_u8(uint8x16_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 1	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+uint8x16_t vluti2q_laneq_u8(uint8x16_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 3	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+
+int8x16_t vluti2_lane_s8(int8x8_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 1	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+int8x16_t vluti2_laneq_s8(int8x8_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 3	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+int8x16_t vluti2q_lane_s8(int8x16_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 1	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+int8x16_t vluti2q_laneq_s8(int8x16_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 3	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+
+poly8x16_t vluti2_lane_p8(poly8x8_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 1	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+poly8x16_t vluti2_laneq_p8(poly8x8_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 3	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+poly8x16_t vluti2q_lane_p8(poly8x16_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 1	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+poly8x16_t vluti2q_laneq_p8(poly8x16_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 3	LUTI2 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+
+uint16x8_t vluti2_lane_u16(uint16x4_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 3	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+uint16x8_t vluti2_laneq_u16(uint16x4_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 7	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+uint16x8_t vluti2q_lane_u16(uint16x8_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 3	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+uint16x8_t vluti2q_laneq_u16(uint16x8_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 7	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+
+int16x8_t vluti2_lane_s16(int16x4_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 3	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+int16x8_t vluti2_laneq_s16(int16x4_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 7	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+int16x8_t vluti2q_lane_s16(int16x8_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 3	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+int16x8_t vluti2q_laneq_s16(int16x8_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 7	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+
+float16x8_t vluti2_lane_f16(float16x4_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 3	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+float16x8_t vluti2_laneq_f16(float16x4_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 7	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+float16x8_t vluti2q_lane_f16(float16x8_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 3	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+float16x8_t vluti2q_laneq_f16(float16x8_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 7	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+
+bfloat16x8_t vluti2_lane_bf16(bfloat16x4_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 3	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+bfloat16x8_t vluti2_laneq_bf16(bfloat16x4_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 7	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+bfloat16x8_t vluti2q_lane_bf16(bfloat16x8_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 3	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+bfloat16x8_t vluti2q_laneq_bf16(bfloat16x8_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 7	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+
+poly16x8_t vluti2_lane_p16(poly16x4_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 3	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+poly16x8_t vluti2_laneq_p16(poly16x4_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 7	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+poly16x8_t vluti2q_lane_p16(poly16x8_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 3	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+poly16x8_t vluti2q_laneq_p16(poly16x8_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.8H;vm -> Vm;0 <= index <= 7	LUTI2 Vd.8H, {Vn.8H}, Vm[index]	Vd.8H -> result	A64
+
+uint8x16_t vluti4q_lane_u8(uint8x16_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 0	LUTI4 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+uint8x16_t vluti4q_laneq_u8(uint8x16_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 1	LUTI4 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+
+int8x16_t vluti4q_lane_s8(int8x16_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 0	LUTI4 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+int8x16_t vluti4q_laneq_s8(int8x16_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 1	LUTI4 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+
+poly8x16_t vluti4q_lane_p8(poly8x16_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 0	LUTI4 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+poly8x16_t vluti4q_laneq_p8(poly8x16_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn -> Vn.16B;vm -> Vm;0 <= index <= 1	LUTI4 Vd.16B, {Vn.16B}, Vm[index]	Vd.16B -> result	A64
+
+uint16x8_t vluti4q_lane_u16_x2(uint16x8x2_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 1	LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]	Vd.8H -> result	A64
+uint16x8_t vluti4q_laneq_u16_x2(uint16x8x2_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 3	LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]	Vd.8H -> result	A64
+
+int16x8_t vluti4q_lane_s16_x2(int16x8x2_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 1	LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]	Vd.8H -> result	A64
+int16x8_t vluti4q_laneq_s16_x2(int16x8x2_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 3	LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]	Vd.8H -> result	A64
+
+float16x8_t vluti4q_lane_f16_x2(float16x8x2_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 1	LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]	Vd.8H -> result	A64
+float16x8_t vluti4q_laneq_f16_x2(float16x8x2_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 3	LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]	Vd.8H -> result	A64
+
+bfloat16x8_t vluti4q_lane_bf16_x2(bfloat16x8x2_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 1	LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]	Vd.8H -> result	A64
+bfloat16x8_t vluti4q_laneq_bf16_x2(bfloat16x8x2_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 3	LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]	Vd.8H -> result	A64
+
+poly16x8_t vluti4q_lane_p16_x2(poly16x8x2_t vn, uint8x8_t vm, __builtin_constant_p(index))	vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 1	LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]	Vd.8H -> result	A64
+poly16x8_t vluti4q_laneq_p16_x2(poly16x8x2_t vn, uint8x16_t vm, __builtin_constant_p(index))	vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 3	LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]	Vd.8H -> result	A64
+
+float16x4_t vamax_f16(float16x4_t vn, float16x4_t vm)	vn -> Vn.4H;vm -> Vm.4H	FAMAX Vd.4H, Vn.4H, Vm.4H	Vd.4H -> result	A64
+float16x8_t vamaxq_f16(float16x8_t vn, float16x8_t vm)	vn -> Vn.8H;vm -> Vm.8H	FAMAX Vd.8H, Vn.8H, Vm.8H	Vd.8H -> result	A64
+float32x2_t vamax_f32(float32x2_t vn, float32x2_t vm)	vn -> Vn.2S;vm -> Vm.2S	FAMAX Vd.2S, Vn.2S, Vm.2S	Vd.2S -> result	A64
+float32x4_t vamaxq_f32(float32x4_t vn, float32x4_t vm)	vn -> Vn.4S;vm -> Vm.4S	FAMAX Vd.4S, Vn.4S, Vm.4S	Vd.4S -> result	A64
+float64x2_t vamaxq_f64(float64x2_t vn, float64x2_t vm)	vn -> Vn.2D;vm -> Vm.2D	FAMAX Vd.2D, Vn.2D, Vm.2D	Vd.2D -> result	A64
+
+float16x4_t vamin_f16(float16x4_t vn, float16x4_t vm)	vn -> Vn.4H;vm -> Vm.4H	FAMIN Vd.4H, Vn.4H, Vm.4H	Vd.4H -> result	A64
+float16x8_t vaminq_f16(float16x8_t vn, float16x8_t vm)	vn -> Vn.8H;vm -> Vm.8H	FAMIN Vd.8H, Vn.8H, Vm.8H	Vd.8H -> result	A64
+float32x2_t vamin_f32(float32x2_t vn, float32x2_t vm)	vn -> Vn.2S;vm -> Vm.2S	FAMIN Vd.2S, Vn.2S, Vm.2S	Vd.2S -> result	A64
+float32x4_t vaminq_f32(float32x4_t vn, float32x4_t vm)	vn -> Vn.4S;vm -> Vm.4S	FAMIN Vd.4S, Vn.4S, Vm.4S	Vd.4S -> result	A64
+float64x2_t vaminq_f64(float64x2_t vn, float64x2_t vm)	vn -> Vn.2D;vm -> Vm.2D	FAMIN Vd.2D, Vn.2D, Vm.2D	Vd.2D -> result	A64
+
 <SECTION>	Crypto
 uint8x16_t vaeseq_u8(uint8x16_t data, uint8x16_t key)	data -> Vd.16B;key -> Vn.16B	AESE Vd.16B,Vn.16B	Vd.16B -> result	A32/A64
 uint8x16_t vaesdq_u8(uint8x16_t data, uint8x16_t key)	data -> Vd.16B;key -> Vn.16B	AESD Vd.16B,Vn.16B	Vd.16B -> result	A32/A64
@@ -4470,4 +4547,4 @@ float32x4_t vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)	r -> Vd
 float32x4_t vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b, __builtin_constant_p(lane))	r -> Vd.4S;a -> Vn.8H;b -> Vm.4H;0 <= lane <= 3	BFMLALB Vd.4S,Vn.8H,Vm.H[lane]	Vd.4S -> result	A32/A64
 float32x4_t vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b, __builtin_constant_p(lane))	r -> Vd.4S;a -> Vn.8H;b -> Vm.8H;0 <= lane <= 7	BFMLALB Vd.4S,Vn.8H,Vm.H[lane]	Vd.4S -> result	A32/A64
 float32x4_t vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b, __builtin_constant_p(lane))	r -> Vd.4S;a -> Vn.8H;b -> Vm.4H;0 <= lane <= 3	BFMLALT Vd.4S,Vn.8H,Vm.H[lane]	Vd.4S -> result	A32/A64
-float32x4_t vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b, __builtin_constant_p(lane))	r -> Vd.4S;a -> Vn.8H;b -> Vm.8H;0 <= lane <= 7	BFMLALT Vd.4S,Vn.8H,Vm.H[lane]	Vd.4S -> result	A32/A64
+float32x4_t vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b, __builtin_constant_p(lane))	r -> Vd.4S;a -> Vn.8H;b -> Vm.8H;0 <= lane <= 7	BFMLALT Vd.4S,Vn.8H,Vm.H[lane]	Vd.4S -> result	A32/A64