diff --git a/Android.bp b/Android.bp
index 7681205770..f7d4d257d6 100644
--- a/Android.bp
+++ b/Android.bp
@@ -392,6 +392,7 @@ cc_library_static {
"src/core/Utils.cpp",
"src/core/Validate.cpp",
"src/core/Version.cpp",
+ "src/core/helpers/LUTManager.cpp",
"src/core/helpers/SoftmaxHelpers.cpp",
"src/core/helpers/Utils.cpp",
"src/core/helpers/WindowHelpers.cpp",
@@ -488,6 +489,8 @@ cc_library_static {
"src/cpu/kernels/crop/generic/neon/fp16.cpp",
"src/cpu/kernels/crop/generic/neon/fp32.cpp",
"src/cpu/kernels/crop/generic/neon/integer.cpp",
+ "src/cpu/kernels/depth_to_space/nchw/any/impl.cpp",
+ "src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp",
@@ -515,6 +518,8 @@ cc_library_static {
"src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp",
"src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp",
"src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp",
+ "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp",
+ "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp",
"src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp",
"src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp",
"src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp",
@@ -543,6 +548,10 @@ cc_library_static {
"src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp",
"src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp",
"src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/mul/generic/neon/fp16.cpp",
+ "src/cpu/kernels/mul/generic/neon/fp32.cpp",
+ "src/cpu/kernels/norm_layer/generic/neon/fp16.cpp",
+ "src/cpu/kernels/norm_layer/generic/neon/fp32.cpp",
"src/cpu/kernels/pool2d/neon/fp16.cpp",
"src/cpu/kernels/pool2d/neon/fp32.cpp",
"src/cpu/kernels/pool2d/neon/nchw/all.cpp",
@@ -1033,6 +1042,7 @@ cc_library_static {
"src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp",
"src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp",
"src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp",
+ "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp",
"src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp",
"utils/CommonGraphOptions.cpp",
"utils/GraphUtils.cpp",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9dd3e2cef7..35b6ca2b7f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 Arm Limited.
+# Copyright (c) 2023-2024 Arm Limited.
#
# SPDX-License-Identifier: MIT
#
@@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute)
project(
ArmCompute
- VERSION 33.0.0
+ VERSION 34.0.0
DESCRIPTION
"The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures"
LANGUAGES C CXX ASM)
diff --git a/LICENSE b/LICENSE
index 0d2cb83aaa..781685ab31 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2017-2023 Arm Limited
+Copyright (c) 2017-2024 Arm Limited
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 9b06dbeabf..71a6518594 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,8 @@
-> **⚠ Important**
-> From release 22.05: 'master' branch has been replaced with 'main' following our inclusive language update, more information [here](https://arm-software.github.io/ComputeLibrary/latest/contribution_guidelines.xhtml#S5_0_inc_lang).
-
-> **⚠ Important**
-> From release 22.08: armv7a with Android build will no longer be tested or maintained.
+> **⚠ Deprecation Notice**
+> 24.01 announcement: NCHW data format specific optimizations will gradually be removed from the code base in
+> future releases. The implication of this is that the user is expected to translate NCHW models into NHWC in
+> order to benefit from the optimizations.
> **⚠ Important**
> From release 23.02: The 23.02 release introduces a change to the default tensor extend padding behavior.
@@ -16,7 +15,7 @@
-# Compute Library ![](https://img.shields.io/badge/latest_release-23.11-green)
+# Compute Library ![](https://img.shields.io/badge/latest_release-24.01-green)
The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.
@@ -44,7 +43,7 @@ Key Features:
## Documentation
-[![Documentation](https://img.shields.io/badge/documentation-23.11-green)](https://arm-software.github.io/ComputeLibrary/latest)
+[![Documentation](https://img.shields.io/badge/documentation-24.01-green)](https://arm-software.github.io/ComputeLibrary/latest)
> Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc.
@@ -57,24 +56,24 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C
| Platform | Operating System | Release archive (Download) |
| -------------- | ---------------- | -------------------------- |
-| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon.tar.gz) |
-| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) |
-| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) |
-| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-neon.tar.gz) |
+| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) |
+| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon-cl.tar.gz) |
| Architecture | Operating System | Release archive (Download) |
| ------------ | ---------------- | -------------------------- |
-| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon-cl.tar.gz) |
-| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-neon-cl.tar.gz) |
-| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) |
-| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-neon-cl.tar.gz) |
-| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
+| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-neon-cl.tar.gz) |
+| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8a-neon-cl.tar.gz) |
+| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon-cl.tar.gz) |
+| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8.2-a-neon-cl.tar.gz) |
+| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v23.11-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v23.11)
+Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.01-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.01)
Pre-build binaries are generated with the following security / good coding practices related flags:
> -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong
diff --git a/SConscript b/SConscript
index 099ff706ad..96b3bdc689 100644
--- a/SConscript
+++ b/SConscript
@@ -1,7 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
-# Copyright (c) 2016-2023 Arm Limited.
+# Copyright (c) 2016-2024 Arm Limited.
#
# SPDX-License-Identifier: MIT
#
@@ -31,15 +31,8 @@ import zlib
import json
import codecs
-from SCons.Warnings import warn, DeprecatedWarning
-
-warn(DeprecatedWarning,
- "DEPRECATION NOTICE: Legacy libarm_compute_core has been deprecated and is scheduled for removal in 24.02 release."
- " Link your application only to libarm_compute for core library functionality"
- )
-
-VERSION = "v23.11"
-LIBRARY_VERSION_MAJOR = 33
+VERSION = "v24.01"
+LIBRARY_VERSION_MAJOR = 34
LIBRARY_VERSION_MINOR = 0
LIBRARY_VERSION_PATCH = 0
SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH)
@@ -89,31 +82,42 @@ def build_obj_list(arch_info, sources, static=False):
# A list of static objects
# A list of shared objects
-def build_lib_objects():
+def build_multiisa_lib_objects():
lib_static_objs = [] # static objects
lib_shared_objs = [] # shared objects
+ # note that ARM_COMPUTE_ENABLE_FP16 is enabled in update_data_type_layout_flags() to make
+ # sure the environment is propagated to the validation suite
arm_compute_env.Append(CPPDEFINES = ['ENABLE_NEON', 'ARM_COMPUTE_ENABLE_NEON',
- 'ENABLE_SVE', 'ARM_COMPUTE_ENABLE_SVE',
- 'ARM_COMPUTE_ENABLE_FP16', 'ARM_COMPUTE_ENABLE_BF16',
+ 'ENABLE_SVE', 'ARM_COMPUTE_ENABLE_SVE','ARM_COMPUTE_ENABLE_BF16',
'ARM_COMPUTE_ENABLE_I8MM', 'ARM_COMPUTE_ENABLE_SVEF32MM'])
# Build all the common files for the base architecture
if env['arch'] == 'armv8a':
- lib_static_objs += build_obj_list(filedefs["armv8-a"], lib_files, static=True)
- lib_shared_objs += build_obj_list(filedefs["armv8-a"], lib_files, static=False)
+ lib_static_objs += build_obj_list(filedefs["armv8-a"], misa_lib_files, static=True)
+ lib_shared_objs += build_obj_list(filedefs["armv8-a"], misa_lib_files, static=False)
else:
- lib_static_objs += build_obj_list(filedefs["armv8.2-a"], lib_files, static=True)
- lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], lib_files, static=False)
+ lib_static_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files, static=True)
+ lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files, static=False)
+
+ # Build the FP16 specific files
+ lib_static_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files_neon_fp16, static=True)
+ lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files_neon_fp16, static=False)
# Build the SVE specific files
- lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], lib_files_sve, static=True)
- lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], lib_files_sve, static=False)
+ lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve, static=True)
+ lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve, static=False)
+ lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve_fp16, static=True)
+ lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve_fp16, static=False)
+
# Build the SVE2 specific files
arm_compute_env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_SVE2'])
- lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], lib_files_sve2, static=True)
- lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], lib_files_sve2, static=False)
+ lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2, static=True)
+ lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2, static=False)
+ lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2_fp16, static=True)
+ lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2_fp16, static=False)
+
return lib_static_objs, lib_shared_objs
@@ -291,29 +295,29 @@ def get_attrs_list(env, data_types, data_layouts):
return attrs
-def get_operator_backend_files(filelist, operators, backend='', techs=[], attrs=[]):
+def get_operator_backend_files(filelist, operators, backend='', techs=[], attrs=[], include_common=True):
files = { "common" : [] }
-
# Early return if filelist is empty
if backend not in filelist:
return files
-
# Iterate over operators and create the file lists to compiler
for operator in operators:
if operator in filelist[backend]['operators']:
- files['common'] += filelist[backend]['operators'][operator]["files"]["common"]
+ if include_common :
+ files['common'] += filelist[backend]['operators'][operator]["files"]["common"]
for tech in techs:
if tech in filelist[backend]['operators'][operator]["files"]:
# Add tech as a key to dictionary if not there
if tech not in files:
files[tech] = []
-
# Add tech files to the tech file list
tech_files = filelist[backend]['operators'][operator]["files"][tech]
- files[tech] += tech_files.get('common', [])
+ if include_common:
+ files[tech] += tech_files.get('common', [])
for attr in attrs:
files[tech] += tech_files.get(attr, [])
+
# Remove duplicates if they exist
return {k: list(set(v)) for k,v in files.items()}
@@ -615,6 +619,17 @@ if env['opencl']:
lib_files_sve = []
lib_files_sve2 = []
+# the variables below are used for the multi_isa builds
+# please note that the variable names without the _fp16 suffix
+# do not hold any fp16 files.
+
+misa_lib_files = lib_files
+misa_lib_files_sve = []
+misa_lib_files_sve2 = []
+misa_lib_files_neon_fp16 = []
+misa_lib_files_sve_fp16 = []
+misa_lib_files_sve2_fp16 = []
+
if env['neon']:
# build winograd/depthwise sources for either v7a / v8a
arm_compute_env.Append(CPPPATH = ["src/core/NEON/kernels/arm_gemm",
@@ -627,8 +642,6 @@ if env['neon']:
"arm_compute/core/NEON/kernels/assembly/",
"src/cpu/kernels/assembly/"])
- lib_files += filelist['cpu']['common']
-
# Setup SIMD file list to include
simd = ['neon']
if env['multi_isa']:
@@ -643,7 +656,6 @@ if env['neon']:
else:
attrs = get_attrs_list(env, env['data_type_support'], env['data_layout_support'])
-
if env['fixed_format_kernels']:
attrs.append("fixed_format_kernels")
@@ -651,19 +663,46 @@ if env['neon']:
cpu_operators = custom_operators if use_custom_ops else filelist['cpu']['operators'].keys()
cpu_ops_to_build = resolve_operator_dependencies(filelist, cpu_operators, 'cpu')
- cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs)
+ if env['multi_isa']:
+ misa_lib_files += filelist['cpu']['common']
- # Shared among ALL CPU files
- lib_files += cpu_files.get('common', [])
+ # For multi_isa builds we need to build fp16 files for armv8.2-a+fp16 so we filter them out of cpu_files removing the attribute fp16
+ attrs.remove('fp16')
+ cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs)
- # Arm® Neon™ specific files
- lib_files += cpu_files.get('neon', [])
+ # Shared among ALL CPU files
+ misa_lib_files += cpu_files.get('common', [])
- # SVE files only
- lib_files_sve = cpu_files.get('sve', [])
+ # Arm® Neon™ specific files
+ misa_lib_files += cpu_files.get('neon', [])
- # SVE2 files only
- lib_files_sve2 = cpu_files.get('sve2', [])
+ # Get all the fp16 files
+ fp16_cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, ['fp16'],False)
+
+ misa_lib_files_neon_fp16 = fp16_cpu_files.get('neon',[])
+ misa_lib_files_sve_fp16 = fp16_cpu_files.get('sve',[])
+ misa_lib_files_sve2_fp16 = fp16_cpu_files.get('sve2',[])
+
+ # SVE files only minus FP16
+ misa_lib_files_sve = cpu_files.get('sve', [])
+
+ # SVE2 files only minus FP16
+ misa_lib_files_sve2 = cpu_files.get('sve2', [])
+ else:
+ lib_files += filelist['cpu']['common']
+
+ # Non multi_isa build
+ cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs)
+
+ # Shared among ALL CPU files
+ lib_files += cpu_files.get('common', [])
+
+ # Arm® Neon™ specific files
+ lib_files += cpu_files.get('neon', [])
+
+ lib_files_sve = cpu_files.get('sve', [])
+
+ lib_files_sve2 = cpu_files.get('sve2', [])
graph_files += Glob('src/graph/backends/NEON/*.cpp')
@@ -681,7 +720,7 @@ Export('bootcode_o')
if (env['multi_isa']):
- lib_static_objs, lib_shared_objs = build_lib_objects()
+ lib_static_objs, lib_shared_objs = build_multiisa_lib_objects()
# STATIC library build.
@@ -708,18 +747,6 @@ if env['os'] != 'bare_metal' and not env['standalone']:
Export('arm_compute_so')
-# Generate dummy core lib for backwards compatibility
-if env['os'] == 'macos':
- # macos static library archiver fails if given an empty list of files
- arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, lib_files, static=True)
-else:
- arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, [], static=True)
-
-Export('arm_compute_core_a')
-
-if env['os'] != 'bare_metal' and not env['standalone']:
- arm_compute_core_a_so = build_library('arm_compute_core', arm_compute_env, [], static=False)
- Export('arm_compute_core_a_so')
arm_compute_graph_env = arm_compute_env.Clone()
diff --git a/SConstruct b/SConstruct
index 68c518a4a0..cf8fb52bd6 100644
--- a/SConstruct
+++ b/SConstruct
@@ -62,8 +62,14 @@ def read_build_config_json(build_config):
def update_data_type_layout_flags(env, data_types, data_layouts):
# Manage data-types
- if any(i in data_types for i in ['all', 'fp16']):
- env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS'])
+ if env['multi_isa']:
+ if any(i in data_types for i in ['all', 'fp16']):
+ env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS', '-DARM_COMPUTE_ENABLE_FP16'])
+ else:
+ if not 'v8a' in env['arch'] and not 'v7a' in env['arch'] and not 'armv8r64' in env['arch']:
+ if any(i in data_types for i in ['all', 'fp16']):
+ env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS','-DARM_COMPUTE_ENABLE_FP16'])
+
if any(i in data_types for i in ['all', 'fp32']):
env.Append(CXXFLAGS = ['-DENABLE_FP32_KERNELS'])
if any(i in data_types for i in ['all', 'qasymm8']):
@@ -112,7 +118,7 @@ vars.AddVariables(
BoolVariable("exceptions", "Enable/disable C++ exception support", True),
BoolVariable("high_priority", "Generate a library containing only the high priority operators", False),
PathVariable("linker_script", "Use an external linker script", "", PathVariable.PathAccept),
- PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure:
+ PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure:
EXTERNAL_TESTS_DIR:
└── tests
├── benchmark
@@ -240,7 +246,6 @@ env.Append(CXXFLAGS = ['-DARCH_ARM',
if not 'windows' in env['os']:
env.Append(CXXFLAGS = ['-Wall','-std=c++14', '-pedantic' ])
-env.Append(CPPDEFINES = ['_GLIBCXX_USE_NANOSLEEP'])
cpp_tool = {'linux': 'g++', 'android' : 'clang++',
'tizen': 'g++', 'macos':'clang++',
@@ -312,8 +317,7 @@ if env['multi_isa']:
Exit(1)
if 'v8a' in env['arch']:
- print("INFO: multi_isa armv8-a architecture build doesn't enable __ARM_FEATURE_FP16_VECTOR_ARITHMETIC. Use armv8.2-a or beyond to enable FP16 vector arithmetic support")
- env.Append(CXXFLAGS = ['-march=armv8-a']) # note: this will disable fp16 extension __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ env.Append(CXXFLAGS = ['-march=armv8-a'])
else:
if 'v8.6-a' in env['arch']:
if "disable_mmla_fp" not in env['custom_options']:
@@ -536,7 +540,7 @@ if env['standalone']:
if not 'windows' in env['os']:
env.Append(CXXFLAGS = ['-fPIC'])
env.Append(LINKFLAGS = ['-static-libgcc','-static-libstdc++'])
-
+
if env['Werror']:
env.Append(CXXFLAGS = ['-Werror'])
@@ -597,7 +601,7 @@ if env['debug']:
else:
env.Append(CXXFLAGS = ['-Z7','-MTd','-fms-compatibility','-fdelayed-template-parsing'])
env.Append(LINKFLAGS = ['-DEBUG'])
-
+
env.Append(CPPDEFINES = ['ARM_COMPUTE_DEBUG_ENABLED'])
else:
if not 'windows' in env['os']:
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index a5c4e39df2..8b5bf97099 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_OPENCL_H
-#define ARM_COMPUTE_OPENCL_H
+#ifndef ACL_ARM_COMPUTE_CORE_CL_OPENCL_H
+#define ACL_ARM_COMPUTE_CORE_CL_OPENCL_H
#include
#include
@@ -139,6 +139,7 @@ class CLSymbols final
DECLARE_FUNCTION_PTR(clWaitForEvents);
DECLARE_FUNCTION_PTR(clCreateImage);
DECLARE_FUNCTION_PTR(clSetKernelExecInfo);
+ DECLARE_FUNCTION_PTR(clGetExtensionFunctionAddressForPlatform);
// Command buffer and mutable dispatch command buffer extensions
DECLARE_FUNCTION_PTR(clCreateCommandBufferKHR);
@@ -159,4 +160,4 @@ class CLSymbols final
std::pair _loaded;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_OPENCL_H */
+#endif // ACL_ARM_COMPUTE_CORE_CL_OPENCL_H
diff --git a/arm_compute/core/GPUTarget.h b/arm_compute/core/GPUTarget.h
index affa79a89e..b107a52d9f 100644
--- a/arm_compute/core/GPUTarget.h
+++ b/arm_compute/core/GPUTarget.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GPUTARGET_H
-#define ARM_COMPUTE_GPUTARGET_H
+#ifndef ACL_ARM_COMPUTE_CORE_GPUTARGET_H
+#define ACL_ARM_COMPUTE_CORE_GPUTARGET_H
#include "support/Traits.h"
@@ -39,6 +39,7 @@ enum class GPUTarget
MIDGARD = 0x100,
BIFROST = 0x200,
VALHALL = 0x300,
+ FIFTHGEN = 0X400,
T600 = 0x110,
T700 = 0x120,
T800 = 0x130,
@@ -62,6 +63,8 @@ enum class GPUTarget
G310 = 0x343,
G715 = 0x350,
G615 = 0x351,
+ G720 = 0x410,
+ G620 = 0X411
};
/** Enable bitwise operations on GPUTarget enumerations */
@@ -114,4 +117,4 @@ inline bool gpu_target_is_in(GPUTarget target_to_check, GPUTarget target)
return target_to_check == target;
}
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GPUTARGET_H */
+#endif // ACL_ARM_COMPUTE_CORE_GPUTARGET_H
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 86dcfdc3d0..e97d81390e 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,7 +60,14 @@ inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordin
{
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
+
+// Suppress warning produced by a compiler bug in GCC
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+#pragma GCC diagnostic pop
+
for (int i = 0; i < reduction_ops; ++i)
{
out_shape.remove_dimension(axis_local[i] - i, false);
diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h
index 195b67cf99..9390d0c54f 100644
--- a/arm_compute/function_info/ActivationLayerInfo.h
+++ b/arm_compute/function_info/ActivationLayerInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2023 Arm Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,19 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO
-#define ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO
+#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H
+#define ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H
#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/QuantizationInfo.h"
#include
+#include
+
+#ifdef __aarch64__
+#include
+#endif // __aarch64__
namespace arm_compute
{
@@ -58,7 +64,10 @@ class ActivationLayerInfo
typedef arm_compute::ActivationFunction ActivationFunction;
/** Lookup table */
- using LookupTable256 = std::array;
+#ifdef __aarch64__
+ using LookupTable256 = std::array;
+ using LookupTable65536 = std::array;
+#endif // __aarch64__
ActivationLayerInfo() = default;
/** Default Constructor
@@ -101,6 +110,16 @@ class ActivationLayerInfo
{
_lut = std::move(lut);
}
+
+ const LookupTable65536 &lut_fp16() const
+ {
+ ARM_COMPUTE_ERROR_ON(_lut_fp16 == nullptr);
+ return *_lut_fp16;
+ }
+ void setLookupTable65536(std::shared_ptr lut)
+ {
+ _lut_fp16 = lut;
+ }
#endif // __aarch64__
private:
ActivationFunction _act = {ActivationLayerInfo::ActivationFunction::IDENTITY};
@@ -109,8 +128,9 @@ class ActivationLayerInfo
bool _enabled = {false};
#ifdef __aarch64__
- LookupTable256 _lut = {};
+ LookupTable256 _lut = {};
+ std::shared_ptr _lut_fp16{nullptr};
#endif // __aarch64__
};
} // namespace arm_compute
-#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO */
+#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H
diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
index c7df29a704..d27369670e 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,25 +21,27 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include <memory>
namespace arm_compute
{
// Forward declarations
class ITensor;
class ITensorInfo;
+class NEDepthToSpaceLayerKernel;
/** Basic function to run @ref NEDepthToSpaceLayerKernel. */
-class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder
+class NEDepthToSpaceLayer : public IFunction
{
public:
/** Constructor */
- NEDepthToSpaceLayer() = default;
+ NEDepthToSpaceLayer();
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEDepthToSpaceLayer(const NEDepthToSpaceLayer &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -49,7 +51,7 @@ class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder
/** Prevent instances of this class from being moved (As this class contains non movable objects) */
NEDepthToSpaceLayer &operator=(NEDepthToSpaceLayer &&) = delete;
/** Default destructor */
- ~NEDepthToSpaceLayer() = default;
+ ~NEDepthToSpaceLayer();
/** Set the input and output tensors.
*
* Valid data layouts:
@@ -75,6 +77,11 @@ class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+ void run() override;
+
+private:
+ std::unique_ptr<NEDepthToSpaceLayerKernel> _kernel;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index bd29cbb31f..7c83f86caa 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_SCHEDULER_H
-#define ARM_COMPUTE_SCHEDULER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H
+#define ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H
#include "arm_compute/runtime/IScheduler.h"
@@ -81,4 +81,4 @@ class Scheduler
Scheduler();
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_SCHEDULER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H
diff --git a/cmake/Options.cmake b/cmake/Options.cmake
index bc51cbbc0d..e5c8cb8efe 100644
--- a/cmake/Options.cmake
+++ b/cmake/Options.cmake
@@ -116,4 +116,4 @@ endif()
if(ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS)
add_definitions(-DARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS)
endif()
-add_definitions(-D_GLIBCXX_USE_NANOSLEEP)
\ No newline at end of file
+add_definitions(-D_GLIBCXX_USE_NANOSLEEP)
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 0b2f32ad1a..0d8654944d 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "Compute Library"
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 23.11
+PROJECT_NUMBER = 24.01
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
diff --git a/docs/user_guide/how_to_build_and_run_examples.dox b/docs/user_guide/how_to_build_and_run_examples.dox
index 4da26d31bc..775cb6abbe 100644
--- a/docs/user_guide/how_to_build_and_run_examples.dox
+++ b/docs/user_guide/how_to_build_and_run_examples.dox
@@ -76,21 +76,21 @@ The examples get automatically built by scons as part of the build process of th
To cross compile a Arm® Neon™ example for Linux 32bit:
- arm-linux-gnueabihf-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o neon_cnn
+ arm-linux-gnueabihf-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -o neon_cnn
To cross compile a Arm® Neon™ example for Linux 64bit:
- aarch64-linux-gnu-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o neon_cnn
+ aarch64-linux-gnu-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -o neon_cnn
(notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different)
To cross compile an OpenCL example for Linux 32bit:
- arm-linux-gnueabihf-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL
+ arm-linux-gnueabihf-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -o cl_sgemm -DARM_COMPUTE_CL
To cross compile an OpenCL example for Linux 64bit:
- aarch64-linux-gnu-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL
+ aarch64-linux-gnu-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -o cl_sgemm -DARM_COMPUTE_CL
(notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different)
@@ -98,43 +98,43 @@ To cross compile the examples with the Graph API, such as graph_lenet.cpp, you n
i.e. to cross compile the "graph_lenet" example for Linux 32bit:
- arm-linux-gnueabihf-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
+ arm-linux-gnueabihf-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet
i.e. to cross compile the "graph_lenet" example for Linux 64bit:
- aarch64-linux-gnu-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
+ aarch64-linux-gnu-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet
(notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different)
-@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute, arm_compute_core
+@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute
To compile natively (i.e directly on an Arm device) for Arm® Neon™ for Linux 32bit:
- g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -larm_compute_core -o neon_cnn
+ g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -o neon_cnn
To compile natively (i.e directly on an Arm device) for Arm® Neon™ for Linux 64bit:
- g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o neon_cnn
+ g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -o neon_cnn
(notice the only difference with the 32 bit command is that we don't need the -mfpu option)
To compile natively (i.e directly on an Arm device) for OpenCL for Linux 32bit or Linux 64bit:
- g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL
+ g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -o cl_sgemm -DARM_COMPUTE_CL
To compile natively the examples with the Graph API, such as graph_lenet.cpp, you need to link the examples against arm_compute_graph.so too.
i.e. to natively compile the "graph_lenet" example for Linux 32bit:
- g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
+ g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet
i.e. to natively compile the "graph_lenet" example for Linux 64bit:
- g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
+ g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet
(notice the only difference with the 32 bit command is that we don't need the -mfpu option)
-@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute, arm_compute_core
+@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute
@note These two commands assume libarm_compute.so is available in your library path, if not add the path to it using -L (e.g. -Llib/linux-armv8a-neon-cl-asserts/)
@note You might need to export the path to OpenCL library as well in your LD_LIBRARY_PATH if Compute Library was built with OpenCL enabled.
@@ -265,23 +265,23 @@ Once you've got your Android standalone toolchain built and added to your path y
To cross compile a Arm® Neon™ example:
#32 bit:
- arm-linux-androideabi-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_cnn_arm -static-libstdc++ -pie
+ arm-linux-androideabi-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o neon_cnn_arm -static-libstdc++ -pie
#64 bit:
- aarch64-linux-android-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_cnn_aarch64 -static-libstdc++ -pie
+ aarch64-linux-android-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o neon_cnn_aarch64 -static-libstdc++ -pie
To cross compile an OpenCL example:
#32 bit:
- arm-linux-androideabi-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_sgemm_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
+ arm-linux-androideabi-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o cl_sgemm_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
#64 bit:
- aarch64-linux-android-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_sgemm_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
+ aarch64-linux-android-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o cl_sgemm_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
To cross compile the examples with the Graph API, such as graph_lenet.cpp, you need to link the library arm_compute_graph also.
#32 bit:
- arm-linux-androideabi-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
+ arm-linux-androideabi-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -L. -o graph_lenet_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
#64 bit:
- aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
+ aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
@note Due to some issues in older versions of the Arm® Mali™ OpenCL DDK (<= r13p0), we recommend to link arm_compute statically on Android.
@note When linked statically the arm_compute_graph library currently needs the --whole-archive linker flag in order to work properly
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 11731e5a33..40ad09fd84 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -41,6 +41,18 @@ If there is more than one release in a month then an extra sequential number is
@section S2_2_changelog Changelog
+v24.01 Public major release
+ - Remove the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
+ You should link only to the main `libarm_compute` library for core functionality.
+ - Expand GPUTarget list with Mali™ G720 and G620.
+ - Optimize CPU activation functions using LUT-based implementation:
+ - Sigmoid function for FP16.
+ - New features
+ - Add support for FP16 in all multi_isa builds.
+ - Performance optimizations:
+ - Optimize @ref NESoftmaxLayer
+ - Optimize @ref NEDepthToSpaceLayer.
+
v23.11 Public major release
- New features
- Add support for input data type U64/S64 in CLCast and NECast.
@@ -432,8 +444,8 @@ v21.02 Public major release
- @ref NEActivationLayer
- @ref NEArithmeticAddition
- @ref NEBatchNormalizationLayerKernel
- - @ref cpu::kernels::CpuLogits1DSoftmaxKernel
- - @ref cpu::kernels::CpuLogits1DMaxKernel
+ - cpu::kernels::CpuLogits1DSoftmaxKernel
+ - cpu::kernels::CpuLogits1DMaxKernel
- @ref cpu::kernels::CpuElementwiseUnaryKernel
- Remove padding from OpenCL kernels:
- CLDirectConvolutionLayerKernel
diff --git a/examples/SConscript b/examples/SConscript
index bfac9deb2b..16f31d93d4 100644
--- a/examples/SConscript
+++ b/examples/SConscript
@@ -38,15 +38,14 @@ utils = examples_env.Object("../utils/Utils.cpp")
if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']:
Import('arm_compute_graph_a')
Import('arm_compute_a')
- Import('arm_compute_core_a')
- arm_compute_libs = [ arm_compute_a, arm_compute_core_a ]
+ arm_compute_libs = [ arm_compute_a ]
arm_compute_graph_libs = arm_compute_libs # The graph library needs to be linked separately with --whole-archive
arm_compute_dependency = arm_compute_a
graph_dependency = [arm_compute_graph_a]
else:
Import('arm_compute_graph_so')
Import('arm_compute_so')
- arm_compute_libs = ["arm_compute", "arm_compute_core"]
+ arm_compute_libs = ["arm_compute"]
arm_compute_graph_libs = [ "arm_compute_graph" ] + arm_compute_libs
arm_compute_dependency = arm_compute_so
graph_dependency = [arm_compute_graph_so]
diff --git a/examples/graph_ssd_mobilenet.cpp b/examples/graph_ssd_mobilenet.cpp
index 5162fe6890..6218d47dd6 100644
--- a/examples/graph_ssd_mobilenet.cpp
+++ b/examples/graph_ssd_mobilenet.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,6 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/graph.h"
#include "support/ToolchainSupport.h"
@@ -757,7 +758,8 @@ class GraphSSDMobilenetExample : public Example
std::move(conv_16_2_class_pre), std::move(conv_17_2_class_pre))
.set_name("ClassPrediction/concat");
- const QuantizationInfo logistic_out_qinfo = QuantizationInfo(0.00390625f, 0);
+ const QuantizationInfo logistic_out_qinfo = QuantizationInfo(
+ 0.00390625f, quantization::get_min_max_values_from_quantized_data_type(common_params.data_type).first);
class_pred << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
logistic_out_qinfo)
.set_name("ClassPrediction/logistic");
diff --git a/filelist.json b/filelist.json
index 5bca2419d6..7c530f3f33 100644
--- a/filelist.json
+++ b/filelist.json
@@ -14,6 +14,7 @@
"src/core/Error.cpp",
"src/core/GPUTarget.cpp",
"src/core/Helpers.cpp",
+ "src/core/helpers/LUTManager.cpp",
"src/core/IAccessWindow.cpp",
"src/core/IKernel.cpp",
"src/core/ITensor.cpp",
@@ -532,7 +533,8 @@
"src/gpu/cl/operators/ClMatMul.cpp",
"src/runtime/CL/functions/CLMatMul.cpp",
"src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp",
- "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp"
+ "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp",
+ "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp"
]
}
},
@@ -982,12 +984,15 @@
"fp16": [
"src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp",
"src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp",
- "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp"
+ "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp",
+ "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp"
+
],
"fp32": [
"src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp",
"src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp",
- "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp"
+ "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp",
+ "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp"
]
},
"sve": {
@@ -1122,27 +1127,20 @@
"src/core/NEON/kernels/convolution/common/qasymm8.cpp",
"src/core/NEON/kernels/convolution/common/qsymm8.cpp",
"src/core/NEON/kernels/convolution/common/utils.cpp",
- "src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp",
"src/core/NEON/kernels/convolution/winograd/input_transforms_fp32.cpp",
- "src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp",
"src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp",
- "src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp",
"src/core/NEON/kernels/convolution/winograd/weight_transforms_fp32.cpp",
- "src/core/NEON/kernels/convolution/winograd/winograd_fp16.cpp",
"src/core/NEON/kernels/convolution/winograd/winograd_fp32.cpp",
- "src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp",
"src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp32_6x6.cpp",
"src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp",
"src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_4x4.cpp",
"src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_6x6.cpp",
- "src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp",
"src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp",
"src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp",
"src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp",
"src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_3x3.cpp",
"src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_5x5.cpp",
"src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_4x4_3x3.cpp",
- "src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp",
"src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_3x3.cpp",
"src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_5x5.cpp",
"src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_4x4_3x3.cpp",
@@ -1159,6 +1157,13 @@
],
"fp16": [
"src/cpu/kernels/directconv2d/nchw/fp16.cpp",
+ "src/core/NEON/kernels/convolution/winograd/winograd_fp16.cpp",
+ "src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp",
+ "src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp",
+ "src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp",
+ "src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp",
+ "src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp",
+ "src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp",
"src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp"
]
},
@@ -1214,7 +1219,9 @@
"files": {
"common": [
"src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp",
- "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp"
+ "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp",
+ "src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp",
+ "src/cpu/kernels/depth_to_space/nchw/any/impl.cpp"
]
}
},
@@ -1241,7 +1248,6 @@
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
@@ -1252,18 +1258,6 @@
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
@@ -1300,7 +1294,22 @@
"src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp"
],
- "fp16":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp"],
+ "fp16":[
+ "src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp"
+ ],
"fp32":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp"],
"qasymm8":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp"],
"qasymm8_signed":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp"]
@@ -1820,6 +1829,11 @@
"qasymm8": ["src/cpu/kernels/lut/generic/neon/u8.cpp"],
"qasymm8_signed": ["src/cpu/kernels/lut/generic/neon/u8.cpp"]
},
+ "sve": {
+ "fp16": ["src/cpu/kernels/lut/generic/sve/u16.cpp"],
+ "qasymm16": ["src/cpu/kernels/lut/generic/sve/u16.cpp"],
+ "qasymm16_signed": ["src/cpu/kernels/lut/generic/sve/u16.cpp"]
+ },
"sve2": {
"qasymm8": ["src/cpu/kernels/lut/generic/sve2/u8.cpp"],
"qasymm8_signed": ["src/cpu/kernels/lut/generic/sve2/u8.cpp"]
@@ -1902,7 +1916,11 @@
"src/cpu/operators/CpuMul.cpp",
"src/cpu/kernels/CpuMulKernel.cpp",
"src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp"
- ]
+ ],
+ "neon":{
+ "fp16":["src/cpu/kernels/mul/generic/neon/fp16.cpp"],
+ "fp32":["src/cpu/kernels/mul/generic/neon/fp32.cpp"]
+ }
}
},
"Normalize": {
@@ -1911,7 +1929,11 @@
"common": [
"src/core/NEON/kernels/NENormalizationLayerKernel.cpp",
"src/runtime/NEON/functions/NENormalizationLayer.cpp"
- ]
+ ],
+ "neon":{
+ "fp16":["src/cpu/kernels/norm_layer/generic/neon/fp16.cpp"],
+ "fp32":["src/cpu/kernels/norm_layer/generic/neon/fp32.cpp"]
+ }
}
},
"Pad": {
@@ -1943,16 +1965,11 @@
"neon": {
"common": [
"src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp",
"src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp",
"src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp",
"src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp",
"src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp",
"src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
@@ -1969,7 +1986,14 @@
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp"
],
"nchw": [ "src/cpu/kernels/pool2d/neon/nchw/all.cpp" ],
- "fp16": [ "src/cpu/kernels/pool2d/neon/fp16.cpp" ],
+ "fp16": [
+ "src/cpu/kernels/pool2d/neon/fp16.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp"
+ ],
"fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ],
"qasymm8":[ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ],
"qasymm8_signed":["src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp"]
@@ -2198,16 +2222,10 @@
"qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"]
},
"sve": {
- "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ],
- "fp32": ["src/cpu/kernels/softmax/generic/sve/fp32.cpp"],
- "fp16": ["src/cpu/kernels/softmax/generic/sve/fp16.cpp"],
- "qasymm8": ["src/cpu/kernels/softmax/generic/sve/qasymm8.cpp" ],
- "qasymm8_signed": ["src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"]
+ "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ]
},
"sve2":{
- "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"],
- "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"],
- "qasymm8_signed":["src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"]
+ "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"]
}
}
},
diff --git a/scripts/arm_compute_library_nn_driver.go b/scripts/arm_compute_library_nn_driver.go
index dda77b55df..2aab2d3fe7 100644
--- a/scripts/arm_compute_library_nn_driver.go
+++ b/scripts/arm_compute_library_nn_driver.go
@@ -46,6 +46,7 @@ func globalFlags(ctx android.BaseContext) []string {
if theArch == "armv8-2a" {
cppflags = append(cppflags, "-march=armv8.2-a+fp16")
cppflags = append(cppflags, "-DARM_COMPUTE_ENABLE_FP16")
+ cppflags = append(cppflags, "-DENABLE_FP16_KERNELS")
}
}
}
@@ -74,9 +75,6 @@ func globalFlags(ctx android.BaseContext) []string {
if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "QSYMM16" {
cppflags = append(cppflags, "-DENABLE_QSYMM16_KERNELS")
}
- if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "FP16" {
- cppflags = append(cppflags, "-DENABLE_FP16_KERNELS")
- }
if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "FP32" {
cppflags = append(cppflags, "-DENABLE_FP32_KERNELS")
}
diff --git a/scripts/format_code.py b/scripts/format_code.py
index f1ee7a731c..29dbea7f0d 100755
--- a/scripts/format_code.py
+++ b/scripts/format_code.py
@@ -262,6 +262,9 @@ def run(self):
self.shell.prepend_env("PATH","%s/../bin" % this_dir)
for f in self.files:
+ if not self.skip_copyright:
+ check_copyright(f)
+
skip_this_file = False
for e in exceptions:
if e in f:
@@ -272,8 +275,6 @@ def run(self):
continue
logger.info("Formatting %s" % f)
- if not self.skip_copyright:
- check_copyright(f)
check_license("LICENSE")
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index a22632e1f5..9d5ae63484 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -117,9 +117,7 @@ filegroup(
"cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp",
"cpu/kernels/elementwise_unary/generic/sve2/q8.cpp",
"cpu/kernels/lut/generic/sve2/u8.cpp",
- "cpu/kernels/softmax/generic/sve2/impl.cpp",
- "cpu/kernels/softmax/generic/sve2/qasymm8.cpp",
- "cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] +
+ "cpu/kernels/softmax/generic/sve2/impl.cpp"] +
glob(["**/*.h",
"**/*.hpp",
"**/*.inl"]),
@@ -337,16 +335,13 @@ filegroup(
"cpu/kernels/elementwise_unary/generic/sve/fp32.cpp",
"cpu/kernels/elementwise_unary/generic/sve/impl.cpp",
"cpu/kernels/elementwise_unary/generic/sve/integer.cpp",
+ "cpu/kernels/lut/generic/sve/u16.cpp",
"cpu/kernels/scale/sve/fp16.cpp",
"cpu/kernels/scale/sve/fp32.cpp",
"cpu/kernels/scale/sve/integer.cpp",
"cpu/kernels/scale/sve/qasymm8.cpp",
"cpu/kernels/scale/sve/qasymm8_signed.cpp",
- "cpu/kernels/softmax/generic/sve/fp16.cpp",
- "cpu/kernels/softmax/generic/sve/fp32.cpp",
- "cpu/kernels/softmax/generic/sve/impl.cpp",
- "cpu/kernels/softmax/generic/sve/qasymm8.cpp",
- "cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] +
+ "cpu/kernels/softmax/generic/sve/impl.cpp"] +
glob(["**/*.h",
"**/*.hpp",
"**/*.inl"]),
@@ -643,6 +638,7 @@ filegroup(
"core/Utils.cpp",
"core/Validate.cpp",
"core/Version.cpp",
+ "core/helpers/LUTManager.cpp",
"core/helpers/SoftmaxHelpers.cpp",
"core/helpers/Utils.cpp",
"core/helpers/WindowHelpers.cpp",
@@ -739,6 +735,8 @@ filegroup(
"cpu/kernels/crop/generic/neon/fp16.cpp",
"cpu/kernels/crop/generic/neon/fp32.cpp",
"cpu/kernels/crop/generic/neon/integer.cpp",
+ "cpu/kernels/depth_to_space/nchw/any/impl.cpp",
+ "cpu/kernels/depth_to_space/nhwc/any/impl.cpp",
"cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp",
"cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp",
"cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp",
@@ -766,6 +764,8 @@ filegroup(
"cpu/kernels/fuse_batch_normalization/generic/fp16.cpp",
"cpu/kernels/fuse_batch_normalization/generic/fp32.cpp",
"cpu/kernels/fuse_batch_normalization/nchw/all.cpp",
+ "cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp",
+ "cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp",
"cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp",
"cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp",
"cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp",
@@ -794,6 +794,10 @@ filegroup(
"cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp",
"cpu/kernels/meanstddevnorm/generic/neon/impl.cpp",
"cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp",
+ "cpu/kernels/mul/generic/neon/fp16.cpp",
+ "cpu/kernels/mul/generic/neon/fp32.cpp",
+ "cpu/kernels/norm_layer/generic/neon/fp16.cpp",
+ "cpu/kernels/norm_layer/generic/neon/fp32.cpp",
"cpu/kernels/pool2d/neon/fp16.cpp",
"cpu/kernels/pool2d/neon/fp32.cpp",
"cpu/kernels/pool2d/neon/nchw/all.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 37599cdadd..be7a6ef188 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -312,16 +312,13 @@ target_sources(
cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
cpu/kernels/elementwise_unary/generic/sve/impl.cpp
cpu/kernels/elementwise_unary/generic/sve/integer.cpp
+ cpu/kernels/lut/generic/sve/u16.cpp
cpu/kernels/scale/sve/fp16.cpp
cpu/kernels/scale/sve/fp32.cpp
cpu/kernels/scale/sve/integer.cpp
cpu/kernels/scale/sve/qasymm8.cpp
cpu/kernels/scale/sve/qasymm8_signed.cpp
- cpu/kernels/softmax/generic/sve/fp16.cpp
- cpu/kernels/softmax/generic/sve/fp32.cpp
cpu/kernels/softmax/generic/sve/impl.cpp
- cpu/kernels/softmax/generic/sve/qasymm8.cpp
- cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
)
target_sources(
@@ -339,8 +336,6 @@ target_sources(
cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
cpu/kernels/lut/generic/sve2/u8.cpp
cpu/kernels/softmax/generic/sve2/impl.cpp
- cpu/kernels/softmax/generic/sve2/qasymm8.cpp
- cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
)
target_sources(
@@ -634,6 +629,7 @@ target_sources(
core/Utils.cpp
core/Validate.cpp
core/Version.cpp
+ core/helpers/LUTManager.cpp
core/helpers/SoftmaxHelpers.cpp
core/helpers/Utils.cpp
core/helpers/WindowHelpers.cpp
@@ -730,6 +726,8 @@ target_sources(
cpu/kernels/crop/generic/neon/fp16.cpp
cpu/kernels/crop/generic/neon/fp32.cpp
cpu/kernels/crop/generic/neon/integer.cpp
+ cpu/kernels/depth_to_space/nchw/any/impl.cpp
+ cpu/kernels/depth_to_space/nhwc/any/impl.cpp
cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
@@ -757,6 +755,8 @@ target_sources(
cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
cpu/kernels/fuse_batch_normalization/nchw/all.cpp
+ cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp
+ cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp
cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
@@ -785,6 +785,10 @@ target_sources(
cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp
cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
+ cpu/kernels/mul/generic/neon/fp16.cpp
+ cpu/kernels/mul/generic/neon/fp32.cpp
+ cpu/kernels/norm_layer/generic/neon/fp16.cpp
+ cpu/kernels/norm_layer/generic/neon/fp32.cpp
cpu/kernels/pool2d/neon/fp16.cpp
cpu/kernels/pool2d/neon/fp32.cpp
cpu/kernels/pool2d/neon/nchw/all.cpp
diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp
index 05b351fc25..0e078d8416 100644
--- a/src/core/CL/CLMutableCommandBuffer.cpp
+++ b/src/core/CL/CLMutableCommandBuffer.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/CLUtils.h"
namespace arm_compute
@@ -48,7 +49,11 @@ CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLComma
CLMutableCommandBuffer::~CLMutableCommandBuffer()
{
const auto status = clReleaseCommandBufferKHR(_cb);
- handle_cl_error("clReleaseCommandBufferKHR", status);
+ if (status != CL_SUCCESS)
+ {
+ const std::string error_message = "clReleaseCommandBufferKHR - Error code: " + std::to_string(status);
+ ARM_COMPUTE_LOG_ERROR_ACL(error_message);
+ }
}
void CLMutableCommandBuffer::add_kernel(cl_kernel kernel,
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 35421d025e..07baa5e7fb 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -132,6 +132,10 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool use_loader)
func_name##_ptr = reinterpret_cast<func_name##_func>(dlsym(handle, #func_name));
#endif /* __ANDROID__ */
+#define LOAD_EXTENSION_FUNCTION_PTR(func_name, platform_id) \
+ func_name##_ptr = \
+ reinterpret_cast<func_name##_func>(clGetExtensionFunctionAddressForPlatform(platform_id, #func_name));
+
LOAD_FUNCTION_PTR(clCreateContext, handle);
LOAD_FUNCTION_PTR(clCreateContextFromType, handle);
LOAD_FUNCTION_PTR(clCreateCommandQueue, handle);
@@ -181,8 +185,27 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool use_loader)
LOAD_FUNCTION_PTR(clWaitForEvents, handle);
LOAD_FUNCTION_PTR(clCreateImage, handle);
LOAD_FUNCTION_PTR(clSetKernelExecInfo, handle);
+ LOAD_FUNCTION_PTR(clGetExtensionFunctionAddressForPlatform, handle);
+
+ // Load Extensions
+
+ // Number of platforms is assumed to be 1. For this to be greater than 1,
+ // the system must have more than one OpenCL implementation provided by
+ // different vendors. This is not our use case. Besides, the library
+ // already assumes one implementation as it uses one handle to load core
+ // functions.
+ constexpr unsigned int num_platforms = 1U;
+ std::vector<cl_platform_id> platform_ids(num_platforms);
+ clGetPlatformIDs(num_platforms, platform_ids.data(), nullptr);
// Command buffer and mutable dispatch command buffer extensions
+ /// TODO: (COMPMID-6742) Load Command Buffer extensions in a Portable way
+ /// using clGetExtensionFunctionAddressForPlatform().
+ /// The details can be found here:
+ /// https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#getting-opencl-api-extension-function-pointers
+ ///
+ /// @note: There are some problems reported while loading these extensions in the recommended way.
+ /// For details, please see COMPUTE-16545
LOAD_FUNCTION_PTR(clCreateCommandBufferKHR, handle);
LOAD_FUNCTION_PTR(clRetainCommandBufferKHR, handle);
LOAD_FUNCTION_PTR(clReleaseCommandBufferKHR, handle);
@@ -193,9 +216,10 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool use_loader)
LOAD_FUNCTION_PTR(clUpdateMutableCommandsKHR, handle);
// Third-party extensions
- LOAD_FUNCTION_PTR(clImportMemoryARM, handle);
+ LOAD_EXTENSION_FUNCTION_PTR(clImportMemoryARM, platform_ids[0]);
#undef LOAD_FUNCTION_PTR
+#undef LOAD_EXTENSION_FUNCTION_PTR
//Don't call dlclose(handle) or all the symbols will be unloaded !
@@ -1063,6 +1087,19 @@ clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void *param_value)
}
}
+void *clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, const char *funcname)
+{
+ arm_compute::CLSymbols::get().load_default();
+ const auto func = arm_compute::CLSymbols::get().clGetExtensionFunctionAddressForPlatform_ptr;
+
+ if (func != nullptr)
+ {
+ return func(platform, funcname);
+ }
+
+ return nullptr;
+}
+
cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint num_queues,
const cl_command_queue *queues,
const cl_command_buffer_properties_khr *properties,
diff --git a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
index 8919023d4c..09b8956b68 100644
--- a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
+++ b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -117,9 +117,23 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul(
uint rhs_y = block_id;
// Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef BATCHED_RHS
rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+#else // BATCHED_RHS
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y;
+#endif // BATCHED_RHS
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
// Note: If RHS derives from the weights of convolution 2d layer, RHS will always be 2D and rhs_stride_z will always be equal to 0 for
// not sliding the tensor
@@ -367,11 +381,25 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture(
// Starting RHS coordinates
uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+
+#ifdef BATCHED_RHS
uint rhs_y = block_id + z * rhs_h;
+#else // BATCHED_RHS
+ uint rhs_y = block_id;
+#endif // BATCHED_RHS
// Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
// Initialize the accumulators
// MMUL extension accumulate the result in F32 for both F32 and F16
@@ -525,4 +553,4 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture(
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
-#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE)
\ No newline at end of file
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE)
diff --git a/src/core/CL/cl_kernels/common/generate_proposals.cl b/src/core/CL/cl_kernels/common/generate_proposals.cl
index 5b8502072a..bfe1922ac2 100644
--- a/src/core/CL/cl_kernels/common/generate_proposals.cl
+++ b/src/core/CL/cl_kernels/common/generate_proposals.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,18 +59,16 @@ __kernel void generate_proposals_compute_all_anchors(
Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
Vector rois = CONVERT_TO_VECTOR_STRUCT(rois);
- const size_t idx = get_global_id(0);
+ const unsigned int idx = get_global_id(0);
// Find the index of the anchor
- const size_t anchor_idx = idx % NUM_ANCHORS;
+ const unsigned int anchor_idx = idx % NUM_ANCHORS;
// Find which shift is this thread using
- const size_t shift_idx = idx / NUM_ANCHORS;
+ const unsigned int shift_idx = idx / NUM_ANCHORS;
// Compute the shift on the X and Y direction (the shift depends exclusively by the index thread id)
- const DATA_TYPE
- shift_x = (DATA_TYPE)(shift_idx % WIDTH) * STRIDE;
- const DATA_TYPE
- shift_y = (DATA_TYPE)(shift_idx / WIDTH) * STRIDE;
+ const float shift_x = (float)(shift_idx % WIDTH) * STRIDE;
+ const float shift_y = (float)(shift_idx / WIDTH) * STRIDE;
const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
index 2d1a13cb33..5904e1a06f 100644
--- a/src/core/GPUTarget.cpp
+++ b/src/core/GPUTarget.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,23 @@
namespace
{
+
+arm_compute::GPUTarget get_fifth_gen_target(const std::string &version)
+{
+ if (version.find("G720") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G720;
+ }
+ else if (version.find("G620") != std::string::npos)
+ {
+ return arm_compute::GPUTarget::G620;
+ }
+ else
+ {
+ return arm_compute::GPUTarget::UNKNOWN;
+ }
+}
+
arm_compute::GPUTarget get_valhall_target(const std::string &version)
{
if (version.find("G77") != std::string::npos)
@@ -152,16 +169,18 @@ namespace arm_compute
const std::string &string_from_target(GPUTarget target)
{
static std::map<GPUTarget, const std::string> gpu_target_map = {
- {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"},
- {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"},
- {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"},
- {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"},
- {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, "g52lit"},
- {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"},
- {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"},
- {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"},
- {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"},
- };
+ {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"},
+ {GPUTarget::FIFTHGEN, "fifthgen"},
+
+ {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"},
+ {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"},
+ {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"},
+ {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, "g52lit"},
+ {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"},
+ {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"},
+ {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"},
+ {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"}, {GPUTarget::G720, "g720"},
+ {GPUTarget::G620, "g620"}};
return gpu_target_map[target];
}
@@ -188,8 +207,13 @@ GPUTarget get_target_from_name(const std::string &device_name)
GPUTarget gpu_target;
if (target == 'G' || is_future_gpu)
{
- // Check for Valhall or Bifrost
- gpu_target = get_valhall_target(version);
+ // Check for Valhall, Bifrost or 5-th Gen
+ gpu_target = get_fifth_gen_target(version);
+ if (gpu_target == GPUTarget::UNKNOWN)
+ {
+ gpu_target = get_valhall_target(version);
+ }
+
if (gpu_target == GPUTarget::UNKNOWN)
{
gpu_target = get_bifrost_target(version);
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index deb89996a9..717fd11485 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -151,128 +151,15 @@ Status validate_arguments(const ITensorInfo *input,
}
} //namespace
-template <typename T, bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win_to_use = window;
- win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(_input, win_to_use);
- Iterator output(_output, win_to_use);
-
- F activation_functor(_act_info);
-
- // Hold information about the current feature map we are iterating.
- // Only compute denominator and constants once per feature map.
- int slice = -1;
-
- const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma =
- (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta =
- (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
- T mean = static_cast<T>(0);
- T var = static_cast<T>(0);
- T gamma = static_cast<T>(1);
- T beta = static_cast<T>(0);
- T denominator = static_cast<T>(0);
-
- auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- auto var_vec = wrapper::vdup_n(var, ExactTagType{});
- auto gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- auto beta_vec = wrapper::vdup_n(beta, ExactTagType{});
- auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
- const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
- execute_window_loop(
- win_to_use,
- [&](const Coordinates &id)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- if (slice != id.z())
- {
- mean = input_mean[id.z()];
- var = input_var[id.z()];
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- if (input_gamma != nullptr)
- {
- gamma = input_gamma[id.z()];
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- }
- if (input_beta != nullptr)
- {
- beta = input_beta[id.z()];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
- }
-
- // Calculate denominator
- denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- denominator = wrapper::vgetlane(denominator_vec, 0);
- slice = id.z();
- }
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator_vec);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if (fused_activation)
- {
- activation_functor(res);
- }
-
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- const T numerator = input_ptr[x] - mean;
- const T x_bar = numerator * denominator;
- T res = beta + x_bar * gamma;
-
- // Perform fused activation
- if (fused_activation)
- {
- activation_functor(res);
- }
-
- // Store results
- *(output_ptr + x) = res;
- }
- },
- input, output);
-}
-
void NEBatchNormalizationLayerKernel::configure_non_fused()
{
switch (_input->info()->data_type())
{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
+ _func = REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused);
break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, false, detail::dummy<float, 4>>;
+ _func = REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused);
break;
default:
ARM_COMPUTE_ERROR("Element size not supported");
@@ -285,29 +172,26 @@ void NEBatchNormalizationLayerKernel::configure_fused()
// NCHW Fused Batched Normalization with activation functions : FP32
static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw = {
{ActivationLayerInfo::ActivationFunction::RELU,
- &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>>},
+ REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_relu)},
{ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>>},
+ REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_brelu)},
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
- &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>>}};
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_lubrelu)}};
+
// NCHW Fused Batched Normalization with activation functions : FP16
static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw = {
{ActivationLayerInfo::ActivationFunction::RELU,
- &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>>},
+ REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_relu)},
{ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>>},
+ REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_brelu)},
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
- &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>>}};
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_lubrelu)}};
switch (_input->info()->data_type())
{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
_func = bn_fused_map_f16_nchw[_act_info.activation()];
break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
_func = bn_fused_map_f32_nchw[_act_info.activation()];
break;
@@ -409,7 +293,7 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo
const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
if (is_nchw)
{
- (this->*_func)(window);
+ (*_func)(window, _input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info);
}
else
{
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 2e8ff0dc9a..679ade0fae 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
@@ -110,31 +110,19 @@ class NEBatchNormalizationLayerKernel : public INEKernel
/** Configure execution function in case of fused activation **/
void configure_fused();
- /** Template function to run batch normalization on fp32
- *
- * @tparam T Specialization data type
- * @tparam fused_activation Boolean that flags if its a fused activation or not
- * @tparam F Activation function functor to run
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template
- void batch_normalization_nchw(const Window &window);
- /** Template function to run batch normalization on fp32 on tensors with NHWC format
- *
- * @tparam T Specialization data type
- * @tparam fused_activation Boolean that flags if its a fused activation or not
- * @tparam F Activation function functor to run
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template
- void batch_normalization_nhwc(const Window &window);
/** Common signature for all the batch normalization functions
*
* @param[in] window Region on which to execute the kernel.
*/
- using BatchNormFunctionPtr = void (NEBatchNormalizationLayerKernel::*)(const Window &window);
+ using BatchNormFunctionPtr = void (*)(const Window &window,
+ ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info);
private:
BatchNormFunctionPtr _func;
@@ -148,4 +136,4 @@ class NEBatchNormalizationLayerKernel : public INEKernel
ActivationLayerInfo _act_info;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index de0079ee60..e0eb5cf202 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
+#include "arm_compute/core/CoreTypes.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
@@ -31,13 +32,10 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/depth_to_space/list.h"
-#include
#include
-using namespace arm_compute::misc::shape_calculator;
-
namespace arm_compute
{
namespace
@@ -70,15 +68,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
} // namespace
NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
+ : _input(nullptr),
+ _output(nullptr),
+ _block_shape(),
+ _data_layout(DataLayout::UNKNOWN),
+ _split_dimension(Window::DimY)
{
}
void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape =
- compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape = misc::shape_calculator::compute_depth_to_space_shape(
+ input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -90,9 +92,31 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output,
_block_shape = block_shape;
_data_layout = input->info()->data_layout();
+ constexpr size_t dim_b = 3;
+ const auto dim_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const auto dim_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const auto dim_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_ERROR_ON(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES) != dim_b);
+
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
+ Steps steps;
+ steps.set(dim_h, block_shape);
+ steps.set(dim_w, block_shape);
+ steps.set(dim_c, output->info()->dimension(dim_c));
+
+ Window win = calculate_max_window(*output->info(), steps);
ICPPKernel::configure(win);
+
+ const auto num_batches = input->info()->tensor_shape().total_size_upper(dim_b);
+ if (num_batches > 1)
+ {
+ _split_dimension = dim_b;
+ }
+ else
+ {
+ _split_dimension = dim_h;
+ }
}
Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
@@ -102,68 +126,80 @@ Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITens
return Status{};
}
+size_t NEDepthToSpaceLayerKernel::get_split_dimension() const
+{
+ return _split_dimension;
+}
+
void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const int depth_size = _input->info()->dimension(idx_channel);
- const int r = (depth_size / (_block_shape * _block_shape));
- const int element_size = _input->info()->element_size();
+ const auto *input_info = _input->info();
+ const auto *output_info = _output->info();
+
+ const auto element_size = input_info->element_size();
+ const auto &input_strides = input_info->strides_in_bytes();
+ const auto &output_strides = output_info->strides_in_bytes();
+
+ const auto &input_shape = input_info->tensor_shape();
- Window slice_out = window.first_slice_window_3D();
+ const uintptr_t k_input_strides[] = {input_strides[0], input_strides[1], input_strides[2], input_strides[3]};
+ const uintptr_t k_output_strides[] = {output_strides[0], output_strides[1], output_strides[2], output_strides[3]};
- // The slice_out slice does not move
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ const uint8_t *k_input_ptr = _input->buffer();
+ uint8_t *k_output_ptr = //
+ _output->buffer() + //
+ window[3].start() * output_strides[3] + //
+ window[2].start() * output_strides[2] + //
+ window[1].start() * output_strides[1] + //
+ window[0].start() * output_strides[0];
- // Main loop for NCHW and NHWC
if (_data_layout == DataLayout::NCHW)
{
- Window slice_in = window.first_slice_window_2D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(
- slice_in,
- [&](const Coordinates &id)
- {
- const int x = id.x();
- const int y = id.y();
-
- const int z = id.z() % r;
- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
- Coordinates output_coords{out_x, out_y, z, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_2D(slice_in));
+ ARM_COMPUTE_ERROR_ON_MSG(window[2].start() != 0 || window[2].end() != window[2].step(),
+ "The window cannot be split in channel dimension");
+
+ const uintptr_t k_input_shape[] = {
+ window.num_iterations(0), //
+ window.num_iterations(1), //
+ input_shape[2], // The window cannot be split in channel dimension.
+ window.num_iterations(3) //
+ };
+
+ k_input_ptr += window[3].start() * input_strides[3] + //
+ window[2].start() * _block_shape * _block_shape * input_strides[2] + //
+ (window[1].start() / _block_shape) * input_strides[1] + //
+ (window[0].start() / _block_shape) * input_strides[0];
+
+ cpu::depth_to_space_nchw_any( //
+ k_input_ptr, k_output_ptr, //
+ k_input_shape, k_input_strides, k_output_strides, //
+ element_size, _block_shape);
}
else
{
- Window slice_in = window.first_slice_window_3D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(
- slice_in,
- [&](const Coordinates &id)
- {
- const int x = id.y();
- const int y = id.z();
-
- const int z = id.x() % r;
- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
- Coordinates output_coords{z, out_x, out_y, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_3D(slice_in));
+ ARM_COMPUTE_ERROR_ON_MSG(window[0].start() != 0 || window[0].end() != window[0].step(),
+ "The window cannot be split in channel dimension");
+
+ const uintptr_t k_input_shape[] = {
+ input_shape[0], // The window cannot be split in channel dimension.
+ window.num_iterations(1), //
+ window.num_iterations(2), //
+ window.num_iterations(3) //
+ };
+
+ k_input_ptr += window[3].start() * input_strides[3] + //
+ (window[2].start() / _block_shape) * input_strides[2] + //
+ (window[1].start() / _block_shape) * input_strides[1] + //
+ window[0].start() * _block_shape * _block_shape * input_strides[0];
+
+ cpu::depth_to_space_nhwc_any( //
+ k_input_ptr, k_output_ptr, //
+ k_input_shape, k_input_strides, k_output_strides, //
+ element_size, _block_shape);
}
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
index 7e18dd88b8..ca431ec5fe 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
#include "src/core/NEON/INEKernel.h"
@@ -68,14 +68,18 @@ class NEDepthToSpaceLayerKernel : public INEKernel
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+ /** Get the dimension the scheduler should use to split. */
+ size_t get_split_dimension() const;
+
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
- DataLayout _data_layout; /**< Data layout of the operation */
+ const ITensor *_input; /**< Source tensor */
+ ITensor *_output; /**< Destination tensor */
+ int32_t _block_shape; /**< Block shape */
+ DataLayout _data_layout; /**< Data layout of the operation */
+ size_t _split_dimension; /**< The dimension the scheduler should use to split the workload. */
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 2c61bda147..8399c6c49d 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/NormalizationHelpers.h"
@@ -37,6 +38,8 @@
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/norm_layer/generic/neon/impl.h"
+#include "src/cpu/kernels/norm_layer/generic/neon/list.h"
namespace arm_compute
{
@@ -91,7 +94,6 @@ void NENormalizationLayerKernel::configure(const ITensor *input,
_input_squared = input_squared;
_output = output;
_norm_info = norm_info;
-
switch (_input->info()->data_type())
{
case DataType::F32:
@@ -102,33 +104,33 @@ void NENormalizationLayerKernel::configure(const ITensor *input,
{
if (norm_info.type() == NormType::IN_MAP_2D)
{
- _func = &NENormalizationLayerKernel::normalize_float;
+ _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_0_2D);
}
else
{
- _func = &NENormalizationLayerKernel::normalize_float;
+ _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_0);
}
break;
}
case 1:
if (norm_info.type() == NormType::IN_MAP_2D)
{
- _func = &NENormalizationLayerKernel::normalize_float;
+ _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_1_2D);
}
else
{
- _func = &NENormalizationLayerKernel::normalize_float;
+ _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_1);
}
break;
case 2:
- _func = &NENormalizationLayerKernel::normalize_float;
+ _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_2);
break;
default:
break;
}
break;
}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ARM_COMPUTE_ENABLE_FP16
case DataType::F16:
{
switch (norm_idx)
@@ -137,33 +139,33 @@ void NENormalizationLayerKernel::configure(const ITensor *input,
{
if (norm_info.type() == NormType::IN_MAP_2D)
{
- _func = &NENormalizationLayerKernel::normalize_float;
+ _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_0_2D);
}
else
{
- _func = &NENormalizationLayerKernel::normalize_float;
+ _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_0);
}
break;
}
case 1:
if (norm_info.type() == NormType::IN_MAP_2D)
{
- _func = &NENormalizationLayerKernel::normalize_float;
+ _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_1_2D);
}
else
{
- _func = &NENormalizationLayerKernel::normalize_float;
+ _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_1);
}
break;
case 2:
- _func = &NENormalizationLayerKernel::normalize_float;
+ _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_2);
break;
default:
break;
}
break;
}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
default:
ARM_COMPUTE_ERROR("NOT SUPPORTED!");
}
@@ -173,124 +175,6 @@ void NENormalizationLayerKernel::configure(const ITensor *input,
INEKernel::configure(win);
}
-template
-void NENormalizationLayerKernel::normalize_float(const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector::tag_type;
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast(window.x().start());
- const auto window_end_x = static_cast(window.x().end());
- const int window_step_x = S;
-
- Iterator input(_input, win);
- Iterator input_squared(_input_squared, win);
- Iterator output(_output, win);
-
- const int dim_y = _input->info()->data_layout() == DataLayout::NCHW ? 1 : 2;
- const int radius = _norm_info.norm_size() / 2;
- const int input_squared_stride_x = _input_squared->info()->strides_in_bytes()[0];
- const int input_squared_stride_slice = _input_squared->info()->strides_in_bytes()[dim];
- const int input_squared_stride_row = _input_squared->info()->strides_in_bytes()[dim_y];
-
- const int max_right = _input->info()->dimension(dim) - 1;
- const int max_bottom = _input->info()->dimension(dim_y) - 1;
-
- const auto coeff_vec = wrapper::vdup_n(static_cast(_norm_info.scale_coeff()), ExactTagType{});
- const auto beta_vec = wrapper::vdup_n(static_cast(_norm_info.beta()), ExactTagType{});
- const auto kappa_vec = wrapper::vdup_n(static_cast(_norm_info.kappa()), ExactTagType{});
-
- auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row,
- const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr,
- T *output_ptr)
- {
- const int current_slice = dim == 0 ? x : id[dim];
- const int first_slice = std::max(current_slice - radius, 0);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x;
- // Accumulate 2D In-Map values
- auto accu = static_cast(0.f);
- for (int j = first_row; j <= last_row; ++j)
- {
- // Compute row displacement
- const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
- for (int i = first_slice; i <= last_slice; ++i)
- {
- accu +=
- *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
- }
- }
-
- // Normalize
- const auto normalized = std::pow(
- accu * static_cast(_norm_info.scale_coeff()) + static_cast(_norm_info.kappa()), _norm_info.beta());
- const auto normalized_pixel = (*(input_ptr + x)) / normalized;
- *(output_ptr + x) = normalized_pixel;
- };
-
- execute_window_loop(
- win,
- [&](const Coordinates &id)
- {
- const auto input_ptr = reinterpret_cast(input.ptr());
- auto output_ptr = reinterpret_cast(output.ptr());
-
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-
- int x = window_start_x;
- // Compute serially starting elements for the case x dimension is width
- for (; x < radius && x < window_end_x && dim == 0; ++x)
- {
- sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
- output_ptr);
- }
-
- // Compute vectorized
- for (; x <= window_end_x - window_step_x - radius; x += window_step_x)
- {
- const int current_slice = dim == 0 ? x : id[dim];
- const int first_slice = std::max(current_slice - radius, 0);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
- // Accumulate 2D In-Map values
- auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{});
- for (int j = first_row; j <= last_row; ++j)
- {
- // Compute row displacement
- const uint8_t *const input_squared_ptr =
- input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
- for (int i = first_slice; i <= last_slice; ++i)
- {
- accu = wrapper::vadd(
- accu, wrapper::vloadq(reinterpret_cast(
- input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
- }
- }
-
- // Normalize
- const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
- const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
- wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel);
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
- output_ptr);
- }
- },
- input, input_squared, output);
-}
-
Status NENormalizationLayerKernel::validate(const ITensorInfo *input,
const ITensorInfo *input_squared,
const ITensorInfo *output,
@@ -309,6 +193,6 @@ void NENormalizationLayerKernel::run(const Window &window, const ThreadInfo &inf
ARM_COMPUTE_ERROR_ON(_func == nullptr);
// Run function
- (this->*_func)(window);
+ (*_func)(window, _input, _input_squared, _output, _norm_info);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h
index 2d8d9f3d60..5ba4c3edca 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H
#include "src/core/NEON/INEKernel.h"
@@ -82,24 +82,12 @@ class NENormalizationLayerKernel : public INEKernel
void run(const Window &window, const ThreadInfo &info) override;
private:
- /** Function to perform normalization depending on the given template
- * dimension. The second template parameter specifies whether the
- * normalization has to be 1D or 2D.
- *
- * @note Only supported normalizations are:
- * - 1D over X or Z
- * - 2D over X and Y
- *
- * @param[in] window Region on which to execute the kernel.
- */
- template
- void normalize_float(const Window &window);
-
/** Common signature for all the specialised normalization functions
*
* @param[in] window Region on which to execute the kernel.
*/
- using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window);
+ using NormalizationFunction = void (*)(
+ const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo);
private:
NormalizationFunction _func;
@@ -109,4 +97,4 @@ class NENormalizationLayerKernel : public INEKernel
NormalizationLayerInfo _norm_info;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H
diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp
index 6c2c987eb7..f5bea3e163 100644
--- a/src/core/NEON/kernels/NEReorderKernel.cpp
+++ b/src/core/NEON/kernels/NEReorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,17 +54,41 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
{
case WeightFormat::OHWIo4:
{
- arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(
- reinterpret_cast(_output->buffer()) + jump_rows,
- reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ switch (_output->info()->data_type())
+ {
+ case DataType::F32:
+ arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(
+ reinterpret_cast(_output->buffer()) + jump_rows,
+ reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ break;
+ case DataType::BFLOAT16:
+ arm_gemm::Transform<4, 4, true, arm_gemm::VLType::None>(
+ reinterpret_cast(_output->buffer()) + jump_rows,
+ reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
break;
}
#if defined(ARM_COMPUTE_ENABLE_SVE)
case WeightFormat::OHWIo8:
{
- arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(
- reinterpret_cast(_output->buffer()) + jump_rows,
- reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ switch (_output->info()->data_type())
+ {
+ case DataType::F32:
+ arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(
+ reinterpret_cast(_output->buffer()) + jump_rows,
+ reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ break;
+ case DataType::BFLOAT16:
+ arm_gemm::Transform<2, 4, true, arm_gemm::VLType::SVE>(
+ reinterpret_cast(_output->buffer()) + jump_rows,
+ reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
break;
}
#endif /* ARM_COMPUTE_ENABLE_SVE */
@@ -175,7 +199,8 @@ Status NEReorderKernel::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
if (output->tensor_shape().total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != DataType::F32 && output->data_type() != DataType::BFLOAT16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
// Only input WeightFormat OHWI supported
ARM_COMPUTE_RETURN_ERROR_ON(input_wf != arm_compute::WeightFormat::OHWI);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index 72b0fac96a..5c08e6137d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -206,30 +206,6 @@ GemmImplementation::with_estimate(
[](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); },
[](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); }
),
-GemmImplementation::with_estimate(
- GemmMethod::GEMM_INTERLEAVED,
- "a64_ffinterleaved_bf16fp32_mmla_8x12",
- KernelWeightFormat::VL256_BL64,
- [](const GemmArgs &args) { return args._ci->has_bf16(); },
- [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); },
- [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); }
-),
-GemmImplementation::with_estimate(
- GemmMethod::GEMM_INTERLEAVED,
- "a64_ffhybrid_bf16fp32_mmla_6x16",
- KernelWeightFormat::VL256_BL64,
- [](const GemmArgs &args) { return args._ci->has_bf16(); },
- [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat::estimate_cycles(args); },
- [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat(args); }
-),
-GemmImplementation::with_estimate(
- GemmMethod::GEMM_INTERLEAVED,
- "a64_ffinterleaved_bf16fp32_dot_8x12",
- KernelWeightFormat::VL128_BL32,
- [](const GemmArgs &args) { return args._ci->has_bf16(); },
- [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); },
- [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); }
-),
#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
GemmImplementation::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..98200c50c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace {
+
+void a64_transpose_interleave_4_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 4 * roundup(height, 4) * sizeof(bfloat16);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 8f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[width]\n"
+ "mov x27, %x[out]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "add x26, x9, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "cmp x28, #0x8\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q19, [x9], #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "sub x28, x28, #0x8\n"
+ "ldr q17, [x25], #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "cmp x28, #0x8\n"
+ "ldr q1, [x23], #0x10\n"
+ "ldr q0, [x22], #0x10\n"
+ "ldr q31, [x21], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q22, [x26], #0x10\n"
+ "zip1 v30.4s, v19.4s, v17.4s\n"
+ "zip1 v29.4s, v18.4s, v16.4s\n"
+ "ldr q21, [x25], #0x10\n"
+ "ldr q20, [x24], #0x10\n"
+ "zip2 v28.4s, v19.4s, v17.4s\n"
+ "zip2 v27.4s, v18.4s, v16.4s\n"
+ "ldr q19, [x23], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v26.4s, v1.4s, v31.4s\n"
+ "zip1 v25.4s, v0.4s, v24.4s\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip2 v8.4s, v1.4s, v31.4s\n"
+ "zip2 v24.4s, v0.4s, v24.4s\n"
+ "zip1 v7.4s, v23.4s, v21.4s\n"
+ "zip1 v6.4s, v22.4s, v20.4s\n"
+ "zip2 v5.4s, v23.4s, v21.4s\n"
+ "zip2 v4.4s, v22.4s, v20.4s\n"
+ "zip1 v3.4s, v19.4s, v17.4s\n"
+ "zip1 v2.4s, v18.4s, v16.4s\n"
+ "zip2 v1.4s, v19.4s, v17.4s\n"
+ "zip2 v0.4s, v18.4s, v16.4s\n"
+ "zip1 v23.4s, v30.4s, v29.4s\n"
+ "zip1 v22.4s, v28.4s, v27.4s\n"
+ "zip1 v21.4s, v26.4s, v25.4s\n"
+ "zip1 v20.4s, v8.4s, v24.4s\n"
+ "zip1 v19.4s, v7.4s, v6.4s\n"
+ "zip1 v18.4s, v5.4s, v4.4s\n"
+ "zip1 v17.4s, v3.4s, v2.4s\n"
+ "zip1 v16.4s, v1.4s, v0.4s\n"
+ ".inst 0x0ea16aff // bfcvtn v31.4h, v23.4s\n"
+ "zip2 v30.4s, v30.4s, v29.4s\n"
+ ".inst 0x0ea16add // bfcvtn v29.4h, v22.4s\n"
+ "zip2 v28.4s, v28.4s, v27.4s\n"
+ ".inst 0x0ea16abb // bfcvtn v27.4h, v21.4s\n"
+ "zip2 v26.4s, v26.4s, v25.4s\n"
+ ".inst 0x0ea16a99 // bfcvtn v25.4h, v20.4s\n"
+ "zip2 v24.4s, v8.4s, v24.4s\n"
+ ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
+ "zip2 v22.4s, v7.4s, v6.4s\n"
+ ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
+ "zip2 v20.4s, v5.4s, v4.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v3.4s, v2.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v1.4s, v0.4s\n"
+ ".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n"
+ ".inst 0x4ea16b9d // bfcvtn2 v29.8h, v28.4s\n"
+ ".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
+ ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "str q31, [x27, #0x0]\n"
+ "str q29, [x27, #0x10]\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q27, [x27, #0x20]\n"
+ "str q25, [x27, #0x30]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "str q23, [x27, #0x0]\n"
+ "str q21, [x27, #0x10]\n"
+ "str q19, [x27, #0x20]\n"
+ "str q17, [x27, #0x30]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x28, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q25, [x9], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "sub x28, x28, #0x4\n"
+ "ldr q21, [x25], #0x10\n"
+ "ldr q20, [x24], #0x10\n"
+ "cmp x28, #0x4\n"
+ "ldr q23, [x23], #0x10\n"
+ "ldr q19, [x22], #0x10\n"
+ "ldr q18, [x21], #0x10\n"
+ "ldr q17, [x20], #0x10\n"
+ "zip1 v22.4s, v25.4s, v21.4s\n"
+ "zip1 v16.4s, v24.4s, v20.4s\n"
+ "zip2 v21.4s, v25.4s, v21.4s\n"
+ "zip2 v20.4s, v24.4s, v20.4s\n"
+ "zip1 v27.4s, v23.4s, v18.4s\n"
+ "zip1 v26.4s, v19.4s, v17.4s\n"
+ "zip2 v25.4s, v23.4s, v18.4s\n"
+ "zip2 v24.4s, v19.4s, v17.4s\n"
+ "zip1 v19.4s, v22.4s, v16.4s\n"
+ "zip1 v18.4s, v21.4s, v20.4s\n"
+ "zip1 v17.4s, v27.4s, v26.4s\n"
+ "zip2 v23.4s, v22.4s, v16.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ "zip2 v22.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
+ ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v27.4s, v26.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n"
+ ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q21, [x27, #0x0]\n"
+ "str q20, [x27, #0x10]\n"
+ "str q19, [x27, #0x20]\n"
+ "str q17, [x27, #0x30]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cbz x28, 7f\n"
+ "movi v16.16b, #0x0\n"
+ "str q16, [x27, #0x0]\n"
+ "str q16, [x27, #0x10]\n"
+ "str q16, [x27, #0x20]\n"
+ "str q16, [x27, #0x30]\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr s23, [x9], #0x4\n"
+ "ldr s22, [x26], #0x4\n"
+ "sub x28, x28, #0x1\n"
+ "ldr s19, [x25], #0x4\n"
+ "ldr s17, [x24], #0x4\n"
+ "cmp x28, #0x1\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s20, [x22], #0x4\n"
+ "ldr s18, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v19.4s, v23.4s, v19.4s\n"
+ "zip1 v17.4s, v22.4s, v17.4s\n"
+ "zip1 v18.4s, v21.4s, v18.4s\n"
+ "zip1 v16.4s, v20.4s, v16.4s\n"
+ "zip1 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d17, [x27, #0x0]\n"
+ "str d16, [x27, #0x20]\n"
+ "add x27, x27, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: odd col skip
+ "cmp %x[height], #0x8\n"
+ "add %x[out], %x[out], #0x40\n"
+ "bge 1b\n"
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+ "9:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x20, %x[width]\n"
+ "cmp %x[height], #0x3\n"
+ "mov x27, %x[out]\n"
+ "add x26, x9, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "cmp %x[height], #0x1\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "cmp x20, #0x8\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Unroll column loop
+ "ldr q25, [x9], #0x10\n"
+ "ldr q24, [x26], #0x10\n"
+ "sub x20, x20, #0x8\n"
+ "ldr q21, [x25], #0x10\n"
+ "ldr q20, [x24], #0x10\n"
+ "cmp x20, #0x8\n"
+ "ldr q23, [x9], #0x10\n"
+ "ldr q19, [x26], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "ldr q17, [x24], #0x10\n"
+ "zip1 v22.4s, v25.4s, v21.4s\n"
+ "zip1 v16.4s, v24.4s, v20.4s\n"
+ "zip2 v21.4s, v25.4s, v21.4s\n"
+ "zip2 v20.4s, v24.4s, v20.4s\n"
+ "zip1 v27.4s, v23.4s, v18.4s\n"
+ "zip1 v26.4s, v19.4s, v17.4s\n"
+ "zip2 v25.4s, v23.4s, v18.4s\n"
+ "zip2 v24.4s, v19.4s, v17.4s\n"
+ "zip1 v19.4s, v22.4s, v16.4s\n"
+ "zip1 v18.4s, v21.4s, v20.4s\n"
+ "zip1 v17.4s, v27.4s, v26.4s\n"
+ "zip2 v23.4s, v22.4s, v16.4s\n"
+ "zip1 v16.4s, v25.4s, v24.4s\n"
+ "zip2 v22.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
+ ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n"
+ ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
+ "zip2 v18.4s, v27.4s, v26.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v25.4s, v24.4s\n"
+ ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n"
+ ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q21, [x27, #0x0]\n"
+ "str q20, [x27, #0x10]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "str q19, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Unroll column loop skip
+ "cmp x20, #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q21, [x9], #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "sub x20, x20, #0x4\n"
+ "ldr q19, [x25], #0x10\n"
+ "ldr q17, [x24], #0x10\n"
+ "cmp x20, #0x4\n"
+ "zip1 v18.4s, v21.4s, v19.4s\n"
+ "zip1 v16.4s, v20.4s, v17.4s\n"
+ "zip2 v21.4s, v21.4s, v19.4s\n"
+ "zip2 v20.4s, v20.4s, v17.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "zip2 v19.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a32 // bfcvtn v18.4h, v17.4s\n"
+ "zip2 v17.4s, v21.4s, v20.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q18, [x27, #0x0]\n"
+ "str q16, [x27, #0x10]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cbz x20, 15f\n"
+ "movi v16.16b, #0x0\n"
+ "str q16, [x27, #0x0]\n"
+ "str q16, [x27, #0x10]\n"
+ "14:" // Tail row loop: width 1 loop: loop
+ "ldr s19, [x9], #0x4\n"
+ "ldr s18, [x26], #0x4\n"
+ "sub x20, x20, #0x1\n"
+ "ldr s17, [x25], #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "cmp x20, #0x1\n"
+ "zip1 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x27, #0x0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: odd col skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x20\n"
+ "bge 9b\n"
+ "16:" // Done
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<4, 4, true, VLType::None>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_4_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp
index c066c01bab..1e6c3d35f4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023,2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,6 +42,7 @@
#include "sve_transpose_interleave_12VL_2x4_fp32bf16.hpp"
#include "sve_transpose_interleave_1VL_1x4.hpp"
#include "sve_transpose_interleave_1VL.hpp"
+#include "sve_transpose_interleave_2VL_2x4_fp32bf16.hpp"
#include "sve_transpose_interleave_3VL_1x4.hpp"
#include "sve_transpose_interleave_3VL_2x2.hpp"
#include "sve_transpose_interleave_3VL.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index adbaa6cf2f..1ce319efee 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020,2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,6 +44,7 @@
#include "a64_transpose_interleave_32_2x2.hpp"
#include "a64_transpose_interleave_4_1x16.hpp"
#include "a64_transpose_interleave_4_1x4.hpp"
+#include "a64_transpose_interleave_4_2x4_fp32bf16.hpp"
#include "a64_transpose_interleave_48.hpp"
#include "a64_transpose_interleave_64.hpp"
#include "a64_transpose_interleave_96.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..f66fcdc994
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace {
+
+void sve_transpose_interleave_2VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 2 * roundup(height, 4) * get_vector_length<float>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "mov x25, %x[width]\n"
+ "cnth x24\n"
+ "cmp %x[height], #0x3\n"
+ "mov x23, %x[out]\n"
+ "add x22, x26, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "csel x20, x20, %x[pad_row], GT\n"
+ "csel x21, x21, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "cmp x25, x24\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z18.s }, p1/Z, [x26]\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "sub x25, x25, x24\n"
+ "ld1w { z21.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "cmp x25, x24\n"
+ "addvl x26, x26, #2\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "addvl x21, x21, #2\n"
+ "zip1 z19.s, z18.s, z17.s\n"
+ "zip2 z18.s, z18.s, z17.s\n"
+ "ld1w { z25.s }, p1/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "addvl x22, x22, #2\n"
+ "zip1 z17.s, z21.s, z16.s\n"
+ "zip2 z16.s, z21.s, z16.s\n"
+ "addvl x20, x20, #2\n"
+ ".inst 0x658aa677 // bfcvt z23.h, p1/M, z19.s\n"
+ "zip1 z22.s, z26.s, z20.s\n"
+ ".inst 0x658aa655 // bfcvt z21.h, p1/M, z18.s\n"
+ "zip2 z20.s, z26.s, z20.s\n"
+ ".inst 0x658aa633 // bfcvt z19.h, p1/M, z17.s\n"
+ "zip1 z18.s, z25.s, z24.s\n"
+ ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n"
+ "zip2 z16.s, z25.s, z24.s\n"
+ ".inst 0x648aa6d7 // bfcvtnt z23.h, p1/M, z22.s\n"
+ ".inst 0x648aa695 // bfcvtnt z21.h, p1/M, z20.s\n"
+ ".inst 0x648aa653 // bfcvtnt z19.h, p1/M, z18.s\n"
+ ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n"
+ "st1h { z23.h }, p1, [x23]\n"
+ "st1h { z21.h }, p1, [x23, #1, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "st1h { z19.h }, p1, [x23]\n"
+ "st1h { z17.h }, p1, [x23, #1, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x25, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.s, XZR, x25\n"
+ "decd x25, ALL, MUL #2\n"
+ "ld1w { z19.s }, p0/Z, [x26]\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "addvl x20, x20, #1\n"
+ "cmp x25, #0x0\n"
+ "zip1 z17.s, z19.s, z16.s\n"
+ "zip2 z16.s, z19.s, z16.s\n"
+ "zip1 z19.s, z20.s, z18.s\n"
+ "zip2 z18.s, z20.s, z18.s\n"
+ ".inst 0x658aa631 // bfcvt z17.h, p1/M, z17.s\n"
+ ".inst 0x658aa610 // bfcvt z16.h, p1/M, z16.s\n"
+ ".inst 0x648aa671 // bfcvtnt z17.h, p1/M, z19.s\n"
+ ".inst 0x648aa650 // bfcvtnt z16.h, p1/M, z18.s\n"
+ "st1h { z17.h }, p1, [x23]\n"
+ "st1h { z16.h }, p1, [x23, #1, MUL VL]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #2\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ );
+}
+
+} // anonymous namespace
+template<>
+void Transform<2, 4, true, VLType::SVE>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_2VL_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h
index cbf540bd71..c619788125 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/list.h
+++ b/src/core/NEON/kernels/batchnormalization/impl/list.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H
-#define SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H
+#define ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H
namespace arm_compute
{
@@ -37,8 +37,23 @@ DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization);
DECLARE_BATCH_NORMALIZATION_KERNEL(fp32_neon_batch_normalization);
DECLARE_BATCH_NORMALIZATION_KERNEL(fp32_sve_batch_normalization);
-#undef DECLARE_ACTIVATION_KERNEL
+#define DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(func_name) \
+ void func_name(const Window &window, ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, \
+ const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo act_info)
+
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_relu);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_brelu);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_lubrelu);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_relu);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_brelu);
+DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_lubrelu);
+
+#undef DECLARE_BATCH_NORMALIZATION_KERNEL
+#undef DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL
+
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H
diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h
index cec437d171..32d38a856c 100644
--- a/src/core/NEON/wrapper/intrinsics/max.h
+++ b/src/core/NEON/wrapper/intrinsics/max.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_WRAPPER_MAX_H
-#define ARM_COMPUTE_WRAPPER_MAX_H
+#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
+#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
#include <arm_neon.h>
@@ -59,6 +59,39 @@ VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#undef VMAX_IMPL
+
+#if defined(__aarch64__)
+// VMAXV: Across vector max
+#define VMAXV_IMPL(stype, vtype, prefix, postfix) \
+ inline stype vmaxv(const vtype &a) \
+ { \
+ return prefix##_##postfix(a); \
+ }
+
+VMAXV_IMPL(uint8_t, uint8x8_t, vmaxv, u8)
+VMAXV_IMPL(int8_t, int8x8_t, vmaxv, s8)
+VMAXV_IMPL(uint16_t, uint16x4_t, vmaxv, u16)
+VMAXV_IMPL(int16_t, int16x4_t, vmaxv, s16)
+VMAXV_IMPL(uint32_t, uint32x2_t, vmaxv, u32)
+VMAXV_IMPL(int32_t, int32x2_t, vmaxv, s32)
+VMAXV_IMPL(float, float32x2_t, vmaxv, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAXV_IMPL(float16_t, float16x4_t, vmaxv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMAXV_IMPL(uint8_t, uint8x16_t, vmaxvq, u8)
+VMAXV_IMPL(int8_t, int8x16_t, vmaxvq, s8)
+VMAXV_IMPL(uint16_t, uint16x8_t, vmaxvq, u16)
+VMAXV_IMPL(int16_t, int16x8_t, vmaxvq, s16)
+VMAXV_IMPL(uint32_t, uint32x4_t, vmaxvq, u32)
+VMAXV_IMPL(int32_t, int32x4_t, vmaxvq, s32)
+VMAXV_IMPL(float, float32x4_t, vmaxvq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAXV_IMPL(float16_t, float16x8_t, vmaxvq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMAXV_IMPL
+#endif // defined(__aarch64__)
} // namespace wrapper
} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MAX_H */
+#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 90a7ac32c0..532d08de92 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -450,8 +450,9 @@ std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLay
const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info);
const auto type_max_value = std::get<1>(get_min_max(data_type)).get<int32_t>();
- const int32_t min_activation =
- act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int;
+ const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ ? std::min(oq_info.offset, type_max_value)
+ : b_int;
const int32_t max_activation =
act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int;
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index 686304b8d7..50b3fc1284 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2022 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_COMMON_REGISTRARS_H
-#define SRC_CORE_COMMON_REGISTRARS_H
+#ifndef ACL_SRC_CORE_COMMON_REGISTRARS_H
+#define ACL_SRC_CORE_COMMON_REGISTRARS_H
#if defined(ENABLE_FP16_KERNELS)
@@ -38,11 +38,11 @@
#define REGISTER_FP16_SVE2(func_name) nullptr
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
-#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(ARM_COMPUTE_ENABLE_NEON)
#define REGISTER_FP16_NEON(func_name) &(func_name)
#else /* !defined(ARM_COMPUTE_ENABLE_NEON) */
#define REGISTER_FP16_NEON(func_name) nullptr
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#else /* !defined(ENABLE_FP16_KERNELS) */
#define REGISTER_FP16_NEON(func_name) nullptr
@@ -179,4 +179,4 @@
#define REGISTER_BF16_NEON(func_name) nullptr
#endif /* defined(ARM_COMPUTE_ENABLE_BF16)*/
-#endif /* SRC_CORE_COMMON_REGISTRARS_H */
+#endif // ACL_SRC_CORE_COMMON_REGISTRARS_H
diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp
new file mode 100644
index 0000000000..06e35eed8c
--- /dev/null
+++ b/src/core/helpers/LUTManager.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/helpers/LUTManager.h"
+
+namespace arm_compute
+{
+#ifdef __aarch64__
+namespace
+{
+
+void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut)
+{
+ union Element
+ {
+ uint16_t i = 0;
+ float16_t fp;
+ } item;
+ // Fill lut by iterating over all 16 bit values using the union.
+ while (true)
+ {
+ (*lut)[item.i] = 1.f / (1.f + std::exp(-item.fp));
+ if (item.i == 65535)
+ break;
+ item.i++;
+ }
+}
+} // namespace
+
+std::shared_ptr<ActivationLayerInfo::LookupTable65536> LUTManager::get_lut_table(LUTInfo info)
+{
+ const auto itr = map_fp16.find(info);
+ auto s_ptr = (itr != map_fp16.end()) ? itr->second.lock() : nullptr; // nullptr if invalid or not found.
+ if (s_ptr != nullptr)
+ {
+ // Found and valid
+ return s_ptr; // Return weak ptr as shared ptr
+ }
+ else
+ {
+ // Not found, or pointer not valid
+ // We do not use make_shared to prevent the weak_ptr keeping the control block alive
+ std::shared_ptr<ActivationLayerInfo::LookupTable65536> ptr(new ActivationLayerInfo::LookupTable65536);
+ init_lut_fp16(ptr.get());
+ map_fp16[info] = ptr;
+ return ptr;
+ }
+}
+#endif // __aarch64__
+
+// Static function to get LutManager instance
+LUTManager &LUTManager::get_instance()
+{
+ static auto inst_ = std::make_unique<LUTManager>(); // The one, single instance.
+ return *inst_;
+}
+
+} // namespace arm_compute
diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h
new file mode 100644
index 0000000000..4e13ead7e3
--- /dev/null
+++ b/src/core/helpers/LUTManager.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CORE_HELPERS_LUTMANAGER_H
+#define ACL_SRC_CORE_HELPERS_LUTMANAGER_H
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include <map>