diff --git a/Android.bp b/Android.bp index 7681205770..f7d4d257d6 100644 --- a/Android.bp +++ b/Android.bp @@ -392,6 +392,7 @@ cc_library_static { "src/core/Utils.cpp", "src/core/Validate.cpp", "src/core/Version.cpp", + "src/core/helpers/LUTManager.cpp", "src/core/helpers/SoftmaxHelpers.cpp", "src/core/helpers/Utils.cpp", "src/core/helpers/WindowHelpers.cpp", @@ -488,6 +489,8 @@ cc_library_static { "src/cpu/kernels/crop/generic/neon/fp16.cpp", "src/cpu/kernels/crop/generic/neon/fp32.cpp", "src/cpu/kernels/crop/generic/neon/integer.cpp", + "src/cpu/kernels/depth_to_space/nchw/any/impl.cpp", + "src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp", @@ -515,6 +518,8 @@ cc_library_static { "src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp", "src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp", "src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp", + "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp", + "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp", "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp", "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp", "src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp", @@ -543,6 +548,10 @@ cc_library_static { "src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp", "src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp", "src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp", + "src/cpu/kernels/mul/generic/neon/fp16.cpp", + "src/cpu/kernels/mul/generic/neon/fp32.cpp", + "src/cpu/kernels/norm_layer/generic/neon/fp16.cpp", + "src/cpu/kernels/norm_layer/generic/neon/fp32.cpp", "src/cpu/kernels/pool2d/neon/fp16.cpp", "src/cpu/kernels/pool2d/neon/fp32.cpp", "src/cpu/kernels/pool2d/neon/nchw/all.cpp", @@ -1033,6 +1042,7 @@ cc_library_static { "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp", "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp", "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp", + "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp", "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp", "utils/CommonGraphOptions.cpp", "utils/GraphUtils.cpp", diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dd3e2cef7..35b6ca2b7f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Arm Limited. +# Copyright (c) 2023-2024 Arm Limited. 
# # SPDX-License-Identifier: MIT # @@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute) project( ArmCompute - VERSION 33.0.0 + VERSION 34.0.0 DESCRIPTION "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures" LANGUAGES C CXX ASM) diff --git a/LICENSE b/LICENSE index 0d2cb83aaa..781685ab31 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2017-2023 Arm Limited +Copyright (c) 2017-2024 Arm Limited Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 9b06dbeabf..71a6518594 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,8 @@ -> **⚠ Important** -> From release 22.05: 'master' branch has been replaced with 'main' following our inclusive language update, more information [here](https://arm-software.github.io/ComputeLibrary/latest/contribution_guidelines.xhtml#S5_0_inc_lang). - -> **⚠ Important** -> From release 22.08: armv7a with Android build will no longer be tested or maintained. +> **⚠ Deprecation Notice** +> 24.01 announcement: NCHW data format specific optimizations will gradually be removed from the code base in +> future releases. The implication of this is that the user is expected to translate NCHW models into NHWC in +> order to benefit from the optimizations. > **⚠ Important** > From release 23.02: The 23.02 release introduces a change to the default tensor extend padding behavior. @@ -16,7 +15,7 @@

-# Compute Library ![](https://img.shields.io/badge/latest_release-23.11-green) +# Compute Library ![](https://img.shields.io/badge/latest_release-24.01-green) The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.
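The NCHW deprecation notice above implies that callers should hand the library NHWC tensors to keep the optimized paths. A minimal sketch of one way to do that translation at runtime with the library's existing NEPermute function follows; it assumes that PermutationVector(2U, 0U, 1U) is the NCHW to NHWC mapping in the library's innermost-first dimension ordering and that configure() auto-initializes the destination tensor info.

```cpp
// Sketch: convert an NCHW tensor to NHWC with NEPermute before running
// NHWC-optimized operators. Assumes PermutationVector(2U, 0U, 1U) is the
// NCHW -> NHWC mapping and that configure() auto-initializes `dst`.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // NCHW source: the shape is stored innermost-first, i.e. (W, H, C, N).
    Tensor     src, dst;
    TensorInfo src_info(TensorShape(224U, 224U, 3U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NCHW);
    src.allocator()->init(src_info);

    NEPermute permute;
    permute.configure(&src, &dst, PermutationVector(2U, 0U, 1U));

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src with image data here ...

    permute.run(); // dst now holds the same data in NHWC order
    return 0;
}
```

If the model can be exported in NHWC in the first place, the runtime permute can be skipped entirely.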
@@ -44,7 +43,7 @@ Key Features:
## Documentation -[![Documentation](https://img.shields.io/badge/documentation-23.11-green)](https://arm-software.github.io/ComputeLibrary/latest) +[![Documentation](https://img.shields.io/badge/documentation-24.01-green)](https://arm-software.github.io/ComputeLibrary/latest) > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc. @@ -57,24 +56,24 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C | Platform | Operating System | Release archive (Download) | | -------------- | ---------------- | -------------------------- | -| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon.tar.gz) | -| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) | -| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) | -| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-neon.tar.gz) | +| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) | +| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) 
[![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon-cl.tar.gz) |
| Architecture | Operating System | Release archive (Download) | | ------------ | ---------------- | -------------------------- | -| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon-cl.tar.gz) | -| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-neon-cl.tar.gz) | -| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) | -| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-neon-cl.tar.gz) | -| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) | +| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-cl.tar.gz) 
[![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-neon-cl.tar.gz) | +| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8a-neon-cl.tar.gz) | +| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8.2-a-neon-cl.tar.gz) | +| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v23.11-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v23.11) +Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.01-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.01) Pre-build binaries are generated with the following security / good coding practices related flags: > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong diff --git a/SConscript b/SConscript index 099ff706ad..96b3bdc689 100644 --- a/SConscript +++ b/SConscript @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -# Copyright (c) 2016-2023 Arm Limited. +# Copyright (c) 2016-2024 Arm Limited. # # SPDX-License-Identifier: MIT # @@ -31,15 +31,8 @@ import zlib import json import codecs -from SCons.Warnings import warn, DeprecatedWarning - -warn(DeprecatedWarning, - "DEPRECATION NOTICE: Legacy libarm_compute_core has been deprecated and is scheduled for removal in 24.02 release." - " Link your application only to libarm_compute for core library functionality" - ) - -VERSION = "v23.11" -LIBRARY_VERSION_MAJOR = 33 +VERSION = "v24.01" +LIBRARY_VERSION_MAJOR = 34 LIBRARY_VERSION_MINOR = 0 LIBRARY_VERSION_PATCH = 0 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH) @@ -89,31 +82,42 @@ def build_obj_list(arch_info, sources, static=False): # A list of static objects # A list of shared objects -def build_lib_objects(): +def build_multiisa_lib_objects(): lib_static_objs = [] # static objects lib_shared_objs = [] # shared objects + # note that ARM_COMPUTE_ENABLE_FP16 is enabled in update_data_type_layout_flags() to make + # sure the environment is progated to the validation suite arm_compute_env.Append(CPPDEFINES = ['ENABLE_NEON', 'ARM_COMPUTE_ENABLE_NEON', - 'ENABLE_SVE', 'ARM_COMPUTE_ENABLE_SVE', - 'ARM_COMPUTE_ENABLE_FP16', 'ARM_COMPUTE_ENABLE_BF16', + 'ENABLE_SVE', 'ARM_COMPUTE_ENABLE_SVE','ARM_COMPUTE_ENABLE_BF16', 'ARM_COMPUTE_ENABLE_I8MM', 'ARM_COMPUTE_ENABLE_SVEF32MM']) # Build all the common files for the base architecture if env['arch'] == 'armv8a': - lib_static_objs += build_obj_list(filedefs["armv8-a"], lib_files, static=True) - lib_shared_objs += build_obj_list(filedefs["armv8-a"], lib_files, static=False) + lib_static_objs += build_obj_list(filedefs["armv8-a"], misa_lib_files, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8-a"], misa_lib_files, static=False) else: - lib_static_objs += build_obj_list(filedefs["armv8.2-a"], lib_files, static=True) - lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], lib_files, static=False) + lib_static_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files, static=False) + + # Build the FP16 specific files + lib_static_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files_neon_fp16, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files_neon_fp16, static=False) # Build the SVE specific files - lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], lib_files_sve, static=True) - lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], lib_files_sve, static=False) + lib_static_objs += 
build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve, static=False) + lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve_fp16, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve_fp16, static=False) + # Build the SVE2 specific files arm_compute_env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_SVE2']) - lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], lib_files_sve2, static=True) - lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], lib_files_sve2, static=False) + lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2, static=False) + lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2_fp16, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2_fp16, static=False) + return lib_static_objs, lib_shared_objs @@ -291,29 +295,29 @@ def get_attrs_list(env, data_types, data_layouts): return attrs -def get_operator_backend_files(filelist, operators, backend='', techs=[], attrs=[]): +def get_operator_backend_files(filelist, operators, backend='', techs=[], attrs=[], include_common=True): files = { "common" : [] } - # Early return if filelist is empty if backend not in filelist: return files - # Iterate over operators and create the file lists to compiler for operator in operators: if operator in filelist[backend]['operators']: - files['common'] += filelist[backend]['operators'][operator]["files"]["common"] + if include_common : + files['common'] += filelist[backend]['operators'][operator]["files"]["common"] for tech in techs: if tech in filelist[backend]['operators'][operator]["files"]: # Add tech as a key to dictionary if not there if tech not in files: files[tech] = [] - # Add tech files to the tech file list tech_files = filelist[backend]['operators'][operator]["files"][tech] - files[tech] += tech_files.get('common', []) + if include_common: + files[tech] += tech_files.get('common', []) for attr in attrs: files[tech] += tech_files.get(attr, []) + # Remove duplicates if they exist return {k: list(set(v)) for k,v in files.items()} @@ -615,6 +619,17 @@ if env['opencl']: lib_files_sve = [] lib_files_sve2 = [] +# the variables below are used for the multi_isa builds +# please note that the variables names without the _fp16 suffix +# do not hold any fp16 files. 
+ +misa_lib_files = lib_files +misa_lib_files_sve = [] +misa_lib_files_sve2 = [] +misa_lib_files_neon_fp16 = [] +misa_lib_files_sve_fp16 = [] +misa_lib_files_sve2_fp16 = [] + if env['neon']: # build winograd/depthwise sources for either v7a / v8a arm_compute_env.Append(CPPPATH = ["src/core/NEON/kernels/arm_gemm", @@ -627,8 +642,6 @@ if env['neon']: "arm_compute/core/NEON/kernels/assembly/", "src/cpu/kernels/assembly/"]) - lib_files += filelist['cpu']['common'] - # Setup SIMD file list to include simd = ['neon'] if env['multi_isa']: @@ -643,7 +656,6 @@ if env['neon']: else: attrs = get_attrs_list(env, env['data_type_support'], env['data_layout_support']) - if env['fixed_format_kernels']: attrs.append("fixed_format_kernels") @@ -651,19 +663,46 @@ if env['neon']: cpu_operators = custom_operators if use_custom_ops else filelist['cpu']['operators'].keys() cpu_ops_to_build = resolve_operator_dependencies(filelist, cpu_operators, 'cpu') - cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs) + if env['multi_isa']: + misa_lib_files += filelist['cpu']['common'] - # Shared among ALL CPU files - lib_files += cpu_files.get('common', []) + # For multi_isa builds we need to build fp16 files for armv8.2-a+fp16 so we filter them out of cpu_files removing the attribute fp16 + attrs.remove('fp16') + cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs) - # Arm® Neon™ specific files - lib_files += cpu_files.get('neon', []) + # Shared among ALL CPU files + misa_lib_files += cpu_files.get('common', []) - # SVE files only - lib_files_sve = cpu_files.get('sve', []) + # Arm® Neon™ specific files + misa_lib_files += cpu_files.get('neon', []) - # SVE2 files only - lib_files_sve2 = cpu_files.get('sve2', []) + # Get all the fp16 files + fp16_cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, ['fp16'],False) + + misa_lib_files_neon_fp16 = fp16_cpu_files.get('neon',[]) + misa_lib_files_sve_fp16 = fp16_cpu_files.get('sve',[]) + misa_lib_files_sve2_fp16 = fp16_cpu_files.get('sve2',[]) + + # SVE files only minus FP16 + misa_lib_files_sve = cpu_files.get('sve', []) + + # SVE2 files only minus FP16 + misa_lib_files_sve2 = cpu_files.get('sve2', []) + else: + lib_files += filelist['cpu']['common'] + + # Non multi_isa build + cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs) + + # Shared among ALL CPU files + lib_files += cpu_files.get('common', []) + + # Arm® Neon™ specific files + lib_files += cpu_files.get('neon', []) + + lib_files_sve = cpu_files.get('sve', []) + + lib_files_sve2 = cpu_files.get('sve2', []) graph_files += Glob('src/graph/backends/NEON/*.cpp') @@ -681,7 +720,7 @@ Export('bootcode_o') if (env['multi_isa']): - lib_static_objs, lib_shared_objs = build_lib_objects() + lib_static_objs, lib_shared_objs = build_multiisa_lib_objects() # STATIC library build. 
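The SConscript changes above split the FP16-specific sources into dedicated multi_isa file lists so they can be compiled with an FP16-capable -march while the rest of the library stays on the baseline architecture. Below is an illustrative sketch (not a file from the library) of the kind of guard that keeps such a translation unit safe in that scheme; the macro names are the ones used by these build scripts, while example::add_f16 is a hypothetical kernel.

```cpp
// Illustrative only: an fp16-only translation unit guarded so it compiles to
// nothing unless the build enables FP16 kernels and the compiler advertises
// FP16 vector arithmetic (e.g. -march=armv8.2-a+fp16).
#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

#include <arm_neon.h>

namespace example
{
// Hypothetical helper: add two fp16 buffers with NEON fp16 intrinsics.
void add_f16(const float16_t *a, const float16_t *b, float16_t *dst, int n)
{
    int i = 0;
    for (; i + 8 <= n; i += 8)
    {
        vst1q_f16(dst + i, vaddq_f16(vld1q_f16(a + i), vld1q_f16(b + i)));
    }
    for (; i < n; ++i)
    {
        dst[i] = a[i] + b[i]; // scalar tail
    }
}
} // namespace example

#endif // ENABLE_FP16_KERNELS && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
```

Because the guarded code only exists when the compiler flags allow it, the same source tree can be built either as a single-architecture library or as the multi_isa variant configured above.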
@@ -708,18 +747,6 @@ if env['os'] != 'bare_metal' and not env['standalone']: Export('arm_compute_so') -# Generate dummy core lib for backwards compatibility -if env['os'] == 'macos': - # macos static library archiver fails if given an empty list of files - arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, lib_files, static=True) -else: - arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, [], static=True) - -Export('arm_compute_core_a') - -if env['os'] != 'bare_metal' and not env['standalone']: - arm_compute_core_a_so = build_library('arm_compute_core', arm_compute_env, [], static=False) - Export('arm_compute_core_a_so') arm_compute_graph_env = arm_compute_env.Clone() diff --git a/SConstruct b/SConstruct index 68c518a4a0..cf8fb52bd6 100644 --- a/SConstruct +++ b/SConstruct @@ -62,8 +62,14 @@ def read_build_config_json(build_config): def update_data_type_layout_flags(env, data_types, data_layouts): # Manage data-types - if any(i in data_types for i in ['all', 'fp16']): - env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS']) + if env['multi_isa']: + if any(i in data_types for i in ['all', 'fp16']): + env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS', '-DARM_COMPUTE_ENABLE_FP16']) + else: + if not 'v8a' in env['arch'] and not 'v7a' in env['arch'] and not 'armv8r64' in env['arch']: + if any(i in data_types for i in ['all', 'fp16']): + env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS','-DARM_COMPUTE_ENABLE_FP16']) + if any(i in data_types for i in ['all', 'fp32']): env.Append(CXXFLAGS = ['-DENABLE_FP32_KERNELS']) if any(i in data_types for i in ['all', 'qasymm8']): @@ -112,7 +118,7 @@ vars.AddVariables( BoolVariable("exceptions", "Enable/disable C++ exception support", True), BoolVariable("high_priority", "Generate a library containing only the high priority operators", False), PathVariable("linker_script", "Use an external linker script", "", PathVariable.PathAccept), - PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure: + PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure: EXTERNAL_TESTS_DIR: └── tests ├── benchmark @@ -240,7 +246,6 @@ env.Append(CXXFLAGS = ['-DARCH_ARM', if not 'windows' in env['os']: env.Append(CXXFLAGS = ['-Wall','-std=c++14', '-pedantic' ]) -env.Append(CPPDEFINES = ['_GLIBCXX_USE_NANOSLEEP']) cpp_tool = {'linux': 'g++', 'android' : 'clang++', 'tizen': 'g++', 'macos':'clang++', @@ -312,8 +317,7 @@ if env['multi_isa']: Exit(1) if 'v8a' in env['arch']: - print("INFO: multi_isa armv8-a architecture build doesn't enable __ARM_FEATURE_FP16_VECTOR_ARITHMETIC. 
Use armv8.2-a or beyond to enable FP16 vector arithmetic support") - env.Append(CXXFLAGS = ['-march=armv8-a']) # note: this will disable fp16 extension __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + env.Append(CXXFLAGS = ['-march=armv8-a']) else: if 'v8.6-a' in env['arch']: if "disable_mmla_fp" not in env['custom_options']: @@ -536,7 +540,7 @@ if env['standalone']: if not 'windows' in env['os']: env.Append(CXXFLAGS = ['-fPIC']) env.Append(LINKFLAGS = ['-static-libgcc','-static-libstdc++']) - + if env['Werror']: env.Append(CXXFLAGS = ['-Werror']) @@ -597,7 +601,7 @@ if env['debug']: else: env.Append(CXXFLAGS = ['-Z7','-MTd','-fms-compatibility','-fdelayed-template-parsing']) env.Append(LINKFLAGS = ['-DEBUG']) - + env.Append(CPPDEFINES = ['ARM_COMPUTE_DEBUG_ENABLED']) else: if not 'windows' in env['os']: diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index a5c4e39df2..8b5bf97099 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_OPENCL_H -#define ARM_COMPUTE_OPENCL_H +#ifndef ACL_ARM_COMPUTE_CORE_CL_OPENCL_H +#define ACL_ARM_COMPUTE_CORE_CL_OPENCL_H #include #include @@ -139,6 +139,7 @@ class CLSymbols final DECLARE_FUNCTION_PTR(clWaitForEvents); DECLARE_FUNCTION_PTR(clCreateImage); DECLARE_FUNCTION_PTR(clSetKernelExecInfo); + DECLARE_FUNCTION_PTR(clGetExtensionFunctionAddressForPlatform); // Command buffer and mutable dispatch command buffer extensions DECLARE_FUNCTION_PTR(clCreateCommandBufferKHR); @@ -159,4 +160,4 @@ class CLSymbols final std::pair _loaded; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_OPENCL_H */ +#endif // ACL_ARM_COMPUTE_CORE_CL_OPENCL_H diff --git a/arm_compute/core/GPUTarget.h b/arm_compute/core/GPUTarget.h index affa79a89e..b107a52d9f 100644 --- a/arm_compute/core/GPUTarget.h +++ b/arm_compute/core/GPUTarget.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_GPUTARGET_H -#define ARM_COMPUTE_GPUTARGET_H +#ifndef ACL_ARM_COMPUTE_CORE_GPUTARGET_H +#define ACL_ARM_COMPUTE_CORE_GPUTARGET_H #include "support/Traits.h" @@ -39,6 +39,7 @@ enum class GPUTarget MIDGARD = 0x100, BIFROST = 0x200, VALHALL = 0x300, + FIFTHGEN = 0X400, T600 = 0x110, T700 = 0x120, T800 = 0x130, @@ -62,6 +63,8 @@ enum class GPUTarget G310 = 0x343, G715 = 0x350, G615 = 0x351, + G720 = 0x410, + G620 = 0X411 }; /** Enable bitwise operations on GPUTarget enumerations */ @@ -114,4 +117,4 @@ inline bool gpu_target_is_in(GPUTarget target_to_check, GPUTarget target) return target_to_check == target; } } // namespace arm_compute -#endif /* ARM_COMPUTE_GPUTARGET_H */ +#endif // ACL_ARM_COMPUTE_CORE_GPUTARGET_H diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 86dcfdc3d0..e97d81390e 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -60,7 +60,14 @@ inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordin { // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); +#pragma GCC diagnostic pop + for (int i = 0; i < reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h index 195b67cf99..9390d0c54f 100644 --- a/arm_compute/function_info/ActivationLayerInfo.h +++ b/arm_compute/function_info/ActivationLayerInfo.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO -#define ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO +#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H +#define ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H #include "arm_compute/core/CoreTypes.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/QuantizationInfo.h" #include <array> +#include <memory> + +#ifdef __aarch64__ +#include <arm_neon.h> +#endif // __arch64__ namespace arm_compute { @@ -58,7 +64,10 @@ class ActivationLayerInfo typedef arm_compute::ActivationFunction ActivationFunction; /** Lookup table */ - using LookupTable256 = std::array<qasymm8_t, 256>; +#ifdef __aarch64__ + using LookupTable256 = std::array<qasymm8_t, 256>; + using LookupTable65536 = std::array<float16_t, 65536>; +#endif // __aarch64__ ActivationLayerInfo() = default; /** Default Constructor @@ -101,6 +110,16 @@ class ActivationLayerInfo { _lut = std::move(lut); } + + const LookupTable65536 &lut_fp16() const + { + ARM_COMPUTE_ERROR_ON(_lut_fp16 == nullptr); + return *_lut_fp16; + } + void setLookupTable65536(std::shared_ptr<LookupTable65536> lut) + { + _lut_fp16 = lut; + } #endif // __aarch64__ private: ActivationFunction _act = {ActivationLayerInfo::ActivationFunction::IDENTITY}; @@ -109,8 +128,9 @@ class ActivationLayerInfo bool _enabled = {false}; #ifdef __aarch64__ - LookupTable256 _lut = {}; + LookupTable256 _lut = {}; + std::shared_ptr<LookupTable65536> _lut_fp16{nullptr}; #endif // __aarch64__ }; } // namespace arm_compute -#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO */ +#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h index c7df29a704..d27369670e 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,25 +21,27 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H -#define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include namespace arm_compute { // Forward declarations class ITensor; class ITensorInfo; +class NEDepthToSpaceLayerKernel; /** Basic function to run @ref NEDepthToSpaceLayerKernel. */ -class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder +class NEDepthToSpaceLayer : public IFunction { public: /** Constructor */ - NEDepthToSpaceLayer() = default; + NEDepthToSpaceLayer(); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDepthToSpaceLayer(const NEDepthToSpaceLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -49,7 +51,7 @@ class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder /** Prevent instances of this class from being moved (As this class contains non movable objects) */ NEDepthToSpaceLayer &operator=(NEDepthToSpaceLayer &&) = delete; /** Default destructor */ - ~NEDepthToSpaceLayer() = default; + ~NEDepthToSpaceLayer(); /** Set the input and output tensors. * * Valid data layouts: @@ -75,6 +77,11 @@ class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + + void run() override; + +private: + std::unique_ptr _kernel; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h index bd29cbb31f..7c83f86caa 100644 --- a/arm_compute/runtime/Scheduler.h +++ b/arm_compute/runtime/Scheduler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2019, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_SCHEDULER_H -#define ARM_COMPUTE_SCHEDULER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H +#define ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H #include "arm_compute/runtime/IScheduler.h" @@ -81,4 +81,4 @@ class Scheduler Scheduler(); }; } // namespace arm_compute -#endif /* ARM_COMPUTE_SCHEDULER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H diff --git a/cmake/Options.cmake b/cmake/Options.cmake index bc51cbbc0d..e5c8cb8efe 100644 --- a/cmake/Options.cmake +++ b/cmake/Options.cmake @@ -116,4 +116,4 @@ endif() if(ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS) add_definitions(-DARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS) endif() -add_definitions(-D_GLIBCXX_USE_NANOSLEEP) \ No newline at end of file +add_definitions(-D_GLIBCXX_USE_NANOSLEEP) diff --git a/docs/Doxyfile b/docs/Doxyfile index 0b2f32ad1a..0d8654944d 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "Compute Library" # could be handy for archiving the generated documentation or if some version # control system is used. 
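The LookupTable65536 type added to ActivationLayerInfo earlier in this patch backs the LUT-based FP16 sigmoid called out in the changelog: every possible fp16 bit pattern gets a precomputed result, so applying the activation becomes a single table read per element. The snippet below is a rough, self-contained sketch of how such a table could be populated; it is aarch64-only because of float16_t, build_sigmoid_lut() is not a library API, and the library's own table construction appears to go through the new LUTManager helper added in this patch.

```cpp
// Sketch of the LUT idea: precompute sigmoid for all 2^16 fp16 bit patterns.
#include <arm_neon.h> // float16_t (aarch64)
#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>

using LookupTable65536 = std::array<float16_t, 65536>;

LookupTable65536 build_sigmoid_lut() // hypothetical helper, not a library call
{
    LookupTable65536 lut{};
    for (uint32_t i = 0; i < 65536; ++i)
    {
        const uint16_t bits = static_cast<uint16_t>(i);
        float16_t      in;
        std::memcpy(&in, &bits, sizeof(bits)); // reinterpret the bit pattern as fp16
        const float x = static_cast<float>(in); // widen for the math
        lut[i]        = static_cast<float16_t>(1.0f / (1.0f + std::exp(-x)));
    }
    return lut;
}
// A kernel can then map each input element to lut[bit_pattern(element)].
```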
-PROJECT_NUMBER = 23.11 +PROJECT_NUMBER = 24.01 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/user_guide/how_to_build_and_run_examples.dox b/docs/user_guide/how_to_build_and_run_examples.dox index 4da26d31bc..775cb6abbe 100644 --- a/docs/user_guide/how_to_build_and_run_examples.dox +++ b/docs/user_guide/how_to_build_and_run_examples.dox @@ -76,21 +76,21 @@ The examples get automatically built by scons as part of the build process of th To cross compile a Arm® Neon™ example for Linux 32bit: - arm-linux-gnueabihf-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o neon_cnn + arm-linux-gnueabihf-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -o neon_cnn To cross compile a Arm® Neon™ example for Linux 64bit: - aarch64-linux-gnu-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o neon_cnn + aarch64-linux-gnu-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -o neon_cnn (notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different) To cross compile an OpenCL example for Linux 32bit: - arm-linux-gnueabihf-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL + arm-linux-gnueabihf-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -o cl_sgemm -DARM_COMPUTE_CL To cross compile an OpenCL example for Linux 64bit: - aarch64-linux-gnu-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL + aarch64-linux-gnu-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -o cl_sgemm -DARM_COMPUTE_CL (notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different) @@ -98,43 +98,43 @@ To cross compile the examples with the Graph API, such as graph_lenet.cpp, you n i.e. to cross compile the "graph_lenet" example for Linux 32bit: - arm-linux-gnueabihf-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet + arm-linux-gnueabihf-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet i.e. to cross compile the "graph_lenet" example for Linux 64bit: - aarch64-linux-gnu-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet + aarch64-linux-gnu-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. 
-larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet (notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different) -@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute, arm_compute_core +@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute To compile natively (i.e directly on an Arm device) for Arm® Neon™ for Linux 32bit: - g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -larm_compute_core -o neon_cnn + g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -o neon_cnn To compile natively (i.e directly on an Arm device) for Arm® Neon™ for Linux 64bit: - g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o neon_cnn + g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -o neon_cnn (notice the only difference with the 32 bit command is that we don't need the -mfpu option) To compile natively (i.e directly on an Arm device) for OpenCL for Linux 32bit or Linux 64bit: - g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL + g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -o cl_sgemm -DARM_COMPUTE_CL To compile natively the examples with the Graph API, such as graph_lenet.cpp, you need to link the examples against arm_compute_graph.so too. i.e. to natively compile the "graph_lenet" example for Linux 32bit: - g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet + g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet i.e. to natively compile the "graph_lenet" example for Linux 64bit: - g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet + g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet (notice the only difference with the 32 bit command is that we don't need the -mfpu option) -@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute, arm_compute_core +@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute @note These two commands assume libarm_compute.so is available in your library path, if not add the path to it using -L (e.g. -Llib/linux-armv8a-neon-cl-asserts/) @note You might need to export the path to OpenCL library as well in your LD_LIBRARY_PATH if Compute Library was built with OpenCL enabled. @@ -265,23 +265,23 @@ Once you've got your Android standalone toolchain built and added to your path y To cross compile a Arm® Neon™ example: #32 bit: - arm-linux-androideabi-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. 
-Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_cnn_arm -static-libstdc++ -pie + arm-linux-androideabi-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o neon_cnn_arm -static-libstdc++ -pie #64 bit: - aarch64-linux-android-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_cnn_aarch64 -static-libstdc++ -pie + aarch64-linux-android-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o neon_cnn_aarch64 -static-libstdc++ -pie To cross compile an OpenCL example: #32 bit: - arm-linux-androideabi-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_sgemm_arm -static-libstdc++ -pie -DARM_COMPUTE_CL + arm-linux-androideabi-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o cl_sgemm_arm -static-libstdc++ -pie -DARM_COMPUTE_CL #64 bit: - aarch64-linux-android-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_sgemm_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL + aarch64-linux-android-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o cl_sgemm_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL To cross compile the examples with the Graph API, such as graph_lenet.cpp, you need to link the library arm_compute_graph also. #32 bit: - arm-linux-androideabi-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_arm -static-libstdc++ -pie -DARM_COMPUTE_CL + arm-linux-androideabi-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -L. -o graph_lenet_arm -static-libstdc++ -pie -DARM_COMPUTE_CL #64 bit: - aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL + aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL @note Due to some issues in older versions of the Arm® Mali™ OpenCL DDK (<= r13p0), we recommend to link arm_compute statically on Android. 
@note When linked statically the arm_compute_graph library currently needs the --whole-archive linker flag in order to work properly diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index 11731e5a33..40ad09fd84 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -41,6 +41,18 @@ If there is more than one release in a month then an extra sequential number is @section S2_2_changelog Changelog +v24.01 Public major release + - Remove the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose. + You should link only to the main `libarm_compute` library for core functionality. + - Expand GPUTarget list with Mali™ G720 and G620. + - Optimize CPU activation functions using LUT-based implementation: + - Sigmoid function for FP16. + - New features + - Add support for FP16 in all multi_isa builds. + - Performance optimizations: + - Optimize @ref NESoftmaxLayer + - Optimize @ref NEDepthToSpaceLayer. + v23.11 Public major release - New features - Add support for input data type U64/S64 in CLCast and NECast. @@ -432,8 +444,8 @@ v21.02 Public major release - @ref NEActivationLayer - @ref NEArithmeticAddition - @ref NEBatchNormalizationLayerKernel - - @ref cpu::kernels::CpuLogits1DSoftmaxKernel - - @ref cpu::kernels::CpuLogits1DMaxKernel + - cpu::kernels::CpuLogits1DSoftmaxKernel + - cpu::kernels::CpuLogits1DMaxKernel - @ref cpu::kernels::CpuElementwiseUnaryKernel - Remove padding from OpenCL kernels: - CLDirectConvolutionLayerKernel diff --git a/examples/SConscript b/examples/SConscript index bfac9deb2b..16f31d93d4 100644 --- a/examples/SConscript +++ b/examples/SConscript @@ -38,15 +38,14 @@ utils = examples_env.Object("../utils/Utils.cpp") if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']: Import('arm_compute_graph_a') Import('arm_compute_a') - Import('arm_compute_core_a') - arm_compute_libs = [ arm_compute_a, arm_compute_core_a ] + arm_compute_libs = [ arm_compute_a ] arm_compute_graph_libs = arm_compute_libs # The graph library needs to be linked separately with --whole-archive arm_compute_dependency = arm_compute_a graph_dependency = [arm_compute_graph_a] else: Import('arm_compute_graph_so') Import('arm_compute_so') - arm_compute_libs = ["arm_compute", "arm_compute_core"] + arm_compute_libs = ["arm_compute"] arm_compute_graph_libs = [ "arm_compute_graph" ] + arm_compute_libs arm_compute_dependency = arm_compute_so graph_dependency = [arm_compute_graph_so] diff --git a/examples/graph_ssd_mobilenet.cpp b/examples/graph_ssd_mobilenet.cpp index 5162fe6890..6218d47dd6 100644 --- a/examples/graph_ssd_mobilenet.cpp +++ b/examples/graph_ssd_mobilenet.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/graph.h" #include "support/ToolchainSupport.h" @@ -757,7 +758,8 @@ class GraphSSDMobilenetExample : public Example std::move(conv_16_2_class_pre), std::move(conv_17_2_class_pre)) .set_name("ClassPrediction/concat"); - const QuantizationInfo logistic_out_qinfo = QuantizationInfo(0.00390625f, 0); + const QuantizationInfo logistic_out_qinfo = QuantizationInfo( + 0.00390625f, quantization::get_min_max_values_from_quantized_data_type(common_params.data_type).first); class_pred << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC), logistic_out_qinfo) .set_name("ClassPrediction/logistic"); diff --git a/filelist.json b/filelist.json index 5bca2419d6..7c530f3f33 100644 --- a/filelist.json +++ b/filelist.json @@ -14,6 +14,7 @@ "src/core/Error.cpp", "src/core/GPUTarget.cpp", "src/core/Helpers.cpp", + "src/core/helpers/LUTManager.cpp", "src/core/IAccessWindow.cpp", "src/core/IKernel.cpp", "src/core/ITensor.cpp", @@ -532,7 +533,8 @@ "src/gpu/cl/operators/ClMatMul.cpp", "src/runtime/CL/functions/CLMatMul.cpp", "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp", - "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp" + "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp", + "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp" ] } }, @@ -982,12 +984,15 @@ "fp16": [ "src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp", "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp", - "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp" + "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp", + "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp" + ], "fp32": [ "src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp", "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp", - "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp" + "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp", + "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp" ] }, "sve": { @@ -1122,27 +1127,20 @@ "src/core/NEON/kernels/convolution/common/qasymm8.cpp", "src/core/NEON/kernels/convolution/common/qsymm8.cpp", "src/core/NEON/kernels/convolution/common/utils.cpp", - "src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp", "src/core/NEON/kernels/convolution/winograd/input_transforms_fp32.cpp", - "src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp", - "src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp", "src/core/NEON/kernels/convolution/winograd/weight_transforms_fp32.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_fp16.cpp", "src/core/NEON/kernels/convolution/winograd/winograd_fp32.cpp", - "src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp", "src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp32_6x6.cpp", "src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp", "src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_4x4.cpp", "src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_6x6.cpp", - "src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp", 
"src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_3x3.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_5x5.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_4x4_3x3.cpp", - "src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp", "src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_3x3.cpp", "src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_5x5.cpp", "src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_4x4_3x3.cpp", @@ -1159,6 +1157,13 @@ ], "fp16": [ "src/cpu/kernels/directconv2d/nchw/fp16.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_fp16.cpp", + "src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp", + "src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp", + "src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp", + "src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp", + "src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp", + "src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp", "src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp" ] }, @@ -1214,7 +1219,9 @@ "files": { "common": [ "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp", - "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp" + "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp", + "src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp", + "src/cpu/kernels/depth_to_space/nchw/any/impl.cpp" ] } }, @@ -1241,7 +1248,6 @@ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp", "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp", "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", @@ -1252,18 +1258,6 @@ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp", - 
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", @@ -1300,7 +1294,22 @@ "src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp" ], - "fp16":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp"], + "fp16":[ + "src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp" + ], "fp32":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp"], "qasymm8":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp"], "qasymm8_signed":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp"] @@ -1820,6 +1829,11 @@ "qasymm8": ["src/cpu/kernels/lut/generic/neon/u8.cpp"], "qasymm8_signed": ["src/cpu/kernels/lut/generic/neon/u8.cpp"] }, + "sve": { + "fp16": ["src/cpu/kernels/lut/generic/sve/u16.cpp"], + "qasymm16": ["src/cpu/kernels/lut/generic/sve/u16.cpp"], + "qasymm16_signed": ["src/cpu/kernels/lut/generic/sve/u16.cpp"] + }, "sve2": { "qasymm8": ["src/cpu/kernels/lut/generic/sve2/u8.cpp"], "qasymm8_signed": ["src/cpu/kernels/lut/generic/sve2/u8.cpp"] @@ 
-1902,7 +1916,11 @@ "src/cpu/operators/CpuMul.cpp", "src/cpu/kernels/CpuMulKernel.cpp", "src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp" - ] + ], + "neon":{ + "fp16":["src/cpu/kernels/mul/generic/neon/fp16.cpp"], + "fp32":["src/cpu/kernels/mul/generic/neon/fp32.cpp"] + } } }, "Normalize": { @@ -1911,7 +1929,11 @@ "common": [ "src/core/NEON/kernels/NENormalizationLayerKernel.cpp", "src/runtime/NEON/functions/NENormalizationLayer.cpp" - ] + ], + "neon":{ + "fp16":["src/cpu/kernels/norm_layer/generic/neon/fp16.cpp"], + "fp32":["src/cpu/kernels/norm_layer/generic/neon/fp32.cpp"] + } } }, "Pad": { @@ -1943,16 +1965,11 @@ "neon": { "common": [ "src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", @@ -1969,7 +1986,14 @@ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp" ], "nchw": [ "src/cpu/kernels/pool2d/neon/nchw/all.cpp" ], - "fp16": [ "src/cpu/kernels/pool2d/neon/fp16.cpp" ], + "fp16": [ + "src/cpu/kernels/pool2d/neon/fp16.cpp", + "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp" + ], "fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ], "qasymm8":[ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ], "qasymm8_signed":["src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp"] @@ -2198,16 +2222,10 @@ "qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"] }, "sve": { - "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ], - "fp32": ["src/cpu/kernels/softmax/generic/sve/fp32.cpp"], - "fp16": ["src/cpu/kernels/softmax/generic/sve/fp16.cpp"], - "qasymm8": ["src/cpu/kernels/softmax/generic/sve/qasymm8.cpp" ], - "qasymm8_signed": ["src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] + "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ] }, "sve2":{ - "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"], - "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"], - "qasymm8_signed":["src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] + "common" 
:["src/cpu/kernels/softmax/generic/sve2/impl.cpp"] } } }, diff --git a/scripts/arm_compute_library_nn_driver.go b/scripts/arm_compute_library_nn_driver.go index dda77b55df..2aab2d3fe7 100644 --- a/scripts/arm_compute_library_nn_driver.go +++ b/scripts/arm_compute_library_nn_driver.go @@ -46,6 +46,7 @@ func globalFlags(ctx android.BaseContext) []string { if theArch == "armv8-2a" { cppflags = append(cppflags, "-march=armv8.2-a+fp16") cppflags = append(cppflags, "-DARM_COMPUTE_ENABLE_FP16") + cppflags = append(cppflags, "-DENABLE_FP16_KERNELS") } } } @@ -74,9 +75,6 @@ func globalFlags(ctx android.BaseContext) []string { if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "QSYMM16" { cppflags = append(cppflags, "-DENABLE_QSYMM16_KERNELS") } - if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "FP16" { - cppflags = append(cppflags, "-DENABLE_FP16_KERNELS") - } if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "FP32" { cppflags = append(cppflags, "-DENABLE_FP32_KERNELS") } diff --git a/scripts/format_code.py b/scripts/format_code.py index f1ee7a731c..29dbea7f0d 100755 --- a/scripts/format_code.py +++ b/scripts/format_code.py @@ -262,6 +262,9 @@ def run(self): self.shell.prepend_env("PATH","%s/../bin" % this_dir) for f in self.files: + if not self.skip_copyright: + check_copyright(f) + skip_this_file = False for e in exceptions: if e in f: @@ -272,8 +275,6 @@ def run(self): continue logger.info("Formatting %s" % f) - if not self.skip_copyright: - check_copyright(f) check_license("LICENSE") diff --git a/src/BUILD.bazel b/src/BUILD.bazel index a22632e1f5..9d5ae63484 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -117,9 +117,7 @@ filegroup( "cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp", "cpu/kernels/elementwise_unary/generic/sve2/q8.cpp", "cpu/kernels/lut/generic/sve2/u8.cpp", - "cpu/kernels/softmax/generic/sve2/impl.cpp", - "cpu/kernels/softmax/generic/sve2/qasymm8.cpp", - "cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] + + "cpu/kernels/softmax/generic/sve2/impl.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), @@ -337,16 +335,13 @@ filegroup( "cpu/kernels/elementwise_unary/generic/sve/fp32.cpp", "cpu/kernels/elementwise_unary/generic/sve/impl.cpp", "cpu/kernels/elementwise_unary/generic/sve/integer.cpp", + "cpu/kernels/lut/generic/sve/u16.cpp", "cpu/kernels/scale/sve/fp16.cpp", "cpu/kernels/scale/sve/fp32.cpp", "cpu/kernels/scale/sve/integer.cpp", "cpu/kernels/scale/sve/qasymm8.cpp", "cpu/kernels/scale/sve/qasymm8_signed.cpp", - "cpu/kernels/softmax/generic/sve/fp16.cpp", - "cpu/kernels/softmax/generic/sve/fp32.cpp", - "cpu/kernels/softmax/generic/sve/impl.cpp", - "cpu/kernels/softmax/generic/sve/qasymm8.cpp", - "cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] + + "cpu/kernels/softmax/generic/sve/impl.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), @@ -643,6 +638,7 @@ filegroup( "core/Utils.cpp", "core/Validate.cpp", "core/Version.cpp", + "core/helpers/LUTManager.cpp", "core/helpers/SoftmaxHelpers.cpp", "core/helpers/Utils.cpp", "core/helpers/WindowHelpers.cpp", @@ -739,6 +735,8 @@ filegroup( "cpu/kernels/crop/generic/neon/fp16.cpp", "cpu/kernels/crop/generic/neon/fp32.cpp", "cpu/kernels/crop/generic/neon/integer.cpp", + "cpu/kernels/depth_to_space/nchw/any/impl.cpp", + "cpu/kernels/depth_to_space/nhwc/any/impl.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp", @@ -766,6 +764,8 @@ filegroup( 
"cpu/kernels/fuse_batch_normalization/generic/fp16.cpp", "cpu/kernels/fuse_batch_normalization/generic/fp32.cpp", "cpu/kernels/fuse_batch_normalization/nchw/all.cpp", + "cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp", + "cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp", "cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp", "cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp", "cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp", @@ -794,6 +794,10 @@ filegroup( "cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp", "cpu/kernels/meanstddevnorm/generic/neon/impl.cpp", "cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp", + "cpu/kernels/mul/generic/neon/fp16.cpp", + "cpu/kernels/mul/generic/neon/fp32.cpp", + "cpu/kernels/norm_layer/generic/neon/fp16.cpp", + "cpu/kernels/norm_layer/generic/neon/fp32.cpp", "cpu/kernels/pool2d/neon/fp16.cpp", "cpu/kernels/pool2d/neon/fp32.cpp", "cpu/kernels/pool2d/neon/nchw/all.cpp", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 37599cdadd..be7a6ef188 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -312,16 +312,13 @@ target_sources( cpu/kernels/elementwise_unary/generic/sve/fp32.cpp cpu/kernels/elementwise_unary/generic/sve/impl.cpp cpu/kernels/elementwise_unary/generic/sve/integer.cpp + cpu/kernels/lut/generic/sve/u16.cpp cpu/kernels/scale/sve/fp16.cpp cpu/kernels/scale/sve/fp32.cpp cpu/kernels/scale/sve/integer.cpp cpu/kernels/scale/sve/qasymm8.cpp cpu/kernels/scale/sve/qasymm8_signed.cpp - cpu/kernels/softmax/generic/sve/fp16.cpp - cpu/kernels/softmax/generic/sve/fp32.cpp cpu/kernels/softmax/generic/sve/impl.cpp - cpu/kernels/softmax/generic/sve/qasymm8.cpp - cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp ) target_sources( @@ -339,8 +336,6 @@ target_sources( cpu/kernels/elementwise_unary/generic/sve2/q8.cpp cpu/kernels/lut/generic/sve2/u8.cpp cpu/kernels/softmax/generic/sve2/impl.cpp - cpu/kernels/softmax/generic/sve2/qasymm8.cpp - cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp ) target_sources( @@ -634,6 +629,7 @@ target_sources( core/Utils.cpp core/Validate.cpp core/Version.cpp + core/helpers/LUTManager.cpp core/helpers/SoftmaxHelpers.cpp core/helpers/Utils.cpp core/helpers/WindowHelpers.cpp @@ -730,6 +726,8 @@ target_sources( cpu/kernels/crop/generic/neon/fp16.cpp cpu/kernels/crop/generic/neon/fp32.cpp cpu/kernels/crop/generic/neon/integer.cpp + cpu/kernels/depth_to_space/nchw/any/impl.cpp + cpu/kernels/depth_to_space/nhwc/any/impl.cpp cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp @@ -757,6 +755,8 @@ target_sources( cpu/kernels/fuse_batch_normalization/generic/fp16.cpp cpu/kernels/fuse_batch_normalization/generic/fp32.cpp cpu/kernels/fuse_batch_normalization/nchw/all.cpp + cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp + cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp @@ -785,6 +785,10 @@ target_sources( cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp cpu/kernels/meanstddevnorm/generic/neon/impl.cpp cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp + cpu/kernels/mul/generic/neon/fp16.cpp + cpu/kernels/mul/generic/neon/fp32.cpp + cpu/kernels/norm_layer/generic/neon/fp16.cpp + cpu/kernels/norm_layer/generic/neon/fp32.cpp cpu/kernels/pool2d/neon/fp16.cpp cpu/kernels/pool2d/neon/fp32.cpp 
cpu/kernels/pool2d/neon/nchw/all.cpp diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp index 05b351fc25..0e078d8416 100644 --- a/src/core/CL/CLMutableCommandBuffer.cpp +++ b/src/core/CL/CLMutableCommandBuffer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" +#include "src/common/utils/Log.h" #include "src/core/CL/CLUtils.h" namespace arm_compute @@ -48,7 +49,11 @@ CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLComma CLMutableCommandBuffer::~CLMutableCommandBuffer() { const auto status = clReleaseCommandBufferKHR(_cb); - handle_cl_error("clReleaseCommandBufferKHR", status); + if (status != CL_SUCCESS) + { + const std::string error_message = "clReleaseCommandBufferKHR - Error code: " + std::to_string(status); + ARM_COMPUTE_LOG_ERROR_ACL(error_message); + } } void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index 35421d025e..07baa5e7fb 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -132,6 +132,10 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u func_name##_ptr = reinterpret_cast(dlsym(handle, #func_name)); #endif /* __ANDROID__ */ +#define LOAD_EXTENSION_FUNCTION_PTR(func_name, platform_id) \ + func_name##_ptr = \ + reinterpret_cast(clGetExtensionFunctionAddressForPlatform(platform_id, #func_name)); + LOAD_FUNCTION_PTR(clCreateContext, handle); LOAD_FUNCTION_PTR(clCreateContextFromType, handle); LOAD_FUNCTION_PTR(clCreateCommandQueue, handle); @@ -181,8 +185,27 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u LOAD_FUNCTION_PTR(clWaitForEvents, handle); LOAD_FUNCTION_PTR(clCreateImage, handle); LOAD_FUNCTION_PTR(clSetKernelExecInfo, handle); + LOAD_FUNCTION_PTR(clGetExtensionFunctionAddressForPlatform, handle); + + // Load Extensions + + // Number of platforms is assumed to be 1. For this to be greater than 1, + // the system must have more than one OpenCL implementation provided by + // different vendors. This is not our use case. Besides, the library + // already assumes one implementation as it uses one handle to load core + // functions. + constexpr unsigned int num_platforms = 1U; + std::vector platform_ids(num_platforms); + clGetPlatformIDs(num_platforms, platform_ids.data(), nullptr); // Command buffer and mutable dispatch command buffer extensions + /// TODO: (COMPMID-6742) Load Command Buffer extensions in a Portable way + /// using clGetExtensionFunctionAddressForPlatform(). + /// The details can be found here: + /// https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#getting-opencl-api-extension-function-pointers + /// + /// @note: There are some problems reported while loading these extensions in the recommended way. + /// For details, please see COMPUTE-16545 LOAD_FUNCTION_PTR(clCreateCommandBufferKHR, handle); LOAD_FUNCTION_PTR(clRetainCommandBufferKHR, handle); LOAD_FUNCTION_PTR(clReleaseCommandBufferKHR, handle); @@ -193,9 +216,10 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u LOAD_FUNCTION_PTR(clUpdateMutableCommandsKHR, handle); // Third-party extensions - LOAD_FUNCTION_PTR(clImportMemoryARM, handle); + LOAD_EXTENSION_FUNCTION_PTR(clImportMemoryARM, platform_ids[0]); #undef LOAD_FUNCTION_PTR +#undef LOAD_EXTENSION_FUNCTION_PTR //Don't call dlclose(handle) or all the symbols will be unloaded ! 
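The OpenCL.cpp hunk above stops resolving third-party extension symbols (here clImportMemoryARM) through dlsym() and instead queries a single platform ID and asks it for the entry point via clGetExtensionFunctionAddressForPlatform(), as the new LOAD_EXTENSION_FUNCTION_PTR macro does. As a rough, standalone illustration of that pattern (not part of this patch; the single-platform assumption mirrors the comment in the hunk, and error handling is minimal):

```cpp
// Minimal sketch: resolve an OpenCL extension entry point through the platform,
// the way LOAD_EXTENSION_FUNCTION_PTR does, instead of dlsym() on the library handle.
#include <CL/cl.h>
#include <cstdio>

int main()
{
    cl_platform_id platform = nullptr;
    if (clGetPlatformIDs(1, &platform, nullptr) != CL_SUCCESS || platform == nullptr)
    {
        std::puts("No OpenCL platform found");
        return 1;
    }

    // The spec-mandated way to obtain extension entry points: ask the platform,
    // not the loader, so the ICD can hand back the right vendor implementation.
    void *fn = clGetExtensionFunctionAddressForPlatform(platform, "clImportMemoryARM");
    std::printf("clImportMemoryARM is %s on this platform\n",
                fn != nullptr ? "exposed" : "not exposed");

    // In real code the returned pointer would then be cast to the extension's
    // function-pointer type before being called, as the macro above does.
    return 0;
}
```

A null return simply means the platform does not expose the extension, which is why the patch keeps the pointer check at call sites rather than treating a missing symbol as a load failure.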
@@ -1063,6 +1087,19 @@ clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t par } } +void *clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, const char *funcname) +{ + arm_compute::CLSymbols::get().load_default(); + const auto func = arm_compute::CLSymbols::get().clGetExtensionFunctionAddressForPlatform_ptr; + + if (func != nullptr) + { + return func(platform, funcname); + } + + return nullptr; +} + cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint num_queues, const cl_command_queue *queues, const cl_command_buffer_properties_khr *properties, diff --git a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl index 8919023d4c..09b8956b68 100644 --- a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl +++ b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -117,9 +117,23 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul( uint rhs_y = block_id; // Compute LHS/RHS/DST matrix address +#ifdef REINTERPRET_INPUT_AS_3D + lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y; +#else // REINTERPRET_INPUT_AS_3D lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z; +#endif // REINTERPRET_INPUT_AS_3D + +#ifdef BATCHED_RHS rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z; +#else // BATCHED_RHS + rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y; +#endif // BATCHED_RHS + +#ifdef REINTERPRET_OUTPUT_AS_3D + dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y; +#else // REINTERPRET_OUTPUT_AS_3D dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z; +#endif // REINTERPRET_OUTPUT_AS_3D // Note: If RHS derives from the weights of convolution 2d layer, RHS will always be 2D and rhs_stride_z will always be equal to 0 for // not sliding the tensor @@ -367,11 +381,25 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture( // Starting RHS coordinates uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0; + +#ifdef BATCHED_RHS uint rhs_y = block_id + z * rhs_h; +#else // BATCHED_RHS + uint rhs_y = block_id; +#endif // BATCHED_RHS // Compute LHS/RHS/DST matrix address +#ifdef REINTERPRET_INPUT_AS_3D + lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y; +#else // REINTERPRET_INPUT_AS_3D lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z; +#endif // REINTERPRET_INPUT_AS_3D + +#ifdef REINTERPRET_OUTPUT_AS_3D + dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y; +#else // REINTERPRET_OUTPUT_AS_3D dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z; +#endif // REINTERPRET_OUTPUT_AS_3D // Initialize the accumulators // MMUL extension accumulate the result in F32 for both F32 and F16 @@ -525,4 +553,4 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture( #undef RHS_OFFSET_X #undef RHS_STEP_X } -#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE) \ No newline at end of file +#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE) diff --git 
a/src/core/CL/cl_kernels/common/generate_proposals.cl b/src/core/CL/cl_kernels/common/generate_proposals.cl index 5b8502072a..bfe1922ac2 100644 --- a/src/core/CL/cl_kernels/common/generate_proposals.cl +++ b/src/core/CL/cl_kernels/common/generate_proposals.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,18 +59,16 @@ __kernel void generate_proposals_compute_all_anchors( Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors); Vector rois = CONVERT_TO_VECTOR_STRUCT(rois); - const size_t idx = get_global_id(0); + const unsigned int idx = get_global_id(0); // Find the index of the anchor - const size_t anchor_idx = idx % NUM_ANCHORS; + const unsigned int anchor_idx = idx % NUM_ANCHORS; // Find which shift is this thread using - const size_t shift_idx = idx / NUM_ANCHORS; + const unsigned int shift_idx = idx / NUM_ANCHORS; // Compute the shift on the X and Y direction (the shift depends exclusively by the index thread id) - const DATA_TYPE - shift_x = (DATA_TYPE)(shift_idx % WIDTH) * STRIDE; - const DATA_TYPE - shift_y = (DATA_TYPE)(shift_idx / WIDTH) * STRIDE; + const float shift_x = (float)(shift_idx % WIDTH) * STRIDE; + const float shift_y = (float)(shift_idx / WIDTH) * STRIDE; const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS) shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y); diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp index 2d1a13cb33..5904e1a06f 100644 --- a/src/core/GPUTarget.cpp +++ b/src/core/GPUTarget.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,23 @@ namespace { + +arm_compute::GPUTarget get_fifth_gen_target(const std::string &version) +{ + if (version.find("G720") != std::string::npos) + { + return arm_compute::GPUTarget::G720; + } + else if (version.find("G620") != std::string::npos) + { + return arm_compute::GPUTarget::G620; + } + else + { + return arm_compute::GPUTarget::UNKNOWN; + } +} + arm_compute::GPUTarget get_valhall_target(const std::string &version) { if (version.find("G77") != std::string::npos) @@ -152,16 +169,18 @@ namespace arm_compute const std::string &string_from_target(GPUTarget target) { static std::map gpu_target_map = { - {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"}, - {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"}, - {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"}, - {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"}, - {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, "g52lit"}, - {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"}, - {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"}, - {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"}, - {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"}, - }; + {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"}, + {GPUTarget::FIFTHGEN, "fifthgen"}, + + {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"}, + {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"}, + {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"}, + {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, 
"g52lit"}, + {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"}, + {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"}, + {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"}, + {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"}, {GPUTarget::G720, "g720"}, + {GPUTarget::G620, "g620"}}; return gpu_target_map[target]; } @@ -188,8 +207,13 @@ GPUTarget get_target_from_name(const std::string &device_name) GPUTarget gpu_target; if (target == 'G' || is_future_gpu) { - // Check for Valhall or Bifrost - gpu_target = get_valhall_target(version); + // Check for Valhall, Bifrost or 5-th Gen + gpu_target = get_fifth_gen_target(version); + if (gpu_target == GPUTarget::UNKNOWN) + { + gpu_target = get_valhall_target(version); + } + if (gpu_target == GPUTarget::UNKNOWN) { gpu_target = get_bifrost_target(version); diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index deb89996a9..717fd11485 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -151,128 +151,15 @@ Status validate_arguments(const ITensorInfo *input, } } //namespace -template -void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_to_use = window; - win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(_input, win_to_use); - Iterator output(_output, win_to_use); - - F activation_functor(_act_info); - - // Hold information about the current feature map we are iterating. - // Only compute denominator and constants once per feature map. - int slice = -1; - - const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = - (_gamma != nullptr) ? reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = - (_beta != nullptr) ? 
reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; - - T mean = static_cast(0); - T var = static_cast(0); - T gamma = static_cast(1); - T beta = static_cast(0); - T denominator = static_cast(0); - - auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - auto var_vec = wrapper::vdup_n(var, ExactTagType{}); - auto gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - auto beta_vec = wrapper::vdup_n(beta, ExactTagType{}); - auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{}); - const auto epsilon_vec = wrapper::vdup_n(static_cast(_epsilon), ExactTagType{}); - execute_window_loop( - win_to_use, - [&](const Coordinates &id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - if (slice != id.z()) - { - mean = input_mean[id.z()]; - var = input_var[id.z()]; - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - if (input_gamma != nullptr) - { - gamma = input_gamma[id.z()]; - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - } - if (input_beta != nullptr) - { - beta = input_beta[id.z()]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); - } - - // Calculate denominator - denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - denominator = wrapper::vgetlane(denominator_vec, 0); - slice = id.z(); - } - - // Perform core calculations using vector operations - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator_vec); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if (fused_activation) - { - activation_functor(res); - } - - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const T numerator = input_ptr[x] - mean; - const T x_bar = numerator * denominator; - T res = beta + x_bar * gamma; - - // Perform fused activation - if (fused_activation) - { - activation_functor(res); - } - - // Store results - *(output_ptr + x) = res; - } - }, - input, output); -} - void NEBatchNormalizationLayerKernel::configure_non_fused() { switch (_input->info()->data_type()) { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw>; + _func = REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused); break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw>; + _func = REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused); break; default: ARM_COMPUTE_ERROR("Element size not supported"); @@ -285,29 +172,26 @@ void NEBatchNormalizationLayerKernel::configure_fused() // NCHW Fused Batched Normalization with activation functions : FP32 static std::map bn_fused_map_f32_nchw = { {ActivationLayerInfo::ActivationFunction::RELU, - &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_relu)}, {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_brelu)}, {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - 
&NEBatchNormalizationLayerKernel::batch_normalization_nchw>}}; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_lubrelu)}}; + // NCHW Fused Batched Normalization with activation functions : FP16 static std::map bn_fused_map_f16_nchw = { {ActivationLayerInfo::ActivationFunction::RELU, - &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_relu)}, {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_brelu)}, {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}}; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_lubrelu)}}; switch (_input->info()->data_type()) { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: _func = bn_fused_map_f16_nchw[_act_info.activation()]; break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: _func = bn_fused_map_f32_nchw[_act_info.activation()]; break; @@ -409,7 +293,7 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW; if (is_nchw) { - (this->*_func)(window); + (*_func)(window, _input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info); } else { diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 2e8ff0dc9a..679ade0fae 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H -#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" @@ -110,31 +110,19 @@ class NEBatchNormalizationLayerKernel : public INEKernel /** Configure execution function in case of fused activation **/ void configure_fused(); - /** Template function to run batch normalization on fp32 - * - * @tparam T Specialization data type - * @tparam fused_activation Boolean that flags if its a fused activation or not - * @tparam F Activation function functor to run - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void batch_normalization_nchw(const Window &window); - /** Template function to run batch normalization on fp32 on tensors with NHWC format - * - * @tparam T Specialization data type - * @tparam fused_activation Boolean that flags if its a fused activation or not - * @tparam F Activation function functor to run - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void batch_normalization_nhwc(const Window &window); /** Common signature for all the batch normalization functions * * @param[in] window Region on which to execute the kernel. 
*/ - using BatchNormFunctionPtr = void (NEBatchNormalizationLayerKernel::*)(const Window &window); + using BatchNormFunctionPtr = void (*)(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info); private: BatchNormFunctionPtr _func; @@ -148,4 +136,4 @@ class NEBatchNormalizationLayerKernel : public INEKernel ActivationLayerInfo _act_info; }; } // namespace arm_compute -#endif /*ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp index de0079ee60..e0eb5cf202 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" +#include "arm_compute/core/CoreTypes.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" @@ -31,13 +32,10 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/depth_to_space/list.h" -#include #include -using namespace arm_compute::misc::shape_calculator; - namespace arm_compute { namespace @@ -70,15 +68,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } // namespace NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN) + : _input(nullptr), + _output(nullptr), + _block_shape(), + _data_layout(DataLayout::UNKNOWN), + _split_dimension(Window::DimY) { } void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = - compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = misc::shape_calculator::compute_depth_to_space_shape( + input->info()->tensor_shape(), input->info()->data_layout(), block_shape); // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); @@ -90,9 +92,31 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, _block_shape = block_shape; _data_layout = input->info()->data_layout(); + constexpr size_t dim_b = 3; + const auto dim_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const auto dim_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const auto dim_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_ERROR_ON(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES) != dim_b); + // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); + Steps steps; + steps.set(dim_h, block_shape); + steps.set(dim_w, block_shape); + steps.set(dim_c, output->info()->dimension(dim_c)); + + Window win = calculate_max_window(*output->info(), steps); ICPPKernel::configure(win); + + const auto num_batches = 
input->info()->tensor_shape().total_size_upper(dim_b); + if (num_batches > 1) + { + _split_dimension = dim_b; + } + else + { + _split_dimension = dim_h; + } } Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) @@ -102,68 +126,80 @@ Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITens return Status{}; } +size_t NEDepthToSpaceLayerKernel::get_split_dimension() const +{ + return _split_dimension; +} + void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int depth_size = _input->info()->dimension(idx_channel); - const int r = (depth_size / (_block_shape * _block_shape)); - const int element_size = _input->info()->element_size(); + const auto *input_info = _input->info(); + const auto *output_info = _output->info(); + + const auto element_size = input_info->element_size(); + const auto &input_strides = input_info->strides_in_bytes(); + const auto &output_strides = output_info->strides_in_bytes(); + + const auto &input_shape = input_info->tensor_shape(); - Window slice_out = window.first_slice_window_3D(); + const uintptr_t k_input_strides[] = {input_strides[0], input_strides[1], input_strides[2], input_strides[3]}; + const uintptr_t k_output_strides[] = {output_strides[0], output_strides[1], output_strides[2], output_strides[3]}; - // The slice_out slice does not move - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + const uint8_t *k_input_ptr = _input->buffer(); + uint8_t *k_output_ptr = // + _output->buffer() + // + window[3].start() * output_strides[3] + // + window[2].start() * output_strides[2] + // + window[1].start() * output_strides[1] + // + window[0].start() * output_strides[0]; - // Main loop for NCHW and NHWC if (_data_layout == DataLayout::NCHW) { - Window slice_in = window.first_slice_window_2D(); - do - { - Iterator in(_input, slice_in); - execute_window_loop( - slice_in, - [&](const Coordinates &id) - { - const int x = id.x(); - const int y = id.y(); - - const int z = id.z() % r; - const int out_x = x * _block_shape + (id.z() / r) % _block_shape; - const int out_y = y * _block_shape + (id.z() / r) / _block_shape; - Coordinates output_coords{out_x, out_y, z, id[3]}; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } while (window.slide_window_slice_2D(slice_in)); + ARM_COMPUTE_ERROR_ON_MSG(window[2].start() != 0 || window[2].end() != window[2].step(), + "The window cannot be splitted in channel dimension"); + + const uintptr_t k_input_shape[] = { + window.num_iterations(0), // + window.num_iterations(1), // + input_shape[2], // The window cannot be splitted in channel dimension. 
+ window.num_iterations(3) // + }; + + k_input_ptr += window[3].start() * input_strides[3] + // + window[2].start() * _block_shape * _block_shape * input_strides[2] + // + (window[1].start() / _block_shape) * input_strides[1] + // + (window[0].start() / _block_shape) * input_strides[0]; + + cpu::depth_to_space_nchw_any( // + k_input_ptr, k_output_ptr, // + k_input_shape, k_input_strides, k_output_strides, // + element_size, _block_shape); } else { - Window slice_in = window.first_slice_window_3D(); - do - { - Iterator in(_input, slice_in); - execute_window_loop( - slice_in, - [&](const Coordinates &id) - { - const int x = id.y(); - const int y = id.z(); - - const int z = id.x() % r; - const int out_x = x * _block_shape + (id.x() / r) % _block_shape; - const int out_y = y * _block_shape + (id.x() / r) / _block_shape; - Coordinates output_coords{z, out_x, out_y, id[3]}; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } while (window.slide_window_slice_3D(slice_in)); + ARM_COMPUTE_ERROR_ON_MSG(window[0].start() != 0 || window[0].end() != window[0].step(), + "The window cannot be splitted in channel dimension"); + + const uintptr_t k_input_shape[] = { + input_shape[0], // The window cannot be splitted in channel dimension. + window.num_iterations(1), // + window.num_iterations(2), // + window.num_iterations(3) // + }; + + k_input_ptr += window[3].start() * input_strides[3] + // + (window[2].start() / _block_shape) * input_strides[2] + // + (window[1].start() / _block_shape) * input_strides[1] + // + window[0].start() * _block_shape * _block_shape * input_strides[0]; + + cpu::depth_to_space_nhwc_any( // + k_input_ptr, k_output_ptr, // + k_input_shape, k_input_strides, k_output_strides, // + element_size, _block_shape); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h index 7e18dd88b8..ca431ec5fe 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H -#define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H #include "src/core/NEON/INEKernel.h" @@ -68,14 +68,18 @@ class NEDepthToSpaceLayerKernel : public INEKernel */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + /** Get the dimension the scheduler should use to split. */ + size_t get_split_dimension() const; + // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: - const ITensor *_input; /**< Source tensor */ - ITensor *_output; /**< Destination tensor */ - int32_t _block_shape; /**< Block shape */ - DataLayout _data_layout; /**< Data layout of the operation */ + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ + int32_t _block_shape; /**< Block shape */ + DataLayout _data_layout; /**< Data layout of the operation */ + size_t _split_dimension; /**< The dimension the scheduler should use to split the workload. 
*/ }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index 2c61bda147..8399c6c49d 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/NormalizationHelpers.h" @@ -37,6 +38,8 @@ #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/norm_layer/generic/neon/impl.h" +#include "src/cpu/kernels/norm_layer/generic/neon/list.h" namespace arm_compute { @@ -91,7 +94,6 @@ void NENormalizationLayerKernel::configure(const ITensor *input, _input_squared = input_squared; _output = output; _norm_info = norm_info; - switch (_input->info()->data_type()) { case DataType::F32: @@ -102,33 +104,33 @@ void NENormalizationLayerKernel::configure(const ITensor *input, { if (norm_info.type() == NormType::IN_MAP_2D) { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_0_2D); } else { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_0); } break; } case 1: if (norm_info.type() == NormType::IN_MAP_2D) { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_1_2D); } else { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_1); } break; case 2: - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_2); break; default: break; } break; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ARM_COMPUTE_ENABLE_FP16 case DataType::F16: { switch (norm_idx) @@ -137,33 +139,33 @@ void NENormalizationLayerKernel::configure(const ITensor *input, { if (norm_info.type() == NormType::IN_MAP_2D) { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_0_2D); } else { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_0); } break; } case 1: if (norm_info.type() == NormType::IN_MAP_2D) { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_1_2D); } else { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_1); } break; case 2: - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_2); break; default: break; } break; } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* ARM_COMPUTE_ENABLE_FP16 */ default: ARM_COMPUTE_ERROR("NOT SUPPORTED!"); } @@ -173,124 +175,6 @@ void NENormalizationLayerKernel::configure(const ITensor *input, INEKernel::configure(win); } -template -void NENormalizationLayerKernel::normalize_float(const Window &window) -{ - /** SIMD vector tag 
type. */ - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = S; - - Iterator input(_input, win); - Iterator input_squared(_input_squared, win); - Iterator output(_output, win); - - const int dim_y = _input->info()->data_layout() == DataLayout::NCHW ? 1 : 2; - const int radius = _norm_info.norm_size() / 2; - const int input_squared_stride_x = _input_squared->info()->strides_in_bytes()[0]; - const int input_squared_stride_slice = _input_squared->info()->strides_in_bytes()[dim]; - const int input_squared_stride_row = _input_squared->info()->strides_in_bytes()[dim_y]; - - const int max_right = _input->info()->dimension(dim) - 1; - const int max_bottom = _input->info()->dimension(dim_y) - 1; - - const auto coeff_vec = wrapper::vdup_n(static_cast(_norm_info.scale_coeff()), ExactTagType{}); - const auto beta_vec = wrapper::vdup_n(static_cast(_norm_info.beta()), ExactTagType{}); - const auto kappa_vec = wrapper::vdup_n(static_cast(_norm_info.kappa()), ExactTagType{}); - - auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row, - const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr, - T *output_ptr) - { - const int current_slice = dim == 0 ? x : id[dim]; - const int first_slice = std::max(current_slice - radius, 0); - const int last_slice = std::min(current_slice + radius, max_right); - - const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x; - // Accumulate 2D In-Map values - auto accu = static_cast(0.f); - for (int j = first_row; j <= last_row; ++j) - { - // Compute row displacement - const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for (int i = first_slice; i <= last_slice; ++i) - { - accu += - *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); - } - } - - // Normalize - const auto normalized = std::pow( - accu * static_cast(_norm_info.scale_coeff()) + static_cast(_norm_info.kappa()), _norm_info.beta()); - const auto normalized_pixel = (*(input_ptr + x)) / normalized; - *(output_ptr + x) = normalized_pixel; - }; - - execute_window_loop( - win, - [&](const Coordinates &id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - - // Get range to normalize - const int current_row = do_2D_norm ? id[dim_y] : 0; - const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; - const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; - - int x = window_start_x; - // Compute serially starting elements for the case x dimension is width - for (; x < radius && x < window_end_x && dim == 0; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), - output_ptr); - } - - // Compute vectorized - for (; x <= window_end_x - window_step_x - radius; x += window_step_x) - { - const int current_slice = dim == 0 ? 
x : id[dim]; - const int first_slice = std::max(current_slice - radius, 0); - const int last_slice = std::min(current_slice + radius, max_right); - - const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; - // Accumulate 2D In-Map values - auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - for (int j = first_row; j <= last_row; ++j) - { - // Compute row displacement - const uint8_t *const input_squared_ptr = - input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for (int i = first_slice; i <= last_slice; ++i) - { - accu = wrapper::vadd( - accu, wrapper::vloadq(reinterpret_cast( - input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); - } - } - - // Normalize - const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); - const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); - wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), - output_ptr); - } - }, - input, input_squared, output); -} - Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, @@ -309,6 +193,6 @@ void NENormalizationLayerKernel::run(const Window &window, const ThreadInfo &inf ARM_COMPUTE_ERROR_ON(_func == nullptr); // Run function - (this->*_func)(window); + (*_func)(window, _input, _input_squared, _output, _norm_info); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h index 2d8d9f3d60..5ba4c3edca 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H -#define ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H #include "src/core/NEON/INEKernel.h" @@ -82,24 +82,12 @@ class NENormalizationLayerKernel : public INEKernel void run(const Window &window, const ThreadInfo &info) override; private: - /** Function to perform normalization depending on the given template - * dimension. The second template parameter specifies whether the - * normalization has to be 1D or 2D. - * - * @note Only supported normalizations are: - * - 1D over X or Z - * - 2D over X and Y - * - * @param[in] window Region on which to execute the kernel. - */ - template - void normalize_float(const Window &window); - /** Common signature for all the specialised normalization functions * * @param[in] window Region on which to execute the kernel. 
*/ - using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window); + using NormalizationFunction = void (*)( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo); private: NormalizationFunction _func; @@ -109,4 +97,4 @@ class NENormalizationLayerKernel : public INEKernel NormalizationLayerInfo _norm_info; }; } // namespace arm_compute -#endif /*ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp index 6c2c987eb7..f5bea3e163 100644 --- a/src/core/NEON/kernels/NEReorderKernel.cpp +++ b/src/core/NEON/kernels/NEReorderKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Arm Limited. + * Copyright (c) 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -54,17 +54,41 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info) { case WeightFormat::OHWIo4: { - arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>( - reinterpret_cast(_output->buffer()) + jump_rows, - reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + switch (_output->info()->data_type()) + { + case DataType::F32: + arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + break; + case DataType::BFLOAT16: + arm_gemm::Transform<4, 4, true, arm_gemm::VLType::None>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type!"); + } break; } #if defined(ARM_COMPUTE_ENABLE_SVE) case WeightFormat::OHWIo8: { - arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>( - reinterpret_cast(_output->buffer()) + jump_rows, - reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + switch (_output->info()->data_type()) + { + case DataType::F32: + arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + break; + case DataType::BFLOAT16: + arm_gemm::Transform<2, 4, true, arm_gemm::VLType::SVE>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type!"); + } break; } #endif /* ARM_COMPUTE_ENABLE_SVE */ @@ -175,7 +199,8 @@ Status NEReorderKernel::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); if (output->tensor_shape().total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != DataType::F32 && output->data_type() != DataType::BFLOAT16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); // Only input WeightFormat OHWI supported ARM_COMPUTE_RETURN_ERROR_ON(input_wf != arm_compute::WeightFormat::OHWI); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp index 72b0fac96a..5c08e6137d 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp @@ -206,30 +206,6 @@ GemmImplementation::with_estimate( [](const GemmArgs &args) { 
return GemmInterleavedFixedFormat::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } ), -GemmImplementation::with_estimate( - GemmMethod::GEMM_INTERLEAVED, - "a64_ffinterleaved_bf16fp32_mmla_8x12", - KernelWeightFormat::VL256_BL64, - [](const GemmArgs &args) { return args._ci->has_bf16(); }, - [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); }, - [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } -), -GemmImplementation::with_estimate( - GemmMethod::GEMM_INTERLEAVED, - "a64_ffhybrid_bf16fp32_mmla_6x16", - KernelWeightFormat::VL256_BL64, - [](const GemmArgs &args) { return args._ci->has_bf16(); }, - [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat::estimate_cycles(args); }, - [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat(args); } -), -GemmImplementation::with_estimate( - GemmMethod::GEMM_INTERLEAVED, - "a64_ffinterleaved_bf16fp32_dot_8x12", - KernelWeightFormat::VL128_BL32, - [](const GemmArgs &args) { return args._ci->has_bf16(); }, - [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); }, - [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } -), #endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp new file mode 100644 index 0000000000..98200c50c5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#if defined(__aarch64__) + +namespace { + +void a64_transpose_interleave_4_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height) +{ + float *pad_row = reinterpret_cast(alloca(width * sizeof(float))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(float)); + } + + size_t out_stride = 4 * roundup(height, 4) * sizeof(bfloat16); + + __asm__ __volatile__( + "cmp %x[height], #0x8\n" + "blt 8f\n" + "1:" // Main row loop: Head + "mov x9, %x[in]\n" + "mov x28, %x[width]\n" + "mov x27, %x[out]\n" + "sub %x[height], %x[height], #0x8\n" + "add x26, x9, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "cmp x28, #0x8\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q19, [x9], #0x10\n" + "ldr q18, [x26], #0x10\n" + "sub x28, x28, #0x8\n" + "ldr q17, [x25], #0x10\n" + "ldr q16, [x24], #0x10\n" + "cmp x28, #0x8\n" + "ldr q1, [x23], #0x10\n" + "ldr q0, [x22], #0x10\n" + "ldr q31, [x21], #0x10\n" + "ldr q24, [x20], #0x10\n" + "ldr q23, [x9], #0x10\n" + "ldr q22, [x26], #0x10\n" + "zip1 v30.4s, v19.4s, v17.4s\n" + "zip1 v29.4s, v18.4s, v16.4s\n" + "ldr q21, [x25], #0x10\n" + "ldr q20, [x24], #0x10\n" + "zip2 v28.4s, v19.4s, v17.4s\n" + "zip2 v27.4s, v18.4s, v16.4s\n" + "ldr q19, [x23], #0x10\n" + "ldr q18, [x22], #0x10\n" + "zip1 v26.4s, v1.4s, v31.4s\n" + "zip1 v25.4s, v0.4s, v24.4s\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "zip2 v8.4s, v1.4s, v31.4s\n" + "zip2 v24.4s, v0.4s, v24.4s\n" + "zip1 v7.4s, v23.4s, v21.4s\n" + "zip1 v6.4s, v22.4s, v20.4s\n" + "zip2 v5.4s, v23.4s, v21.4s\n" + "zip2 v4.4s, v22.4s, v20.4s\n" + "zip1 v3.4s, v19.4s, v17.4s\n" + "zip1 v2.4s, v18.4s, v16.4s\n" + "zip2 v1.4s, v19.4s, v17.4s\n" + "zip2 v0.4s, v18.4s, v16.4s\n" + "zip1 v23.4s, v30.4s, v29.4s\n" + "zip1 v22.4s, v28.4s, v27.4s\n" + "zip1 v21.4s, v26.4s, v25.4s\n" + "zip1 v20.4s, v8.4s, v24.4s\n" + "zip1 v19.4s, v7.4s, v6.4s\n" + "zip1 v18.4s, v5.4s, v4.4s\n" + "zip1 v17.4s, v3.4s, v2.4s\n" + "zip1 v16.4s, v1.4s, v0.4s\n" + ".inst 0x0ea16aff // bfcvtn v31.4h, v23.4s\n" + "zip2 v30.4s, v30.4s, v29.4s\n" + ".inst 0x0ea16add // bfcvtn v29.4h, v22.4s\n" + "zip2 v28.4s, v28.4s, v27.4s\n" + ".inst 0x0ea16abb // bfcvtn v27.4h, v21.4s\n" + "zip2 v26.4s, v26.4s, v25.4s\n" + ".inst 0x0ea16a99 // bfcvtn v25.4h, v20.4s\n" + "zip2 v24.4s, v8.4s, v24.4s\n" + ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n" + "zip2 v22.4s, v7.4s, v6.4s\n" + ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n" + "zip2 v20.4s, v5.4s, v4.4s\n" + ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n" + "zip2 v18.4s, v3.4s, v2.4s\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v1.4s, v0.4s\n" + ".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n" + ".inst 0x4ea16b9d // bfcvtn2 v29.8h, v28.4s\n" + ".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n" + ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n" + ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n" + ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n" + "str q31, [x27, #0x0]\n" + "str q29, [x27, #0x10]\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q27, [x27, #0x20]\n" + "str q25, [x27, #0x30]\n" + "add x27, x27, %x[out_stride]\n" + "str q23, [x27, #0x0]\n" + "str q21, [x27, #0x10]\n" + "str q19, [x27, #0x20]\n" + "str q17, [x27, #0x30]\n" + "add x27, x27, 
%x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x28, #0x4\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q25, [x9], #0x10\n" + "ldr q24, [x26], #0x10\n" + "sub x28, x28, #0x4\n" + "ldr q21, [x25], #0x10\n" + "ldr q20, [x24], #0x10\n" + "cmp x28, #0x4\n" + "ldr q23, [x23], #0x10\n" + "ldr q19, [x22], #0x10\n" + "ldr q18, [x21], #0x10\n" + "ldr q17, [x20], #0x10\n" + "zip1 v22.4s, v25.4s, v21.4s\n" + "zip1 v16.4s, v24.4s, v20.4s\n" + "zip2 v21.4s, v25.4s, v21.4s\n" + "zip2 v20.4s, v24.4s, v20.4s\n" + "zip1 v27.4s, v23.4s, v18.4s\n" + "zip1 v26.4s, v19.4s, v17.4s\n" + "zip2 v25.4s, v23.4s, v18.4s\n" + "zip2 v24.4s, v19.4s, v17.4s\n" + "zip1 v19.4s, v22.4s, v16.4s\n" + "zip1 v18.4s, v21.4s, v20.4s\n" + "zip1 v17.4s, v27.4s, v26.4s\n" + "zip2 v23.4s, v22.4s, v16.4s\n" + "zip1 v16.4s, v25.4s, v24.4s\n" + "zip2 v22.4s, v21.4s, v20.4s\n" + ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n" + ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n" + ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n" + "zip2 v18.4s, v27.4s, v26.4s\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v25.4s, v24.4s\n" + ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n" + ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q21, [x27, #0x0]\n" + "str q20, [x27, #0x10]\n" + "str q19, [x27, #0x20]\n" + "str q17, [x27, #0x30]\n" + "add x27, x27, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cbz x28, 7f\n" + "movi v16.16b, #0x0\n" + "str q16, [x27, #0x0]\n" + "str q16, [x27, #0x10]\n" + "str q16, [x27, #0x20]\n" + "str q16, [x27, #0x30]\n" + "6:" // Main row loop: width 1 loop: loop + "ldr s23, [x9], #0x4\n" + "ldr s22, [x26], #0x4\n" + "sub x28, x28, #0x1\n" + "ldr s19, [x25], #0x4\n" + "ldr s17, [x24], #0x4\n" + "cmp x28, #0x1\n" + "ldr s21, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s18, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" + "zip1 v19.4s, v23.4s, v19.4s\n" + "zip1 v17.4s, v22.4s, v17.4s\n" + "zip1 v18.4s, v21.4s, v18.4s\n" + "zip1 v16.4s, v20.4s, v16.4s\n" + "zip1 v17.4s, v19.4s, v17.4s\n" + "zip1 v16.4s, v18.4s, v16.4s\n" + ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str d17, [x27, #0x0]\n" + "str d16, [x27, #0x20]\n" + "add x27, x27, #0x8\n" + "bge 6b\n" + "7:" // Main row loop: odd col skip + "cmp %x[height], #0x8\n" + "add %x[out], %x[out], #0x40\n" + "bge 1b\n" + "cbz %x[height], 16f\n" + "8:" // Main loop skip + "9:" // Tail row loop: Head + "mov x9, %x[in]\n" + "mov x20, %x[width]\n" + "cmp %x[height], #0x3\n" + "mov x27, %x[out]\n" + "add x26, x9, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "csel x25, x25, %x[pad_row], GE\n" + "add %x[in], x24, %x[in_stride]\n" + "csel x24, x24, %x[pad_row], GT\n" + "cmp %x[height], #0x1\n" + "sub %x[height], %x[height], #0x4\n" + "csel x26, x26, %x[pad_row], GT\n" + "cmp x20, #0x8\n" + "blt 11f\n" + "10:" // Tail row loop: Unroll column loop + "ldr q25, [x9], #0x10\n" + "ldr q24, [x26], #0x10\n" + "sub x20, x20, #0x8\n" + "ldr q21, [x25], #0x10\n" + "ldr q20, [x24], #0x10\n" + "cmp x20, #0x8\n" + "ldr q23, [x9], #0x10\n" + "ldr q19, [x26], #0x10\n" + "ldr q18, [x25], #0x10\n" + "ldr q17, [x24], #0x10\n" + "zip1 v22.4s, v25.4s, v21.4s\n" + "zip1 v16.4s, v24.4s, v20.4s\n" + "zip2 v21.4s, v25.4s, v21.4s\n" + "zip2 v20.4s, v24.4s, v20.4s\n" + "zip1 v27.4s, v23.4s, v18.4s\n" + "zip1 v26.4s, v19.4s, 
v17.4s\n" + "zip2 v25.4s, v23.4s, v18.4s\n" + "zip2 v24.4s, v19.4s, v17.4s\n" + "zip1 v19.4s, v22.4s, v16.4s\n" + "zip1 v18.4s, v21.4s, v20.4s\n" + "zip1 v17.4s, v27.4s, v26.4s\n" + "zip2 v23.4s, v22.4s, v16.4s\n" + "zip1 v16.4s, v25.4s, v24.4s\n" + "zip2 v22.4s, v21.4s, v20.4s\n" + ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n" + ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n" + ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n" + "zip2 v18.4s, v27.4s, v26.4s\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v25.4s, v24.4s\n" + ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n" + ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q21, [x27, #0x0]\n" + "str q20, [x27, #0x10]\n" + "add x27, x27, %x[out_stride]\n" + "str q19, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "add x27, x27, %x[out_stride]\n" + "bge 10b\n" + "11:" // Tail row loop: Unroll column loop skip + "cmp x20, #0x4\n" + "blt 13f\n" + "12:" // Tail row loop: Column loop + "ldr q21, [x9], #0x10\n" + "ldr q20, [x26], #0x10\n" + "sub x20, x20, #0x4\n" + "ldr q19, [x25], #0x10\n" + "ldr q17, [x24], #0x10\n" + "cmp x20, #0x4\n" + "zip1 v18.4s, v21.4s, v19.4s\n" + "zip1 v16.4s, v20.4s, v17.4s\n" + "zip2 v21.4s, v21.4s, v19.4s\n" + "zip2 v20.4s, v20.4s, v17.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "zip2 v19.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v21.4s, v20.4s\n" + ".inst 0x0ea16a32 // bfcvtn v18.4h, v17.4s\n" + "zip2 v17.4s, v21.4s, v20.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q18, [x27, #0x0]\n" + "str q16, [x27, #0x10]\n" + "add x27, x27, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Column loop skip + "cbz x20, 15f\n" + "movi v16.16b, #0x0\n" + "str q16, [x27, #0x0]\n" + "str q16, [x27, #0x10]\n" + "14:" // Tail row loop: width 1 loop: loop + "ldr s19, [x9], #0x4\n" + "ldr s18, [x26], #0x4\n" + "sub x20, x20, #0x1\n" + "ldr s17, [x25], #0x4\n" + "ldr s16, [x24], #0x4\n" + "cmp x20, #0x1\n" + "zip1 v17.4s, v19.4s, v17.4s\n" + "zip1 v16.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str d16, [x27, #0x0]\n" + "add x27, x27, #0x8\n" + "bge 14b\n" + "15:" // Tail row loop: odd col skip + "cmp %x[height], #0x1\n" + "add %x[out], %x[out], #0x20\n" + "bge 9b\n" + "16:" // Done + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace +template<> +void Transform<4, 4, true, VLType::None>( + bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_4_2x4_fp32bf16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(float), + (kmax-k0) + ); +} + + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp index c066c01bab..1e6c3d35f4 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023,2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,6 +42,7 @@ #include "sve_transpose_interleave_12VL_2x4_fp32bf16.hpp" #include "sve_transpose_interleave_1VL_1x4.hpp" #include "sve_transpose_interleave_1VL.hpp" +#include "sve_transpose_interleave_2VL_2x4_fp32bf16.hpp" #include "sve_transpose_interleave_3VL_1x4.hpp" #include "sve_transpose_interleave_3VL_2x2.hpp" #include "sve_transpose_interleave_3VL.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp index adbaa6cf2f..1ce319efee 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020,2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,7 @@ #include "a64_transpose_interleave_32_2x2.hpp" #include "a64_transpose_interleave_4_1x16.hpp" #include "a64_transpose_interleave_4_1x4.hpp" +#include "a64_transpose_interleave_4_2x4_fp32bf16.hpp" #include "a64_transpose_interleave_48.hpp" #include "a64_transpose_interleave_64.hpp" #include "a64_transpose_interleave_96.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp new file mode 100644 index 0000000000..f66fcdc994 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
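Both the AArch64 transform above and the SVE variant that follows narrow pairs of fp32 rows into interleaved bf16 via bfcvtn/bfcvt. As a point of reference, a scalar sketch of the fp32-to-bf16 narrowing itself; plain truncation is shown here, while the hardware instructions apply their own rounding:

// bfloat16 keeps the sign, 8 exponent bits and the top 7 mantissa bits of fp32.
#include <cstdint>
#include <cstring>

static inline uint16_t fp32_to_bf16_truncate(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits)); // reinterpret fp32 as raw bits
    return static_cast<uint16_t>(bits >> 16); // keep the upper half
}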
+ */ + +#pragma once + +#if defined(ARM_COMPUTE_ENABLE_SVE) + +namespace { + +void sve_transpose_interleave_2VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height) +{ + float *pad_row = reinterpret_cast(alloca(width * sizeof(float))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(float)); + } + + size_t out_stride = 2 * roundup(height, 4) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p1.b\n" + "1:" // Main row loop: Head + "mov x26, %x[in]\n" + "mov x25, %x[width]\n" + "cnth x24\n" + "cmp %x[height], #0x3\n" + "mov x23, %x[out]\n" + "add x22, x26, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "csel x20, x20, %x[pad_row], GT\n" + "csel x21, x21, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x22, x22, %x[pad_row], GT\n" + "cmp x25, x24\n" + "sub %x[height], %x[height], #0x4\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1w { z18.s }, p1/Z, [x26]\n" + "ld1w { z17.s }, p1/Z, [x21]\n" + "sub x25, x25, x24\n" + "ld1w { z21.s }, p1/Z, [x26, #1, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n" + "cmp x25, x24\n" + "addvl x26, x26, #2\n" + "ld1w { z26.s }, p1/Z, [x22]\n" + "ld1w { z20.s }, p1/Z, [x20]\n" + "addvl x21, x21, #2\n" + "zip1 z19.s, z18.s, z17.s\n" + "zip2 z18.s, z18.s, z17.s\n" + "ld1w { z25.s }, p1/Z, [x22, #1, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x20, #1, MUL VL]\n" + "addvl x22, x22, #2\n" + "zip1 z17.s, z21.s, z16.s\n" + "zip2 z16.s, z21.s, z16.s\n" + "addvl x20, x20, #2\n" + ".inst 0x658aa677 // bfcvt z23.h, p1/M, z19.s\n" + "zip1 z22.s, z26.s, z20.s\n" + ".inst 0x658aa655 // bfcvt z21.h, p1/M, z18.s\n" + "zip2 z20.s, z26.s, z20.s\n" + ".inst 0x658aa633 // bfcvt z19.h, p1/M, z17.s\n" + "zip1 z18.s, z25.s, z24.s\n" + ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n" + "zip2 z16.s, z25.s, z24.s\n" + ".inst 0x648aa6d7 // bfcvtnt z23.h, p1/M, z22.s\n" + ".inst 0x648aa695 // bfcvtnt z21.h, p1/M, z20.s\n" + ".inst 0x648aa653 // bfcvtnt z19.h, p1/M, z18.s\n" + ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n" + "st1h { z23.h }, p1, [x23]\n" + "st1h { z21.h }, p1, [x23, #1, MUL VL]\n" + "add x23, x23, %x[out_stride]\n" + "st1h { z19.h }, p1, [x23]\n" + "st1h { z17.h }, p1, [x23, #1, MUL VL]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x25, 5f\n" + "4:" // Main row loop: Column loop + "whilelt p0.s, XZR, x25\n" + "decd x25, ALL, MUL #2\n" + "ld1w { z19.s }, p0/Z, [x26]\n" + "addvl x26, x26, #1\n" + "ld1w { z16.s }, p0/Z, [x21]\n" + "addvl x21, x21, #1\n" + "ld1w { z20.s }, p0/Z, [x22]\n" + "addvl x22, x22, #1\n" + "ld1w { z18.s }, p0/Z, [x20]\n" + "addvl x20, x20, #1\n" + "cmp x25, #0x0\n" + "zip1 z17.s, z19.s, z16.s\n" + "zip2 z16.s, z19.s, z16.s\n" + "zip1 z19.s, z20.s, z18.s\n" + "zip2 z18.s, z20.s, z18.s\n" + ".inst 0x658aa631 // bfcvt z17.h, p1/M, z17.s\n" + ".inst 0x658aa610 // bfcvt z16.h, p1/M, z16.s\n" + ".inst 0x648aa671 // bfcvtnt z17.h, p1/M, z19.s\n" + ".inst 0x648aa650 // bfcvtnt z16.h, p1/M, z18.s\n" + "st1h { z17.h }, p1, [x23]\n" + "st1h { z16.h }, p1, [x23, #1, MUL VL]\n" + "add x23, x23, %x[out_stride]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "cmp %x[height], #0x1\n" + "addvl %x[out], %x[out], #2\n" + "bge 1b\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "x20", 
"x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26" + ); +} + +} // anonymous namespace +template<> +void Transform<2, 4, true, VLType::SVE>( + bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_2VL_2x4_fp32bf16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(float), + (kmax-k0) + ); +} + + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h index cbf540bd71..c619788125 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/list.h +++ b/src/core/NEON/kernels/batchnormalization/impl/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H -#define SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H +#define ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H namespace arm_compute { @@ -37,8 +37,23 @@ DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization); DECLARE_BATCH_NORMALIZATION_KERNEL(fp32_neon_batch_normalization); DECLARE_BATCH_NORMALIZATION_KERNEL(fp32_sve_batch_normalization); -#undef DECLARE_ACTIVATION_KERNEL +#define DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(func_name) \ + void func_name(const Window &window, ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, \ + const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo act_info) + +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_relu); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_brelu); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_lubrelu); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_relu); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_brelu); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_lubrelu); + +#undef DECLARE_BATCH_NORMALIZATION_KERNEL +#undef DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL + } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h index cec437d171..32d38a856c 100644 --- a/src/core/NEON/wrapper/intrinsics/max.h +++ b/src/core/NEON/wrapper/intrinsics/max.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_WRAPPER_MAX_H -#define ARM_COMPUTE_WRAPPER_MAX_H +#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H +#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H #include @@ -59,6 +59,39 @@ VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VMAX_IMPL + +#if defined(__aarch64__) +// VMAXV: Across vector max +#define VMAXV_IMPL(stype, vtype, prefix, postfix) \ + inline stype vmaxv(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VMAXV_IMPL(uint8_t, uint8x8_t, vmaxv, u8) +VMAXV_IMPL(int8_t, int8x8_t, vmaxv, s8) +VMAXV_IMPL(uint16_t, uint16x4_t, vmaxv, u16) +VMAXV_IMPL(int16_t, int16x4_t, vmaxv, s16) +VMAXV_IMPL(uint32_t, uint32x2_t, vmaxv, u32) +VMAXV_IMPL(int32_t, int32x2_t, vmaxv, s32) +VMAXV_IMPL(float, float32x2_t, vmaxv, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMAXV_IMPL(float16_t, float16x4_t, vmaxv, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VMAXV_IMPL(uint8_t, uint8x16_t, vmaxvq, u8) +VMAXV_IMPL(int8_t, int8x16_t, vmaxvq, s8) +VMAXV_IMPL(uint16_t, uint16x8_t, vmaxvq, u16) +VMAXV_IMPL(int16_t, int16x8_t, vmaxvq, s16) +VMAXV_IMPL(uint32_t, uint32x4_t, vmaxvq, u32) +VMAXV_IMPL(int32_t, int32x4_t, vmaxvq, s32) +VMAXV_IMPL(float, float32x4_t, vmaxvq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMAXV_IMPL(float16_t, float16x8_t, vmaxvq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VMAXV_IMPL +#endif // defined(__aarch64__) } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MAX_H */ +#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 90a7ac32c0..532d08de92 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -450,8 +450,9 @@ std::pair get_quantized_activation_min_max(const ActivationLay const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info); const auto type_max_value = std::get<1>(get_min_max(data_type)).get(); - const int32_t min_activation = - act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int; + const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU + ? std::min(oq_info.offset, type_max_value) + : b_int; const int32_t max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int; diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h index 686304b8d7..50b3fc1284 100644 --- a/src/core/common/Registrars.h +++ b/src/core/common/Registrars.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022 Arm Limited. + * Copyright (c) 2020-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
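A small usage sketch for the across-vector max wrapper added above: for a float32x4_t, wrapper::vmaxv resolves to vmaxvq_f32, the horizontal maximum of the four lanes (aarch64 only).

#include <arm_neon.h>

inline float max_of_four(const float *ptr) // assumes four valid elements at ptr
{
    const float32x4_t v = vld1q_f32(ptr);
    return vmaxvq_f32(v); // equivalently: arm_compute::wrapper::vmaxv(v)
}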
*/ -#ifndef SRC_CORE_COMMON_REGISTRARS_H -#define SRC_CORE_COMMON_REGISTRARS_H +#ifndef ACL_SRC_CORE_COMMON_REGISTRARS_H +#define ACL_SRC_CORE_COMMON_REGISTRARS_H #if defined(ENABLE_FP16_KERNELS) @@ -38,11 +38,11 @@ #define REGISTER_FP16_SVE2(func_name) nullptr #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(ARM_COMPUTE_ENABLE_NEON) #define REGISTER_FP16_NEON(func_name) &(func_name) #else /* !defined(ARM_COMPUTE_ENABLE_NEON) */ #define REGISTER_FP16_NEON(func_name) nullptr -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ #else /* !defined(ENABLE_FP16_KERNELS) */ #define REGISTER_FP16_NEON(func_name) nullptr @@ -179,4 +179,4 @@ #define REGISTER_BF16_NEON(func_name) nullptr #endif /* defined(ARM_COMPUTE_ENABLE_BF16)*/ -#endif /* SRC_CORE_COMMON_REGISTRARS_H */ +#endif // ACL_SRC_CORE_COMMON_REGISTRARS_H diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp new file mode 100644 index 0000000000..06e35eed8c --- /dev/null +++ b/src/core/helpers/LUTManager.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/helpers/LUTManager.h" + +namespace arm_compute +{ +#ifdef __aarch64__ +namespace +{ + +void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut) +{ + union Element + { + uint16_t i = 0; + float16_t fp; + } item; + // Fill lut by iterating over all 16 bit values using the union. + while (true) + { + (*lut)[item.i] = 1.f / (1.f + std::exp(-item.fp)); + if (item.i == 65535) + break; + item.i++; + } +} +} // namespace + +std::shared_ptr LUTManager::get_lut_table(LUTInfo info) +{ + const auto itr = map_fp16.find(info); + auto s_ptr = (itr != map_fp16.end()) ? itr->second.lock() : nullptr; // nullptr if invalid or not found. 
+ if (s_ptr != nullptr) + { + // Found and valid + return s_ptr; // Return weak ptr as shared ptr + } + else + { + // Not found, or pointer not valid + // We do not use make_shared to prevent the weak_ptr keeping the control block alive + std::shared_ptr ptr(new ActivationLayerInfo::LookupTable65536); + init_lut_fp16(ptr.get()); + map_fp16[info] = ptr; + return ptr; + } +} +#endif // __aarch64__ + +// Static function to get LutManager instance +LUTManager &LUTManager::get_instance() +{ + static auto inst_ = std::make_unique(); // The one, single instance. + return *inst_; +} + +} // namespace arm_compute diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h new file mode 100644 index 0000000000..4e13ead7e3 --- /dev/null +++ b/src/core/helpers/LUTManager.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CORE_HELPERS_LUTMANAGER_H +#define ACL_SRC_CORE_HELPERS_LUTMANAGER_H + +#include "arm_compute/core/CoreTypes.h" +#include "arm_compute/core/QuantizationInfo.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include +#include + +namespace arm_compute +{ + +struct LUTInfo +{ + ActivationLayerInfo::ActivationFunction act; + DataType dt; + QuantizationInfo qinfo; + // Operators enable use of map with Lutinfo as key + friend bool operator<(const LUTInfo &l, const LUTInfo &r) + { + return (l.act < r.act) || ((l.act == r.act) && (l.dt < r.dt)) || + ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() < r.qinfo.scale())) || + ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() == r.qinfo.scale()) && + (l.qinfo.offset() < l.qinfo.offset())); + } + bool operator==(const LUTInfo &l) + { + return this->act == l.act && this->dt == l.dt && this->qinfo == l.qinfo; + } +}; + +/* Class to handle getting look up table */ +class LUTManager +{ +public: + LUTManager() = default; + + static LUTManager &get_instance(); +#ifdef __aarch64__ + std::shared_ptr get_lut_table(LUTInfo info); + +private: + std::map> map_fp16{}; +#endif // __aarch64__ +}; + +} // namespace arm_compute +#endif // ACL_SRC_CORE_HELPERS_LUTMANAGER_H diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index 50bf672d3c..7cfa39b286 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. 
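One detail worth noting in the LUTInfo ordering above: the final clause compares l.qinfo.offset() with itself, which is presumably intended to be r.qinfo.offset(). A tuple-based lexicographic comparison is an equivalent, less error-prone sketch, assuming the scale() and offset() accessors are comparable with operator< as the original code already relies on:

#include <tuple>

inline bool lut_info_less(const LUTInfo &l, const LUTInfo &r)
{
    // Compare (act, dt, scale, offset) lexicographically, left against right.
    return std::make_tuple(l.act, l.dt, l.qinfo.scale(), l.qinfo.offset()) <
           std::make_tuple(r.act, r.dt, r.qinfo.scale(), r.qinfo.offset());
}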
+ * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,7 +49,8 @@ static const std::vector available_kernel [](const ActivationDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && - data.cpumodel == CPUModel::A510 && data.isa.sve2; + data.cpumodel == CPUModel::A510 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::RELU; }, REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, #endif // ARM_COMPUTE_ENABLE_SVE @@ -57,7 +58,10 @@ static const std::vector available_kernel {// Neon LUT implementantion takes precedence "neon_q8_activation_lut", [](const ActivationDataTypeISASelectorData &data) - { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, + { + return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && + data.f != ActivationLayerInfo::ActivationFunction::RELU; + }, REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, #endif // __aarch64__ {"sve2_qu8_activation", @@ -79,6 +83,13 @@ static const std::vector available_kernel data.f != ActivationLayerInfo::ActivationFunction::GELU; }, REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, + {"sve_fp16_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve && + data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC; + }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)}, {"sve_fp16_activation", [](const ActivationDataTypeISASelectorData &data) { @@ -214,9 +225,6 @@ void init_lut(ActivationLayerInfo::ActivationFunction act_func, case ActivationLayerInfo::ActivationFunction::LINEAR: tmp_f = a * tmp_f + b; break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp_f = std::max<>(0.f, tmp_f); - break; case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: tmp_f = std::min<>(a, std::max(0.f, tmp_f)); break; @@ -278,7 +286,11 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac _name = std::string("CpuActivationKernel").append("/").append(uk->name); #ifdef __aarch64__ - if (src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) + // Initialise lut_manager + LUTManager &lut_manager = LUTManager::get_instance(); + + if ((src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) && + activation_info.activation() != ActivationFunction::RELU) { ActivationLayerInfo::LookupTable256 tmp_lut; init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), @@ -286,6 +298,13 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac activation_info.a(), activation_info.b()); activation_info.setLookupTable256(tmp_lut); } + + if (src->data_type() == DataType::F16 && + activation_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + const LUTInfo info = {activation_info.activation(), src->data_type(), src->quantization_info()}; + activation_info.setLookupTable65536((lut_manager.get_lut_table(info))); + } #endif // __aarch64__ _act_info = activation_info; diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h index 4bad9fb3e8..c1487499d6 100644 --- a/src/cpu/kernels/CpuActivationKernel.h +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
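With the activation changes above, fp16 LOGISTIC is served by the shared 65536-entry table obtained from the LUTManager. A hypothetical helper (name and table element type are assumptions) showing how such a table is indexed at run time, every fp16 bit pattern being a valid index:

#include <cstdint>
#include <cstring>

inline float16_t logistic_via_lut(const ActivationLayerInfo::LookupTable65536 &lut, float16_t x)
{
    uint16_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // raw fp16 bit pattern = table index
    return lut[bits];                     // replaces exp() and the division with one load
}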
* * SPDX-License-Identifier: MIT * @@ -21,12 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H -#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" #include "src/core/common/Macros.h" +#include "src/core/helpers/LUTManager.h" #include "src/cpu/ICpuKernel.h" namespace arm_compute @@ -103,4 +104,4 @@ class CpuActivationKernel : public ICpuKernel } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index b7daa4d583..45ebeec394 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H -#define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H +#ifndef ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H +#define ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H #include "arm_compute/core/Types.h" @@ -99,6 +99,13 @@ struct ScaleKernelDataTypeISASelectorData InterpolationPolicy interpolation_policy; }; +struct SoftmaxKernelDataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; + bool is_log; +}; + // Selector pointer types using DataTypeISASelectorPtr = std::add_pointer::type; using DataTypeDataLayoutSelectorPtr = std::add_pointer::type; @@ -113,9 +120,10 @@ using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer::type; using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer::type; - +using SoftmaxKernelDataTypeISASelectorDataPtr = + std::add_pointer::type; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif // ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H +#endif // ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp index ba086e3ac6..8001482154 100644 --- a/src/cpu/kernels/CpuMulKernel.cpp +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,12 +26,14 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/mul/generic/neon/list.h" #include @@ -1170,108 +1172,6 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con } } -void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(float); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - if (is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto ta1 = wrapper::vloadq(input1_ptr + x); - const auto ta2 = wrapper::vloadq(input2_ptr + x); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); - } -} - void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window) { // Create input windows @@ -1409,115 +1309,6 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, } } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != 
src2->info()->tensor_shape().x(); - if (is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float16x8x2_t broadcast_value_vec = {{ - vdupq_n_f16(broadcast_value), - vdupq_n_f16(broadcast_value), - }}; - const auto scale_vec = vdupq_n_f16(scale); - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t non_broadcast_v = {{ - vld1q_f16(non_broadcast_input_ptr + x), - vld1q_f16(non_broadcast_input_ptr + x + 8), - }}; - const float16x8x2_t result = {{ - vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), - vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), - }}; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t ta1 = {{ - vld1q_f16(input1_ptr + x), - vld1q_f16(input1_ptr + x + 8), - }}; - const float16x8x2_t ta2 = {{ - vld1q_f16(input2_ptr + x), - vld1q_f16(input2_ptr + x + 8), - }}; - const float16x8_t scale_vec = vdupq_n_f16(scale); - const float16x8x2_t result = {{ - vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), - vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), - }}; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); - } -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - template void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) { @@ -1857,13 +1648,11 @@ void 
CpuMulKernel::configure(ITensorInfo *src1, } } break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func_float = &mul_F16_F16_F16; + _func_float = REGISTER_FP16_NEON(cpu::mul_F16_F16_F16); break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: - _func_float = &mul_F32_F32_F32; + _func_float = REGISTER_FP32_NEON(cpu::mul_F32_F32_F32); break; default: ARM_COMPUTE_ERROR("You called with the wrong img formats"); diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp index 9308d860d1..2c9627bdee 100644 --- a/src/cpu/kernels/CpuPool2dKernel.cpp +++ b/src/cpu/kernels/CpuPool2dKernel.cpp @@ -271,11 +271,9 @@ std::pair validate_and_configure_window(ITensorInfo * break; } break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: num_elems_processed_per_iteration = 1; break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: num_elems_processed_per_iteration = 1; break; diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp index ce144351f8..486f55e2c1 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -34,9 +34,12 @@ #include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/Utils.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/softmax/list.h" +#include + namespace arm_compute { namespace cpu @@ -45,136 +48,40 @@ namespace kernels { namespace { -/* Softmax Logits 1D Max - identifying the max value of 1D Logits */ -static const std::vector available_kernels_max_logits = { - {"sve_fp32_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_logits)}, - {"sve_fp16_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_logits)}, - {"sve_qu8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, - REGISTER_QASYMM8_SVE(sve_qasymm8_logits)}, - {"sve_qs8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, - REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)}, - {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_logits)}, - {"neon_fp16_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_logits)}, - {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(neon_qasymm8_logits)}, - {"neon_qs8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)}, +/* Softmax */ +static const std::vector available_kernels = { + {"neon_fp32_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax)}, + {"neon_fp16_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax)}, + {"neon_qu8_softmax", + [](const 
SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, + {"neon_qs8_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, + {"neon_fp32_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax)}, + {"neon_fp16_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (data.is_log && data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax)}, + {"neon_qu8_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, + {"neon_qs8_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (data.is_log && data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, }; -Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::F16, DataType::F32); - - // Validate in case of configured output - if (output.total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), - TensorShape(input.tensor_shape()).set(0, 1)); - } - - return Status{}; -} -} //namespace -const std::vector &CpuLogits1DMaxKernel::get_available_kernels() -{ - return available_kernels_max_logits; -} - -void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst)); - - // Softmax across the x dimension - const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1); - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - - const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); - ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - _run_method = uk->ukernel; - _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name); - - Window win = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win); -} - -Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst)); - - return Status{}; -} - -void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src, dst, window); 
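The kernels registered above fuse the former max and softmax passes into one ukernel per data type, selected by is_log. A scalar reference of what each kernel computes per row along the reduction axis (standard max-subtracted softmax; illustration only):

#include <algorithm>
#include <cmath>

void softmax_row_reference(const float *src, float *dst, int len, float beta, bool is_log)
{
    // Assumes len >= 1. The running max is now found inside the kernel itself.
    const float max_val = *std::max_element(src, src + len);
    float sum = 0.f;
    for (int i = 0; i < len; ++i)
    {
        dst[i] = std::exp((src[i] - max_val) * beta); // shift by max for numerical stability
        sum += dst[i];
    }
    for (int i = 0; i < len; ++i)
    {
        dst[i] = is_log ? (src[i] - max_val) * beta - std::log(sum) : dst[i] / sum;
    }
}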
-} - -const char *CpuLogits1DMaxKernel::name() const -{ - return _name.c_str(); -} - -/* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */ -template -static const std::vector::SoftmaxLogits1DKernel> available_kernels_logits = { - {"sve2_qu8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)}, - {"sve2_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)}, - {"sve_fp32_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_softmax)}, - {"sve_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_softmax)}, - - {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_softmax)}, - {"neon_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_softmax)}, - {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, - {"neon_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, -}; -namespace -{ -Status validate_arguments_logits_softmax(const ITensorInfo &src, - const ITensorInfo &max, - const ITensorInfo &dst, - const float beta, - const ITensorInfo &tmp, - bool is_log) +Status validate_arguments_softmax( + const ITensorInfo &src, const ITensorInfo &dst, float beta, const ITensorInfo &tmp, bool is_log) { ARM_COMPUTE_UNUSED(beta); // Check input @@ -184,11 +91,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); - // Check max - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); - // Check output if configured if (dst.total_size() != 0) { @@ -203,8 +105,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, // Check tmp if configured if (tmp.total_size() != 0) { - const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type(); - ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); + // We have temporary storage only if src data type is quantized. + // Therefore, tmp data type must be F32 + ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(!is_quantized_asymmetric); + // We could potentially reduce tmp memory if we could predict or make an assumption // on the maximum number of threads that will run in parallel. 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp); @@ -214,91 +119,97 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, } } // namespace -template -const std::vector::SoftmaxLogits1DKernel> & -CpuLogits1DSoftmaxKernel::get_available_kernels() +const std::vector &CpuSoftmaxKernel::get_available_kernels() { - return available_kernels_logits; + return available_kernels; } -template -void CpuLogits1DSoftmaxKernel::configure( - const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) +void CpuSoftmaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log)); // Configure kernel window const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); // Output auto initialization if not yet initialized const QuantizationInfo output_quantization = - is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), is_log) : dst->quantization_info(); auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); - // Tmp auto initialization if not yet initialized - const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); - auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); + // Tmp auto initialization if not yet initialized and src is quantized + if (is_quantized_asymmetric) + { + const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); + auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); + } - const auto *uk = CpuLogits1DSoftmaxKernel::get_implementation( - DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); + const auto *uk = CpuSoftmaxKernel::get_implementation( + SoftmaxKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), is_log}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - std::string kernel_name = - IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); + std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel"); _beta = beta; _run_method = uk->ukernel; _name = kernel_name.append("/").append(uk->name); - // Configure kernel window - Window win = calculate_max_window(*max, Steps()); + Window win = calculate_max_window(*dst, Steps()); + + /// TODO: Check dimensions > 0 for holes only. For this, we need + /// a utility function checking if there are holes after some dimension. 
+ if (!has_holes(*dst, dst->num_dimensions() - 1)) + { + win = win.collapse(win, Window::DimY); + } - ICpuKernel>::configure(win); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); // First dimension is the reduction axis + + ICpuKernel::configure(win); } -template -Status CpuLogits1DSoftmaxKernel::validate( - const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) +Status CpuSoftmaxKernel::validate( + const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log)); return Status{}; } -template -void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { - ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel>::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto max = tensors.get_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); - const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); - const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; + if (is_data_type_quantized_asymmetric(src->info()->data_type())) + { + auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); + + const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); + const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; - ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); + ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); - void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); - _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window); + void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); + _run_method(src, tmp_for_thread, dst, _beta, window); + } + else + { + _run_method(src, nullptr, dst, _beta, window); + } } -template -const char *CpuLogits1DSoftmaxKernel::name() const +const char *CpuSoftmaxKernel::name() const { return _name.c_str(); } -template class CpuLogits1DSoftmaxKernel; -template class CpuLogits1DSoftmaxKernel; - } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index 5d288179fd..3db1f3d0ef 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H -#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -33,102 +33,55 @@ namespace cpu { namespace kernels { -/** Interface for the identifying the max value of 1D Logits */ -class CpuLogits1DMaxKernel : public ICpuKernel +/** Interface for softmax computation */ +class CpuSoftmaxKernel : public ICpuKernel { private: - using SoftmaxLogits1DMaxKernelPtr = std::add_pointer::type; + using SoftmaxKernelPtr = + std::add_pointer::type; public: - CpuLogits1DMaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel); - /** Set the input and output tensors. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p input - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuLogits1DMaxKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - - struct SoftmaxLogits1DMaxKernel - { - const char *name; - const DataTypeISASelectorPtr is_selected; - SoftmaxLogits1DMaxKernelPtr ukernel; - }; - - static const std::vector &get_available_kernels(); - -private: - SoftmaxLogits1DMaxKernelPtr _run_method{nullptr}; - std::string _name{}; -}; - -/** Interface for softmax computation for QASYMM8 with pre-computed max. */ -template -class CpuLogits1DSoftmaxKernel : public ICpuKernel> -{ -private: - using SoftmaxLogits1DKernelPtr = std::add_pointer::type; - -public: - CpuLogits1DSoftmaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel); + CpuSoftmaxKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSoftmaxKernel); /** Set the input and output tensors. * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. - * Data types supported: same as @p input. - * @param[out] dst Destination tensor info. Data types supported: same as @p input. - * @param[in] beta A scaling factor for the exponent. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p input. + * @param[in] beta A scaling factor for the exponent. + * @param[in] is_log True if the operation is log-softmax * * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. 
*/ - void - configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp); /** Static function to check if given info will lead to a valid configuration * - * Similar to CpuLogits1DSoftmaxKernel::configure() + * Similar to CpuSoftmaxKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, - const ITensorInfo *max, - const ITensorInfo *dst, - const float beta, - const ITensorInfo *tmp); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; - struct SoftmaxLogits1DKernel + struct SoftmaxKernel { - const char *name; - const DataTypeISASelectorPtr is_selected; - SoftmaxLogits1DKernelPtr ukernel; + const char *name; + const SoftmaxKernelDataTypeISASelectorDataPtr is_selected; + SoftmaxKernelPtr ukernel; }; - static const std::vector &get_available_kernels(); + static const std::vector &get_available_kernels(); private: - float _beta{1.0f}; - SoftmaxLogits1DKernelPtr _run_method{nullptr}; - std::string _name{}; + float _beta{1.0f}; + SoftmaxKernelPtr _run_method{nullptr}; + std::string _name{}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp index f289c80d4b..ddd186f9cb 100644 --- a/src/cpu/kernels/activation/generic/neon/lut.cpp +++ b/src/cpu/kernels/activation/generic/neon/lut.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Arm Limited. + * Copyright (c) 2022-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,8 +34,9 @@ namespace cpu #ifdef __aarch64__ void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && - src->info()->data_type() != DataType::QASYMM8_SIGNED); + ARM_COMPUTE_ERROR_ON( // LUT does not provide any performance benefit for ReLU as it's a single max() operation + (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) || + act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU); const auto window_end_x = window.x().end(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp index 97399e01e0..19d9126556 100644 --- a/src/cpu/kernels/activation/generic/sve/fp16.cpp +++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023 Arm Limited. + * Copyright (c) 2020-2024 Arm Limited. 
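For context on the softmax rework earlier in this patch: the separate logits-1D-max kernel and the max argument are gone, and the reduction now happens along the first window dimension inside a single CpuSoftmaxKernel. Below is a minimal configuration sketch of the new internal signature; the shapes, quantization values, helper function name and the status check are illustrative assumptions only and are not part of this patch.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"

#include "src/cpu/kernels/CpuSoftmaxKernel.h"

using namespace arm_compute;
using cpu::kernels::CpuSoftmaxKernel;

// Hypothetical quantized use case: for QASYMM8/QASYMM8_SIGNED inputs the kernel now
// expects an F32 tmp tensor with the same shape as the input; for float inputs tmp is unused.
void configure_softmax_example()
{
    TensorInfo src(TensorShape(128U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 10));
    TensorInfo dst{}; // auto-initialized by configure()
    TensorInfo tmp(TensorShape(128U, 32U), 1, DataType::F32);

    CpuSoftmaxKernel kernel;
    if (CpuSoftmaxKernel::validate(&src, &dst, /* beta */ 1.0f, /* is_log */ false, &tmp).error_code() == ErrorCode::OK)
    {
        kernel.configure(&src, &dst, 1.0f, false, &tmp);
    }
}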
* * SPDX-License-Identifier: MIT * @@ -30,6 +30,7 @@ #include "arm_compute/function_info/ActivationLayerInfo.h" #include "src/core/NEON/SVEMath.h" +#include "src/cpu/kernels/lut/list.h" #include #include @@ -141,6 +142,32 @@ void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayer }, input, output); } + +void sve_fp16_activation_lut(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) +{ + ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::F16); + const auto window_start_x = window.x().start(); + const auto window_end_x = window.x().end(); + const auto size = window_end_x - window_start_x; + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + lut_u16_sve(reinterpret_cast(act_info.lut_fp16().data()), 1U /* num_strings (UNUSED) */, + size, input_ptr + window_start_x, output_ptr + window_start_x); + }, + input, output); +} } // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp index 2ed667debf..5db8595a75 100644 --- a/src/cpu/kernels/activation/generic/sve2/lut.cpp +++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Arm Limited. + * Copyright (c) 2022-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,8 +34,9 @@ namespace cpu #ifdef __aarch64__ void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && - src->info()->data_type() != DataType::QASYMM8_SIGNED); + ARM_COMPUTE_ERROR_ON( // LUT does not provide any performance benefit for ReLU as it's a single max() operation + (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) || + act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU); const auto window_end_x = window.x().end(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h index 6550ddfeca..8c24adc3fe 100644 --- a/src/cpu/kernels/activation/list.h +++ b/src/cpu/kernels/activation/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023 Arm Limited. + * Copyright (c) 2020-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H -#define SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H +#define ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H namespace arm_compute { @@ -42,6 +42,7 @@ DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_signed_activation); DECLARE_ACTIVATION_KERNEL(neon_qsymm16_activation); DECLARE_ACTIVATION_KERNEL(sve2_qsymm16_activation); DECLARE_ACTIVATION_KERNEL(sve_fp16_activation); +DECLARE_ACTIVATION_KERNEL(sve_fp16_activation_lut); DECLARE_ACTIVATION_KERNEL(sve_fp32_activation); DECLARE_ACTIVATION_KERNEL(neon_fp16_activation); DECLARE_ACTIVATION_KERNEL(neon_fp32_activation); @@ -50,4 +51,4 @@ DECLARE_ACTIVATION_KERNEL(neon_fp32_activation); } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H */ +#endif // ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/depth_to_space/list.h similarity index 63% rename from src/cpu/kernels/softmax/generic/sve/fp32.cpp rename to src/cpu/kernels/depth_to_space/list.h index d692cc2477..9d0cd1e740 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp +++ b/src/cpu/kernels/depth_to_space/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,28 +22,26 @@ * SOFTWARE. */ -#include "arm_compute/core/Helpers.h" +#ifndef ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H +#define ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H -#include "src/cpu/kernels/softmax/generic/sve/impl.h" +#include namespace arm_compute { namespace cpu { -void sve_fp32_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); -} -void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max(in, out, window); -} +#define DECLARE_DEPTHTOSPACE_KERNEL(func_name) \ + void func_name(const uint8_t *src, uint8_t *dst, const uintptr_t src_shape[4], const uintptr_t src_strides[4], \ + const uintptr_t dst_strides[4], uintptr_t element_size, uintptr_t block_size) + +DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nhwc_any); +DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nchw_any); + +#undef DECLARE_DEPTHTOSPACE_KERNEL + } // namespace cpu } // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H diff --git a/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp new file mode 100644 index 0000000000..0277690112 --- /dev/null +++ b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Error.h"
+
+#include <cstdint>
+#include <cstring>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+void depth_to_space_nchw_any( //
+    const uint8_t *src,
+    uint8_t *dst,
+    const uintptr_t src_shape[4],
+    const uintptr_t src_strides[4],
+    const uintptr_t dst_strides[4],
+    uintptr_t element_size,
+    uintptr_t block_size)
+{
+    ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size);
+    ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size);
+
+    const auto dst_channels = src_shape[2] / (block_size * block_size);
+    const auto src_block_col_stride = dst_channels * src_strides[2];
+    const auto src_block_row_stride = block_size * dst_channels * src_strides[2];
+
+    auto *src_batch_ptr = src;
+    auto *dst_batch_ptr = dst;
+
+    for (uintptr_t batch = 0; batch < src_shape[3]; ++batch)
+    {
+        auto *src_channel_ptr = src_batch_ptr;
+        auto *dst_channel_ptr = dst_batch_ptr;
+
+        for (uintptr_t channel = 0; channel < dst_channels; ++channel)
+        {
+            auto *src_height_block_ptr = src_channel_ptr;
+            auto *dst_row_ptr = dst_channel_ptr;
+
+            for (uintptr_t height_block = 0; height_block < src_shape[1]; ++height_block)
+            {
+                auto *src_block_row_ptr = src_height_block_ptr;
+
+                for (uintptr_t block_row = 0; block_row < block_size; ++block_row)
+                {
+                    auto *src_width_block_ptr = src_block_row_ptr;
+                    auto *dst_col_ptr = dst_row_ptr;
+
+                    for (uintptr_t width_block = 0; width_block < src_shape[0]; ++width_block)
+                    {
+                        auto *src_block_col_ptr = src_width_block_ptr;
+
+                        for (uintptr_t block_col = 0; block_col < block_size; ++block_col)
+                        {
+                            // The source pointer is accumulated as:
+                            //
+                            // src_block_col_ptr =
+                            //     src +
+                            //     batch * src_strides[3] +
+                            //     (channel + (block_row * block_size + block_col) * dst_channels) * src_strides[2] +
+                            //     height_block * src_strides[1] +
+                            //     width_block * element_size;
+                            //
+                            // The destination pointer is accumulated as:
+                            //
+                            // dst_col_ptr =
+                            //     dst +
+                            //     batch * dst_strides[3] +
+                            //     channel * dst_strides[2] +
+                            //     (height_block * block_size + block_row) * dst_strides[1] +
+                            //     (width_block * block_size + block_col) * element_size
+
+                            std::memcpy(dst_col_ptr, src_block_col_ptr, element_size);
+
+                            src_block_col_ptr += src_block_col_stride;
+                            dst_col_ptr += element_size;
+                        }
+
+                        src_width_block_ptr += element_size;
+                    }
+
+                    src_block_row_ptr += src_block_row_stride;
+                    dst_row_ptr += dst_strides[1];
+                }
+
+                src_height_block_ptr += src_strides[1];
+            }
+
+            src_channel_ptr += src_strides[2];
+            dst_channel_ptr += dst_strides[2];
+        }
+
+        src_batch_ptr += src_strides[3];
+        dst_batch_ptr += dst_strides[3];
+    }
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp
new file mode 100644
index 0000000000..b1c84599dc
--- /dev/null
+++ b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
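To make the stride conventions of depth_to_space_nchw_any concrete, here is a hypothetical driver for a small contiguous FP32 NCHW tensor; the sizes and the surrounding scaffolding are illustrative only. Shapes are passed innermost-first as {W, H, C, N}, strides are in bytes, and both src_strides[0] and dst_strides[0] must equal the element size.

#include <cstdint>
#include <vector>

#include "src/cpu/kernels/depth_to_space/list.h"

void depth_to_space_example()
{
    // Illustrative sizes: N = 1, C = 4, H = 2, W = 3, block_size = 2, so the output is N = 1, C = 1, H = 4, W = 6.
    const uintptr_t W = 3, H = 2, C = 4, N = 1, bs = 2, es = sizeof(float);

    std::vector<float> src(N * C * H * W), dst(N * C * H * W);

    const uintptr_t src_shape[4]   = {W, H, C, N};
    const uintptr_t src_strides[4] = {es, W * es, W * H * es, W * H * C * es};
    // Contiguous NCHW destination of shape {W * bs, H * bs, C / (bs * bs), N}.
    const uintptr_t dst_strides[4] = {es, W * bs * es, W * bs * H * bs * es, W * bs * H * bs * (C / (bs * bs)) * es};

    arm_compute::cpu::depth_to_space_nchw_any(reinterpret_cast<const uint8_t *>(src.data()),
                                              reinterpret_cast<uint8_t *>(dst.data()), src_shape, src_strides,
                                              dst_strides, es, bs);
}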
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Error.h" + +#include +#include + +namespace arm_compute +{ +namespace cpu +{ + +void depth_to_space_nhwc_any( // + const uint8_t *src, + uint8_t *dst, + const uintptr_t src_shape[4], + const uintptr_t src_strides[4], + const uintptr_t dst_strides[4], + uintptr_t element_size, + uintptr_t block_size) +{ + ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size); + ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size); + + const auto src_block_row_stride = (src_shape[0] / block_size) * element_size; + const auto dst_width_block_stride = block_size * dst_strides[1]; + + auto *src_batch_ptr = src; + auto *dst_batch_ptr = dst; + + for (uintptr_t batch = 0; batch < src_shape[3]; ++batch) + { + auto *src_height_block_ptr = src_batch_ptr; + auto *dst_row_ptr = dst_batch_ptr; + + for (uintptr_t height_block = 0; height_block < src_shape[2]; ++height_block) + { + auto *src_block_row_ptr = src_height_block_ptr; + + for (uintptr_t block_row = 0; block_row < block_size; ++block_row) + { + auto *src_width_block_ptr = src_block_row_ptr; + auto *dst_width_block_ptr = dst_row_ptr; + + for (uintptr_t width_block = 0; width_block < src_shape[1]; ++width_block) + { + // The source pointer is accumulated as: + // + // src_width_block_ptr = + // src + + // batch * src_strides[3] + + // height_block * src_strides[2] + + // width_block * src_strides[1] + + // block_row * (src_shape[0] / block_size) * element_size; + // + // The destination pointer is accumulated as: + // + // dst_width_block_ptr = + // dst + + // batch * dst_strides[3] + + // (height_block * block_size + block_row) * dst_strides[2] + + // width_block * block_size * dst_strides[1]; + + std::memcpy(dst_width_block_ptr, src_width_block_ptr, src_block_row_stride); + + src_width_block_ptr += src_strides[1]; + dst_width_block_ptr += dst_width_block_stride; + } + + src_block_row_ptr += src_block_row_stride; + dst_row_ptr += dst_strides[2]; + } + + src_height_block_ptr += src_strides[2]; + } + + src_batch_ptr += src_strides[3]; + dst_batch_ptr += dst_strides[3]; + } +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h index d807148e37..0c90abccb1 100644 --- a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h +++ b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h @@ -32,6 +32,124 
@@ namespace arm_compute { namespace cpu { +template +void batch_normalization_nchw(const Window &window, + ITensor *in, + ITensor *out, + const ITensor *in_mean, + const ITensor *in_var, + const ITensor *in_beta, + const ITensor *in_gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win_to_use = window; + win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win_to_use); + Iterator output(out, win_to_use); + + F activation_functor(act_info); + + // Hold information about the current feature map we are iterating. + // Only compute denominator and constants once per feature map. + int slice = -1; + + const auto input_mean = reinterpret_cast(in_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(in_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (in_gamma != nullptr) ? reinterpret_cast(in_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (in_beta != nullptr) ? reinterpret_cast(in_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + + T mean = static_cast(0); + T var = static_cast(0); + T gamma = static_cast(1); + T beta = static_cast(0); + T denominator = static_cast(0); + + auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + auto var_vec = wrapper::vdup_n(var, ExactTagType{}); + auto gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + auto beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{}); + const auto epsilon_vec = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); + execute_window_loop( + win_to_use, + [&](const Coordinates &id) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + if (slice != id.z()) + { + mean = input_mean[id.z()]; + var = input_var[id.z()]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + if (input_gamma != nullptr) + { + gamma = input_gamma[id.z()]; + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + } + if (input_beta != nullptr) + { + beta = input_beta[id.z()]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Calculate denominator + denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + denominator = wrapper::vgetlane(denominator_vec, 0); + slice = id.z(); + } + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator_vec); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const T numerator = input_ptr[x] - mean; + const T x_bar = numerator * denominator; + T res = beta + x_bar * gamma; + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + *(output_ptr + x) = res; + } + }, + input, output); +} 
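As a reading aid for the vectorized template above: for each NCHW feature map (one z slice), the loop applies the standard batch-normalization expression per element, optionally followed by the fused activation. A minimal scalar sketch of that arithmetic follows; the helper name is hypothetical and not part of the patch.

#include <cmath>
#include <cstddef>

// out[i] = beta + gamma * (in[i] - mean) / sqrt(var + epsilon), with mean/var/gamma/beta
// taken once per channel, exactly as the NEON loop caches them per z slice.
void batch_norm_channel_ref(
    const float *in, float *out, std::size_t n, float mean, float var, float gamma, float beta, float epsilon)
{
    const float denominator = 1.0f / std::sqrt(var + epsilon);
    for (std::size_t i = 0; i < n; ++i)
    {
        const float x_bar = (in[i] - mean) * denominator;
        out[i]            = beta + x_bar * gamma; // a fused activation, if any, would be applied here
    }
}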
+ template void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp new file mode 100644 index 0000000000..ae4c7e5736 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp16_batch_normalization_nchw_non_fused(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, + gamma, epsilon, act_info); +} + +void fp16_batch_normalization_nchw_non_fused_relu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp16_batch_normalization_nchw_non_fused_brelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, + gamma, epsilon, act_info); +} + +void fp16_batch_normalization_nchw_non_fused_lubrelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, + gamma, epsilon, act_info); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && 
defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp new file mode 100644 index 0000000000..ae2db1ac66 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp32_batch_normalization_nchw_non_fused(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp32_batch_normalization_nchw_non_fused_relu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp32_batch_normalization_nchw_non_fused_brelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp32_batch_normalization_nchw_non_fused_lubrelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp index 32d9ca4eac..296fe88791 100644 --- 
a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp @@ -239,12 +239,12 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); break; -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(ENABLE_FP16_KERNELS) case DataType::F16: create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name); break; -#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(ENABLE_FP16_KERNELS) case DataType::F32: create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name); break; diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp index a161c800fd..9ba2451482 100644 --- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -79,11 +79,11 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, create_arm_pooling(src, dst, info, cpu_info); } break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if defined(ENABLE_FP16_KERNELS) case DataType::F16: create_arm_pooling(src, dst, info, cpu_info); break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif // defined(ENABLE_FP16_KERNELS) case DataType::F32: create_arm_pooling(src, dst, info, cpu_info); break; diff --git a/src/cpu/kernels/lut/generic/sve/u16.cpp b/src/cpu/kernels/lut/generic/sve/u16.cpp new file mode 100644 index 0000000000..75b8dcaae2 --- /dev/null +++ b/src/cpu/kernels/lut/generic/sve/u16.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Error.h" + +#include "src/cpu/kernels/lut/list.h" + +#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include + +namespace arm_compute +{ +namespace cpu +{ +void lut_u16_sve(const uint16_t *table, size_t num_strings, size_t size, const uint16_t *input, uint16_t *output) +{ + int64_t cnth = svcnth(); + int64_t tail = size & (4 * cnth - 1); + int64_t count = size - tail; + int64_t pos = 0; + ARM_COMPUTE_UNUSED(num_strings); + __asm __volatile("cbz %[count], 2f\n" + "mov z31.s, #0\n" + "cnth x7, ALL, MUL #4\n" + "cntb x8, ALL, MUL #4\n" + "ptrue p0.b\n" + "1:" + "ld1h z0.h, p0/z, [%[input]]\n" + "ld1h z1.h, p0/z, [%[input], #1, MUL VL]\n" + "ld1h z2.h, p0/z, [%[input], #2, MUL VL]\n" + "ld1h z3.h, p0/z, [%[input], #3, MUL VL]\n" + "add %[input], %[input], x8\n" + + "zip1 z8.h, z0.h, z31.h\n" + "ld1h z8.s, p0/z, [%[table], z8.s, UXTW #1]\n" + "zip2 z0.h, z0.h, z31.h\n" + "ld1h z0.s, p0/z, [%[table], z0.s, UXTW #1]\n" + "uzp1 z0.h, z8.h, z0.h\n" + "st1h z0.h, p0, [%[output]]\n" + + "zip1 z10.h, z1.h, z31.h\n" + "ld1h z10.s, p0/z, [%[table], z10.s, UXTW #1]\n" + "zip2 z1.h, z1.h, z31.h\n" + "ld1h z1.s, p0/z, [%[table], z1.s, UXTW #1]\n" + "uzp1 z1.h, z10.h, z1.h\n" + "st1h z1.h, p0, [%[output], #1, MUL VL]\n" + + "zip1 z12.h, z2.h, z31.h\n" + "ld1h z12.s, p0/z, [%[table], z12.s, UXTW #1]\n" + "zip2 z2.h, z2.h, z31.h\n" + "ld1h z2.s, p0/z, [%[table], z2.s, UXTW #1]\n" + "uzp1 z2.h, z12.h, z2.h\n" + "st1h z2.h, p0, [%[output], #2, MUL VL]\n" + + "zip1 z14.h, z3.h, z31.h\n" + "ld1h z14.s, p0/z, [%[table], z14.s, UXTW #1]\n" + "zip2 z3.h, z3.h, z31.h\n" + "ld1h z3.s, p0/z, [%[table], z3.s, UXTW #1]\n" + "uzp1 z3.h, z14.h, z3.h\n" + "st1h z3.h, p0, [%[output], #3, MUL VL]\n" + + "add %[pos], %[pos], x7\n" + "add %[output], %[output], x8\n" + "cmp %[pos], %[count]\n" + "blt 1b\n" + "2:\n" + : [count] "+r"(count), [input] "+r"(input), [output] "+r"(output), [pos] "+r"(pos) + : [table] "r"(table) + : "memory", "cc", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", + "z14", "z31", "p0", "p1", "z2", "z3", "z4", "x7", "x8"); + for (int i = 0; i < tail; i++) + { + output[i] = table[input[i]]; + } +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SVE +#endif // __aarch64__ diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h index da90346267..9acfe97728 100644 --- a/src/cpu/kernels/lut/list.h +++ b/src/cpu/kernels/lut/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Arm Limited. + * Copyright (c) 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,8 +22,8 @@ * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_LUT_LIST_H -#define SRC_CORE_NEON_KERNELS_LUT_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_LUT_LIST_H +#define ACL_SRC_CPU_KERNELS_LUT_LIST_H #include #include @@ -34,17 +34,27 @@ namespace cpu { #ifdef __aarch64__ -#define DECLARE_LUT_KERNEL(func_name) \ +#define DECLARE_LUT_U8_KERNEL(func_name) \ void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \ uint8_t *const *output) -DECLARE_LUT_KERNEL(lut_u8_neon); -DECLARE_LUT_KERNEL(lut_u8_sve2); +DECLARE_LUT_U8_KERNEL(lut_u8_neon); +DECLARE_LUT_U8_KERNEL(lut_u8_sve2); + +#undef DECLARE_LUT_U8_KERNEL + +#define DECLARE_LUT_U16_KERNEL(func_name) \ + void func_name(const uint16_t *table, size_t num_strings, size_t string_length, const uint16_t *input, \ + uint16_t *output) + +DECLARE_LUT_U16_KERNEL(lut_u16_neon); +DECLARE_LUT_U16_KERNEL(lut_u16_sve); + +#undef DECLARE_LUT_U16_KERNEL -#undef DECLARE_LUT_KERNEL #endif // __aarch64__ } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_LUT_LIST_H +#endif // ACL_SRC_CPU_KERNELS_LUT_LIST_H diff --git a/src/cpu/kernels/mul/generic/neon/fp16.cpp b/src/cpu/kernels/mul/generic/neon/fp16.cpp new file mode 100644 index 0000000000..920f298527 --- /dev/null +++ b/src/cpu/kernels/mul/generic/neon/fp16.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" + +namespace arm_compute +{ +namespace cpu +{ +void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + constexpr int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float16x8x2_t broadcast_value_vec = {{ + vdupq_n_f16(broadcast_value), + vdupq_n_f16(broadcast_value), + }}; + const auto scale_vec = vdupq_n_f16(scale); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t non_broadcast_v = {{ + vld1q_f16(non_broadcast_input_ptr + x), + vld1q_f16(non_broadcast_input_ptr + x + 8), + }}; + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), + vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + // Compute window_step_x elements per iteration + int x = 
window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t ta1 = {{ + vld1q_f16(input1_ptr + x), + vld1q_f16(input1_ptr + x + 8), + }}; + const float16x8x2_t ta2 = {{ + vld1q_f16(input2_ptr + x), + vld1q_f16(input2_ptr + x + 8), + }}; + const float16x8_t scale_vec = vdupq_n_f16(scale); + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), + vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/mul/generic/neon/fp32.cpp b/src/cpu/kernels/mul/generic/neon/fp32.cpp new file mode 100644 index 0000000000..3001eb5110 --- /dev/null +++ b/src/cpu/kernels/mul/generic/neon/fp32.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" + +namespace arm_compute +{ +namespace cpu +{ +void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(float); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto ta1 = wrapper::vloadq(input1_ptr + x); + const auto 
ta2 = wrapper::vloadq(input2_ptr + x); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/mul/generic/neon/list.h similarity index 73% rename from src/cpu/kernels/softmax/generic/sve/qasymm8.cpp rename to src/cpu/kernels/mul/generic/neon/list.h index 85e5ccfea1..710cb68b72 100644 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp +++ b/src/cpu/kernels/mul/generic/neon/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,18 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - +#ifndef ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H namespace arm_compute { namespace cpu { -void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max(in, out, window); -} +#define DECLARE_MUL_KERNEL(func_name) \ + void func_name(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) + +DECLARE_MUL_KERNEL(mul_F32_F32_F32); +DECLARE_MUL_KERNEL(mul_F16_F16_F16); +#undef DECLARE_MUL_KERNEL } // namespace cpu } // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/norm_layer/generic/neon/fp16.cpp similarity index 50% rename from src/cpu/kernels/softmax/generic/sve/fp16.cpp rename to src/cpu/kernels/norm_layer/generic/neon/fp16.cpp index 5e94f72faf..f85fe7a31a 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp +++ b/src/cpu/kernels/norm_layer/generic/neon/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,29 +22,46 @@ * SOFTWARE. 
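The two pixel-wise multiplication kernels above implement the same element-wise operation and differ only in vector width and in how an input broadcast along x is handled: out = src1 * src2 * scale, where an input whose x-extent is 1 is reused for every element along x. A hypothetical scalar reference (not part of the patch):

#include <cstddef>

// Scalar reference for one row of mul_F32_F32_F32 / mul_F16_F16_F16.
// When broadcast_src2 is true, src2 holds a single value that is broadcast along x.
void mul_row_ref(const float *src1, const float *src2, float *out, std::size_t n, float scale, bool broadcast_src2)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        const float rhs = broadcast_src2 ? src2[0] : src2[i];
        out[i]          = src1[i] * rhs * scale;
    }
}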
*/ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "arm_compute/core/Helpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/CpuTypes.h" -#include "src/cpu/kernels/softmax/generic/sve/impl.h" +#include "src/cpu/kernels/norm_layer/generic/neon/impl.h" + namespace arm_compute { namespace cpu { -void sve_fp16_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) + +void neon_normalize_float16_8_0_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float16_8_0( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float16_8_1_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float16_8_1( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) { - return sve_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); + arm_compute::normalize_float(window, in, in_squared, out, ninfo); } -void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window) +void neon_normalize_float16_8_2( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) { - return sve_logits_1d_max(in, out, window); + arm_compute::normalize_float(window, in, in_squared, out, ninfo); } + } // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp b/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp new file mode 100644 index 0000000000..0b64f46956 --- /dev/null +++ b/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/norm_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_normalize_float32_4_0_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_0( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_1_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_1( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_2( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/norm_layer/generic/neon/impl.h b/src/cpu/kernels/norm_layer/generic/neon/impl.h new file mode 100644 index 0000000000..6103165679 --- /dev/null +++ b/src/cpu/kernels/norm_layer/generic/neon/impl.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2017-2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/NormalizationHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +/** Function to perform normalization depending on the given template + * dimension. The second template parameter specifies whether the + * normalization has to be 1D or 2D. 
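 * (Illustrative summary, not part of the original comment: in scalar form the kernel computes,
 *  per element,
 *      out(x) = in(x) / std::pow(ninfo.kappa() + ninfo.scale_coeff() * sum_sq(x), ninfo.beta())
 *  where sum_sq(x) accumulates in_squared over the norm_size window around x, either along one
 *  dimension (1D) or over X and Y (2D), matching the sequential_normalization path below.)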
+ * + * @note Only supported normalizations are: + * - 1D over X or Z + * - 2D over X and Y + * + * @param[in] window Region on which to execute the kernel. + * @param[in] in Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. Data layouts supported: NCHW/NHWC. + * @param[in] in_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM], + * Data type and layout supported: same as @p input. + * @param[in] out Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input. + * @param[in] ninfo Normalization layer information like the normalization type, normalization size and other parameters. + */ +template +void normalize_float( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const int window_step_x = S; + + Iterator input(in, win); + Iterator input_squared(in_squared, win); + Iterator output(out, win); + + const int dim_y = in->info()->data_layout() == DataLayout::NCHW ? 1 : 2; + const int radius = ninfo.norm_size() / 2; + const int input_squared_stride_x = in_squared->info()->strides_in_bytes()[0]; + const int input_squared_stride_slice = in_squared->info()->strides_in_bytes()[dim]; + const int input_squared_stride_row = in_squared->info()->strides_in_bytes()[dim_y]; + + const int max_right = in->info()->dimension(dim) - 1; + const int max_bottom = in->info()->dimension(dim_y) - 1; + + const auto coeff_vec = wrapper::vdup_n(static_cast(ninfo.scale_coeff()), ExactTagType{}); + const auto beta_vec = wrapper::vdup_n(static_cast(ninfo.beta()), ExactTagType{}); + const auto kappa_vec = wrapper::vdup_n(static_cast(ninfo.kappa()), ExactTagType{}); + + auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row, + const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr, + T *output_ptr) + { + const int current_slice = dim == 0 ? 
x : id[dim]; + const int first_slice = std::max(current_slice - radius, 0); + const int last_slice = std::min(current_slice + radius, max_right); + + const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x; + // Accumulate 2D In-Map values + auto accu = static_cast(0.f); + for (int j = first_row; j <= last_row; ++j) + { + // Compute row displacement + const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; + for (int i = first_slice; i <= last_slice; ++i) + { + accu += + *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); + } + } + + // Normalize + const auto normalized = + std::pow(accu * static_cast(ninfo.scale_coeff()) + static_cast(ninfo.kappa()), ninfo.beta()); + const auto normalized_pixel = (*(input_ptr + x)) / normalized; + *(output_ptr + x) = normalized_pixel; + }; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; + const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; + + int x = window_start_x; + // Compute serially starting elements for the case x dimension is width + for (; x < radius && x < window_end_x && dim == 0; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } + + // Compute vectorized + for (; x <= window_end_x - window_step_x - radius; x += window_step_x) + { + const int current_slice = dim == 0 ? x : id[dim]; + const int first_slice = std::max(current_slice - radius, 0); + const int last_slice = std::min(current_slice + radius, max_right); + + const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; + // Accumulate 2D In-Map values + auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + for (int j = first_row; j <= last_row; ++j) + { + // Compute row displacement + const uint8_t *const input_squared_ptr = + input_squared_x_ptr + (j - current_row) * input_squared_stride_row; + for (int i = first_slice; i <= last_slice; ++i) + { + accu = wrapper::vadd( + accu, wrapper::vloadq(reinterpret_cast( + input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + } + } + + // Normalize + const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); + const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); + wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } + }, + input, input_squared, output); +} + +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/norm_layer/generic/neon/list.h similarity index 52% rename from src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp rename to src/cpu/kernels/norm_layer/generic/neon/list.h index 95623786b3..f2e83d7af1 100644 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/norm_layer/generic/neon/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm 
Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,24 +21,29 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve2/impl.h" - +#ifndef ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H namespace arm_compute { namespace cpu { -void sve2_qasymm8_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve2_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); -} + +#define DECLARE_NORMALIZATION_KERNEL(func_name) \ + void func_name(const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, \ + NormalizationLayerInfo ninfo) + +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_0_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_0); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_1_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_1); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_2); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_0_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_0); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_1_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_1); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_2); + +#undef DECLARE_NORMALIZATION_KERNEL } // namespace cpu } // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index 2e2adf33e0..db8f881712 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -31,21 +31,18 @@ namespace arm_compute { namespace cpu { -void neon_fp16_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return neon_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); -} -void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window) +template +void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_logits_1d_max(in, out, window); + return neon_softmax_float(in, tmp, out, beta, window); } + +template void +neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index 61df40c1b5..c281d1bf31 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
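// Note: the template parameter lists appear to have been stripped from this extract. Based on the
// explicit instantiations above, the refactored softmax entry points are presumably declared along
// these lines, with the former runtime is_log flag promoted to a compile-time IS_LOG parameter
// (reconstruction, not patch text):
//
//   template <bool IS_LOG>
//   void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out,
//                          const float beta, const Window &window);
//
//   template void neon_fp16_softmax<true>(const ITensor *, void *const, ITensor *,
//                                         const float, const Window &);
//   template void neon_fp16_softmax<false>(const ITensor *, void *const, ITensor *,
//                                          const float, const Window &);
//
// The separate *_logits (1D max) kernels disappear because the max is now computed inside the
// same softmax kernel.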
* * SPDX-License-Identifier: MIT * @@ -29,20 +29,17 @@ namespace arm_compute { namespace cpu { -void neon_fp32_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return neon_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); -} -void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window) +template +void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_logits_1d_max(in, out, window); + return neon_softmax_float(in, tmp, out, beta, window); } + +template void +neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index 5d6e6a4f80..487f6ae051 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -29,43 +29,76 @@ namespace arm_compute { namespace cpu { -template void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); - -template -void neon_softmax_logits_1d_quantized( - const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) +template +void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window) { static_assert(std::is_same::value || std::is_same::value, "quantized type should be either qasymm8_t or qasymm8_signed_t."); - const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = vdupq_n_f32(scale_beta); + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator out_it(out, window); - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); constexpr int vec_size = 16; +#ifndef __aarch64__ + const int sum_stages = log2(vec_size >> 1); +#endif // __aarch64__ + + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + execute_window_loop( window, [&](const Coordinates &) { /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); + const T *in_ptr = reinterpret_cast(in_it.ptr()); + T *out_ptr = reinterpret_cast(out_it.ptr()); + float *tmp_ptr = reinterpret_cast(tmp); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); + int x = 0; - float sum{}; - float sum_inversed{}; + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 
0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ + + // Compute left-over elements + for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // Compute Max + + float sum_transformed{}; /* Compute exponentials and sum */ { /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); /* Init sum to zero */ @@ -80,11 +113,11 @@ void neon_softmax_logits_1d_quantized( int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float(vec_elements); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + float32x4x4_t vec_elements_flt = convert_int_to_float(vec_elements); - if (is_log) + if (IS_LOG) { vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); @@ -111,17 +144,24 @@ void neon_softmax_logits_1d_quantized( } /* Reduce sum */ - const auto sum_16_byte = + const float32x4_t sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + + float sum; + +#ifdef __aarch64__ + sum = wrapper::vaddv(sum_16_byte); +#else // __aarch64__ auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); sum_res = vpadd_f32(sum_res, sum_res); sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ /* Run remaining elements */ for (; x < input_width; ++x) { float element{}; - if (is_log) + if (IS_LOG) { element = (max_val - in_ptr[x]) * scale_beta; sum += std::exp(element); @@ -135,19 +175,22 @@ void neon_softmax_logits_1d_quantized( tmp_ptr[x] = element; } - if (!is_log) + if (!IS_LOG) { - sum_inversed = 256.f / sum; + sum_transformed = 256.f / sum; } else { - sum = std::log(sum); + sum_transformed = std::log(sum); } - } + } // Compute exponentials and sum /* Normalize exponentials */ { constexpr bool is_qasymm8_signed = std::is_same::value; + + const float32x4_t sum_vec = vdupq_n_f32(sum_transformed); + /* Loop over row and compute softmax */ int x = 0; for (; x <= (input_width - vec_size); x += vec_size) @@ -155,23 +198,23 @@ void neon_softmax_logits_1d_quantized( using int_vec_type = wrapper::traits::neon_vector_t; float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); int_vec_type normalized_value{}; - if (is_log) + if (IS_LOG) { const float32x4x4_t sub = { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[0], sum_vec), + vsubq_f32(vec_in.val[1], sum_vec), + vsubq_f32(vec_in.val[2], sum_vec), + vsubq_f32(vec_in.val[3], sum_vec), }; normalized_value = convert_float_to_int(sub); } else { float32x4x4_t mul = { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[0], sum_vec), + vmulq_f32(vec_in.val[1], sum_vec), + vmulq_f32(vec_in.val[2], sum_vec), + vmulq_f32(vec_in.val[3], sum_vec), }; if (is_qasymm8_signed) @@ -190,34 +233,31 @@ void neon_softmax_logits_1d_quantized( 
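// For reference: on AArch64 the horizontal max/sum reductions above use wrapper::vmaxv /
// wrapper::vaddv directly, while other targets fall back to log2(vec_size / 2) pairwise folding
// stages. A minimal scalar sketch of that folding idea (assumes a power-of-two lane count;
// illustrative only, not library code):
#include <algorithm>
#include <cstddef>

float pairwise_max(float *lanes, std::size_t n)
{
    for (std::size_t width = n / 2; width >= 1; width /= 2)
    {
        for (std::size_t i = 0; i < width; ++i)
        {
            lanes[i] = std::max(lanes[i], lanes[i + width]); // one vpmax-like stage
        }
    }
    return lanes[0];
}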
/* Run remaining elements */ for (; x < input_width; ++x) { - if (is_log) + if (IS_LOG) { - out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); + out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum_transformed); } else { - out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - + out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_transformed) - (is_qasymm8_signed ? 128.f : 0)); } } - } + } // Normalize exponentials }, - in_it, max_it, out_it); + in_it, out_it); } -template void neon_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); -template void neon_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); +template void neon_softmax_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h index 4d9b789297..60380cd233 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.h +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H #include "arm_compute/core/Helpers.h" @@ -33,105 +33,100 @@ namespace arm_compute { namespace cpu { -template -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{window}; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - const int sum_stages = log2(window_step_x / 2); - execute_window_loop( - win, - [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); - int x = window_start_x; - - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - - for (int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); - // Compute left-over elements - for (; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? 
*(in_ptr + x) : max_val; - } +#ifdef __aarch64__ +namespace +{ +// These helper functions are added because vaddv does not exist for fp16, +// and, therefore, is not part of the wrapper::vaddv interface. +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages) +{ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a)); + for (int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + return wrapper::vgetlane(sum_res, 0); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - *out_ptr = max_val; - }, - input, output); +inline float wrapper_vaddv(const float32x4_t &a, int sum_stages) +{ + ARM_COMPUTE_UNUSED(sum_stages); + return wrapper::vaddv(a); } +} // namespace +#endif // __aarch64__ -template -void neon_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); - -template -void neon_softmax_logits_1d_float(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +// The template implementation for float data types is stored in the header file because +// we need all fp16 instantiated code to live in fp16.cpp files. +template +void neon_softmax_float(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window) { - const int start_x = in->info()->valid_region().anchor.x(); + ARM_COMPUTE_UNUSED(tmp); + const int input_width = in->info()->valid_region().shape.x(); Iterator in_it(in, window); - Iterator max_it(max, window); Iterator out_it(out, window); /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - constexpr int vec_size = 16 / sizeof(T); - const int sum_stages = log2(vec_size / 2); + constexpr int vec_size = 16 / sizeof(T); + + const int sum_stages = log2(vec_size >> 1); + + const auto beta_vec = wrapper::vdup_n(static_cast(beta), ExactTagType{}); execute_window_loop( window, [&](const Coordinates &) { /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); + const T *in_ptr = reinterpret_cast(in_it.ptr()); + T *out_ptr = reinterpret_cast(out_it.ptr()); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); + int x = 0; + + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ - T sum{}; - T sum_inversed{}; + // Compute left-over elements + for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // compute max + + T sum_transformed{}; /* Compute exponentials and sum */ { /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); /* Init sum to zero */ @@ -143,35 +138,38 @@ void neon_softmax_logits_1d_float(const ITensor *in, { auto vec_elements = 
wrapper::vloadq(in_ptr + x); vec_elements = wrapper::vsub(vec_elements, vec_max); - if (is_log) + if (IS_LOG) { - vec_elements = - wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + vec_elements = wrapper::vmul(vec_elements, beta_vec); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); } else { - vec_elements = wrapper::vexpq( - wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); + vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec)); + vec_sum = wrapper::vadd(vec_sum, vec_elements); } - wrapper::vstore(tmp_ptr + x, vec_elements); + wrapper::vstore(out_ptr + x, vec_elements); } /* Reduce sum */ + T sum{}; +#ifdef __aarch64__ + sum = wrapper_vaddv(vec_sum, sum_stages); +#else // __aarch64__ auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); for (int i = 0; i < sum_stages; ++i) { sum_res = wrapper::vpadd(sum_res, sum_res); } sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ /* Run remaining elements */ for (; x < input_width; ++x) { T element{}; - if (is_log) + if (IS_LOG) { element = (in_ptr[x] - max_val) * beta; sum += std::exp(element); @@ -181,55 +179,59 @@ void neon_softmax_logits_1d_float(const ITensor *in, element = std::exp((in_ptr[x] - max_val) * beta); sum += element; } - tmp_ptr[x] = element; + + out_ptr[x] = element; } - if (!is_log) + if (!IS_LOG) { - sum_inversed = T(1) / sum; + sum_transformed = T(1) / sum; } else { - sum = static_cast(std::log(sum)); + sum_transformed = static_cast(std::log(sum)); } - } + } // Compute exponentials and sum /* Normalize exponentials */ { + const auto sum_vec = wrapper::vdup_n(static_cast(sum_transformed), ExactTagType{}); + /* Loop over row and compute softmax */ int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); - if (is_log) + const auto vec_in = wrapper::vloadq(out_ptr + x); + if (IS_LOG) { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); + wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec)); } else { - normalized_value = - wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); + wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec)); } - wrapper::vstore(out_ptr + x, normalized_value); } + /* Run remaining elements */ for (; x < input_width; ++x) { - if (is_log) + if (IS_LOG) { - out_ptr[x] = tmp_ptr[x] - sum; + out_ptr[x] = out_ptr[x] - sum_transformed; } else { - out_ptr[x] = tmp_ptr[x] * sum_inversed; + out_ptr[x] = out_ptr[x] * sum_transformed; } } - } + } // Normalize exponentials }, - in_it, max_it, out_it); + in_it, out_it); } + +template +void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H */ +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index 40713dc496..9589ebcd7c 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
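// For reference: a scalar sketch of what the single-pass neon_softmax_float above computes per
// row, with IS_LOG as a compile-time switch and the exponentials staged directly in the output
// buffer (tmp is unused for float types). The function name is illustrative only.
#include <algorithm>
#include <cmath>
#include <cstddef>

template <typename T, bool IS_LOG>
void softmax_row_reference(const T *in, T *out, std::size_t width, float beta)
{
    const T max_val = *std::max_element(in, in + width);

    T sum = T(0);
    for (std::size_t i = 0; i < width; ++i)
    {
        const T shifted = (in[i] - max_val) * static_cast<T>(beta);
        out[i]          = IS_LOG ? shifted : static_cast<T>(std::exp(shifted));
        sum += IS_LOG ? static_cast<T>(std::exp(shifted)) : out[i];
    }

    // sum_transformed is log(sum) for log-softmax and 1/sum otherwise
    const T sum_transformed = IS_LOG ? static_cast<T>(std::log(sum)) : T(1) / sum;
    for (std::size_t i = 0; i < width; ++i)
    {
        out[i] = IS_LOG ? out[i] - sum_transformed : out[i] * sum_transformed;
    }
}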
* * SPDX-License-Identifier: MIT * @@ -29,20 +29,16 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +template +void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); + return neon_softmax_quantized(in, tmp, out, beta, window); } -void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return neon_logits_1d_max(in, out, window); -} +template void +neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 2c5e284f54..0bf6b2859a 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,20 +29,17 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +template +void neon_qasymm8_signed_softmax( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); + return neon_softmax_quantized(in, tmp, out, beta, window); } -void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return neon_logits_1d_max(in, out, window); -} +template void neon_qasymm8_signed_softmax( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void neon_qasymm8_signed_softmax( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp index 24f1bb8143..0d4b7f4509 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,9 @@ namespace arm_compute { namespace cpu { +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to +/// a single kernel that performs softmax operation. Leaving the SVE code here for +/// future references. 
Implementation for Neon(TM) is introduced in COMPMID-6500 template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) { @@ -172,25 +175,5 @@ void sve_softmax_logits_1d_float(const ITensor *in, }, in_it, max_it, out_it); } - -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); - -template void sve_softmax_logits_1d_float(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window); -template void sve_softmax_logits_1d_float(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp deleted file mode 100644 index 4be2e2eed6..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp index 98b2f5117f..a8fb1d4adf 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,6 +32,9 @@ namespace arm_compute { namespace cpu { +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to +/// a single kernel that performs softmax operation. Leaving the SVE2 code here for +/// future references. 
Implementation for Neon(TM) is introduced in COMPMID-6500 template void sve2_softmax_logits_1d_quantized( const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) @@ -205,20 +208,5 @@ void sve2_softmax_logits_1d_quantized( }, in_it, max_it, out_it); } - -template void sve2_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); -template void sve2_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp deleted file mode 100644 index c20462fcef..0000000000 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve2/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve2_qasymm8_signed_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve2_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h index 627ce0c264..c143f6659d 100644 --- a/src/cpu/kernels/softmax/list.h +++ b/src/cpu/kernels/softmax/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,41 +21,24 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H namespace arm_compute { namespace cpu { -#define DECLARE_SOFTMAX_KERNEL(func_name) \ - void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \ - bool is_log, const Window &window) +#define DECLARE_SOFTMAX_KERNEL(func_name) \ + template \ + void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax); DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax); DECLARE_SOFTMAX_KERNEL(neon_qasymm8_softmax); DECLARE_SOFTMAX_KERNEL(neon_qasymm8_signed_softmax); -DECLARE_SOFTMAX_KERNEL(sve_fp32_softmax); -DECLARE_SOFTMAX_KERNEL(sve_fp16_softmax); -DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_signed_softmax); -DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax); #undef DECLARE_SOFTMAX_KERNEL - -#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window) - -DECLARE_LOGITS_KERNEL(neon_fp32_logits); -DECLARE_LOGITS_KERNEL(neon_fp16_logits); -DECLARE_LOGITS_KERNEL(neon_qasymm8_logits); -DECLARE_LOGITS_KERNEL(neon_qasymm8_singed_logits); -DECLARE_LOGITS_KERNEL(sve_fp32_logits); -DECLARE_LOGITS_KERNEL(sve_fp16_logits); -DECLARE_LOGITS_KERNEL(sve_qasymm8_logits); -DECLARE_LOGITS_KERNEL(sve_qasymm8_signed_logits); - -#undef DECLARE_LOGITS_KERNEL } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp index e55d7f903e..ae14381ad9 100644 --- a/src/cpu/operators/CpuSoftmax.cpp +++ b/src/cpu/operators/CpuSoftmax.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -41,13 +41,10 @@ namespace arm_compute { namespace cpu { -template -CpuSoftmaxGeneric::CpuSoftmaxGeneric() +CpuSoftmaxGeneric::CpuSoftmaxGeneric() : _permute_input(), _permute_output(), - _max_kernel(), _softmax_kernel(), - _max(), _tmp(), _input_permuted(), _output_permuted(), @@ -56,8 +53,7 @@ CpuSoftmaxGeneric::CpuSoftmaxGeneric() { } -template -void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis) +void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -79,29 +75,23 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d // or it is the original input case (2D case) const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src); - // Create intermediate tensors shapes - TensorShape max_sum_shape = tmp_input->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = - is_data_type_quantized_asymmetric(tmp_input->data_type()) ? 
DataType::F32 : tmp_input->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); + TensorInfo tensor_info_tmp; + if (is_data_type_quantized_asymmetric(src->data_type())) + { + // Create intermediate tensors shapes + const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); + tensor_info_tmp = input_info.clone()->set_data_type(DataType::F32); + } // Init intermediate tensors - _max = TensorInfo(max_info); _tmp = TensorInfo(tensor_info_tmp); // Configure kernels - auto mk = std::make_unique(); - mk->configure(tmp_input, &_max); - _max_kernel = std::move(mk); - - auto sm = std::make_unique>(); + auto sm = std::make_unique(); if (_needs_permute) { // The normalization kernel stores the result in a permuted output tensor - sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); + sm->configure(tmp_input, &_output_permuted, beta, is_log, &_tmp); // Re-permute the permuted output into the requested (4D) output _permute_output.configure(&_output_permuted, dst, @@ -110,14 +100,15 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d else { // Softmax 2D case - sm->configure(tmp_input, &_max, dst, beta, &_tmp); + sm->configure(tmp_input, dst, beta, is_log, &_tmp); } _softmax_kernel = std::move(sm); - _aux_mem[InternalTensorIdx::MAX] = - MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); - _aux_mem[InternalTensorIdx::TMP] = - MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + if (_tmp.total_size() > 0) + { + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + } _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); @@ -125,8 +116,8 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d MemoryLifetime::Temporary, _output_permuted.total_size()); } -template -Status CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis) +Status +CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -136,17 +127,12 @@ Status CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensor static_cast(src->num_dimensions()) <= axis); // Create intermediate tensor info - DataType tmp_data_type = src->data_type(); - const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = src->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(src->clone() - ->set_tensor_shape(max_sum_shape) - .set_data_type(tmp_data_type) - .set_quantization_info(src->quantization_info()) - .set_is_resizable(true)); - const TensorInfo dont_care; + TensorInfo tensor_info_tmp; + + if (is_data_type_quantized_asymmetric(src->data_type())) + { + tensor_info_tmp = src->clone()->set_data_type(DataType::F32).set_is_resizable(true); + } const unsigned int actual_axis = static_cast(wrap_around(axis, static_cast(src->num_dimensions()))); @@ -165,15 +151,12 @@ Status CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, 
dst, permutation_vector)); } - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel::validate( - &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuSoftmaxKernel::validate(src, dst, beta, is_log, &tensor_info_tmp)); return Status{}; } -template -void CpuSoftmaxGeneric::run(ITensorPack &tensors) +void CpuSoftmaxGeneric::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); @@ -181,13 +164,11 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) auto dst = tensors.get_tensor(TensorType::ACL_DST); CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true); - CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true); CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true); CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true); - ITensorPack max_pack; ITensorPack softmax_pack; if (_needs_permute) @@ -195,24 +176,15 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}}; _permute_input.run(permute_in_pack); - max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}}; - softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()}, - {TensorType::ACL_SRC_1, max.get()}, {TensorType::ACL_DST_0, output_permuted.get()}, {TensorType::ACL_DST_1, tmp.get()}}; } else { - max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}}; - - softmax_pack = {{TensorType::ACL_SRC_0, src}, - {TensorType::ACL_SRC_1, max.get()}, - {TensorType::ACL_DST_0, dst}, - {TensorType::ACL_DST_1, tmp.get()}}; + softmax_pack = {{TensorType::ACL_SRC_0, src}, {TensorType::ACL_DST_0, dst}, {TensorType::ACL_DST_1, tmp.get()}}; } - NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); if (_needs_permute) @@ -224,13 +196,10 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) } } -template -experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const +experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const { return _aux_mem; } -template class CpuSoftmaxGeneric; -template class CpuSoftmaxGeneric; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h index 8cab70e14f..47020e9b7c 100644 --- a/src/cpu/operators/CpuSoftmax.h +++ b/src/cpu/operators/CpuSoftmax.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_H -#define ARM_COMPUTE_CPU_SOFTMAX_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H +#define ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/TensorInfo.h" @@ -37,9 +37,7 @@ namespace arm_compute { namespace cpu { -class CpuLogits1DMaxKernel; -template -class CpuLogits1DSoftmaxKernel; +class CpuSoftmaxKernel; /** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. 
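 * (Illustrative usage with assumed tensor-info names: with this patch the log variant is
 *  selected via the new is_log argument instead of a separate CpuLogSoftmax alias, e.g.
 *
 *      CpuSoftmaxGeneric softmax;
 *      softmax.configure(&src_info, &dst_info, 1.0f, 0, true); // beta, axis, is_log
 *
 *  The kernel-level max computation now lives inside CpuSoftmaxKernel, so the MAX auxiliary
 *  tensor is removed and TMP is only allocated for quantized inputs.)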
* @@ -52,31 +50,31 @@ class CpuLogits1DSoftmaxKernel; * This function runs the following function/kernels: * -# If axis is not 0: * -# @ref CpuPermute - * -# @ref kernels::CpuLogits1DMaxKernel - * -# @ref kernels::CpuLogits1DSoftmaxKernel + * -# @ref kernels::CpuSoftmaxKernel */ -template class CpuSoftmaxGeneric : public ICpuOperator { public: CpuSoftmaxGeneric(); /** Set the input and output tensors. * - * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * last value of each row to the nearest multiple. - * @param[out] dst Destination tensor ifo. Data types supported: same as @p input. - * @param[in] beta (Optional) A scaling factor for the exponent. - * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and + * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * last value of each row to the nearest multiple. + * @param[out] dst Destination tensor ifo. Data types supported: same as @p input. + * @param[in] beta (Optional) A scaling factor for the exponent. + * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 + * @param[in] is_log True if the operation is log-softmax */ - void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuSoftmaxGeneric::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -85,8 +83,7 @@ class CpuSoftmaxGeneric : public ICpuOperator private: enum InternalTensorIdx { - MAX = 0, - TMP, + TMP = 0, PERMUTED_SRC, PERMUTED_DST, COUNT @@ -94,10 +91,8 @@ class CpuSoftmaxGeneric : public ICpuOperator CpuPermute _permute_input; CpuPermute _permute_output; - std::unique_ptr _max_kernel; std::unique_ptr _softmax_kernel; - TensorInfo _max; TensorInfo _tmp; TensorInfo _input_permuted; TensorInfo _output_permuted; @@ -105,9 +100,7 @@ class CpuSoftmaxGeneric : public ICpuOperator bool _needs_permute; experimental::MemoryRequirements _aux_mem{}; }; -using CpuSoftmax = CpuSoftmaxGeneric; -using CpuLogSoftmax = CpuSoftmaxGeneric; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp index e4bcdc0b64..7d81aee0e9 100644 --- a/src/cpu/operators/CpuWinogradConv2d.cpp +++ b/src/cpu/operators/CpuWinogradConv2d.cpp @@ -122,13 +122,13 @@ bool get_winograd_kernel_implementation(const ITensorInfo success = arm_conv::winograd::get_implementation(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); } -#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) else if (data_type == DataType::F16) { success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), 
*conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) else { success = false; diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 82bd465c99..611bc76463 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -541,6 +541,7 @@ void Fallback::prepare(ITensorPack &tensors) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); + ARM_COMPUTE_ERROR_ON_NULLPTR(b); // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. if (c && c->info()->data_type() == DataType::S32) @@ -614,6 +615,7 @@ void Fallback::run(ITensorPack &tensors) auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto d = tensors.get_tensor(TensorType::ACL_DST); + ARM_COMPUTE_ERROR_ON_NULLPTR(a, d); int lda = a->info()->strides_in_bytes().y() / a->info()->element_size(); int ldb = 0; @@ -652,7 +654,7 @@ void Fallback::run(ITensorPack &tensors) } // Check if B is pre-tranposed and de-reference if not - if (!_gemm_kernel_asm->B_is_pretransposed()) + if (b_to_use && !_gemm_kernel_asm->B_is_pretransposed()) { ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size(); multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size(); @@ -670,7 +672,7 @@ void Fallback::run(ITensorPack &tensors) } // Pretranspose B if required - if (_B_pretranspose_required) + if (b_to_use && _B_pretranspose_required) { // Fixed format kernels need no pretranspose. ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index 9a2a4890f3..4ca4b83f9c 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -41,7 +41,6 @@ #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" - namespace arm_compute { namespace opencl @@ -101,7 +100,7 @@ Status validate_arguments(const ITensorInfo *src0, ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); // Validate the reinterpreted-as-3D-case - if (gemm_info.depth_output_gemm3d != 0) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -284,8 +283,12 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(gemm_info.depth_output_gemm3d != 0, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(src1->num_dimensions() > 2, "-DBATCHED_RHS"); + std::string kernel_name("gemm_mm_reshaped_only_rhs_nt_mmul"); - kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; + kernel_name += _export_to_cl_image ? 
"_texture" : ""; // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h index 955bb3c01a..22aa1e2034 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H -#define ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H +#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H +#define ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" #include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h" @@ -58,6 +58,7 @@ class ClGemmNativeKernelConfigurationFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -68,4 +69,4 @@ class ClGemmNativeKernelConfigurationFactory final } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /*ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h index 83928b3f4f..6327ee3027 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H -#define ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H +#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H +#define ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" #include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h" @@ -56,6 +56,7 @@ class ClGemmReshapedKernelConfigurationFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -66,4 +67,4 @@ class ClGemmReshapedKernelConfigurationFactory final } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h index e07ad993ed..1f0c5c2d87 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H -#define ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H +#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H +#define ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" #include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h" @@ -56,6 +56,7 @@ class ClGemmReshapedOnlyRhsKernelConfigurationFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -66,4 +67,4 @@ class ClGemmReshapedOnlyRhsKernelConfigurationFactory final } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index 9962ee550a..28a2aa2540 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -34,6 +34,7 @@ #include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h" #include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h" +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h" #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" using namespace arm_compute::cl_matmul; @@ -42,59 +43,6 @@ namespace arm_compute { namespace opencl { -namespace -{ -enum class MatMulKernelType -{ - /** Native matrix multiplication for FP types */ - NATIVE_FP, - - /** Native matrix multiplication for quantized types */ - NATIVE_QUANTIZED, - - /** 
Native matrix multiplication using MMUL extension for FP types */ - NATIVE_MMUL_FP, - - /** Native matrix multiplication using MMUL extension for Quantized types */ - NATIVE_MMUL_QUANTIZED -}; - -MatMulKernelType get_matmul_kernel(const ITensorInfo *lhs, - const ITensorInfo *rhs, - const MatMulInfo &matmul_info, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(lhs, rhs, matmul_info, act_info); - - const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); - const bool is_mmul_supported = arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()); - - const int k = matmul_info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); - - if (is_quantized) - { - // MMUL kernel works only when K is a multiple of 16 - if (is_mmul_supported && !act_info.enabled() && k % 16 == 0) - { - return MatMulKernelType::NATIVE_MMUL_QUANTIZED; - } - - return MatMulKernelType::NATIVE_QUANTIZED; - } - else - { - // MMUL kernel works only when K is a multiple of 4 - if (is_mmul_supported && !act_info.enabled() && k % 4 == 0) - { - return MatMulKernelType::NATIVE_MMUL_FP; - } - - return MatMulKernelType::NATIVE_FP; - } - - return is_quantized ? MatMulKernelType::NATIVE_QUANTIZED : MatMulKernelType::NATIVE_FP; -} -} // namespace using namespace arm_compute::opencl::kernels; ClMatMul::ClMatMul() @@ -119,7 +67,10 @@ Status ClMatMul::validate(const ITensorInfo *lhs, const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - switch (get_matmul_kernel(lhs, rhs, matmul_info, act_info)) + const auto kernel_selector = ClMatMulNativeKernelVariantFactory::create(gpu_target); + const MatMulKernelType kernel_type = kernel_selector->select_kernel(lhs, rhs, matmul_info, act_info); + + switch (kernel_type) { case MatMulKernelType::NATIVE_FP: return ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); @@ -151,7 +102,10 @@ void ClMatMul::configure(const CLCompileContext &compile_context, const auto kernel_config = ClMatMulNativeKernelConfigurationFactory::create(gpu_target); const MatMulKernelInfo kernel_info = kernel_config->configure(lhs, rhs, matmul_info); - switch (get_matmul_kernel(lhs, rhs, matmul_info, act_info)) + const auto kernel_selector = ClMatMulNativeKernelVariantFactory::create(gpu_target); + const MatMulKernelType kernel_type = kernel_selector->select_kernel(lhs, rhs, matmul_info, act_info); + + switch (kernel_type) { case MatMulKernelType::NATIVE_FP: { diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp index 835958b816..c9ddf9b85c 100644 --- a/src/runtime/CL/CLMemoryRegion.cpp +++ b/src/runtime/CL/CLMemoryRegion.cpp @@ -26,6 +26,8 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" + namespace arm_compute { ICLMemoryRegion::ICLMemoryRegion(size_t size) @@ -72,7 +74,14 @@ CLBufferMemoryRegion::~CLBufferMemoryRegion() // Flush the command queue to ensure all commands that may use this memory buffer are scheduled to be finished before // this buffer is freed // Do not call finish as it is a blocking call which affects the performance - CLScheduler::get().queue().flush(); + try + { + CLScheduler::get().queue().flush(); + } + catch (const std::exception &e) + { + ARM_COMPUTE_LOG_ERROR_ACL(e.what()); + } } void *CLBufferMemoryRegion::ptr() diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index 6c6daff5ba..bef8d887fd 100644 --- 
a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, 2023 Arm Limited. + * Copyright (c) 2018-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -66,7 +66,14 @@ validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, boo TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); +#pragma GCC diagnostic pop + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); @@ -202,7 +209,13 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); +#pragma GCC diagnostic pop for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h index c528dbcac4..98dd44b1bf 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CLGEMMKERNELSELECTION_H -#define SRC_CLGEMMKERNELSELECTION_H +#ifndef ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H +#define ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" @@ -53,6 +53,7 @@ class CLGEMMKernelSelectionFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -61,4 +62,4 @@ class CLGEMMKernelSelectionFactory final }; } // namespace cl_gemm } // namespace arm_compute -#endif /* SRC_CLGEMMKERNELSELECTION_H */ +#endif // ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index 47564059ec..5eea4dca65 100644 --- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. 
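The CLMemoryRegion.cpp hunk above wraps the command-queue flush in a try/catch because ~CLBufferMemoryRegion must not let an OpenCL exception escape a destructor. A minimal standalone sketch of that pattern, assuming a hypothetical QueueGuard whose flush() stands in for CLScheduler::get().queue().flush():

    #include <iostream>
    #include <stdexcept>

    struct QueueGuard
    {
        ~QueueGuard()
        {
            // Mirror the CLBufferMemoryRegion change: log and swallow any failure
            // from the flush call instead of letting it propagate out of a destructor.
            try
            {
                flush();
            }
            catch (const std::exception &e)
            {
                std::cerr << "flush failed: " << e.what() << '\n';
            }
        }

        // Stand-in for a command-queue flush that may throw an OpenCL error.
        void flush()
        {
            throw std::runtime_error("CL_OUT_OF_RESOURCES");
        }
    };

    int main()
    {
        QueueGuard guard; // destructor runs at scope exit without terminating the program
    }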
* * SPDX-License-Identifier: MIT * @@ -25,15 +25,20 @@ #include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" namespace arm_compute { +NEDepthToSpaceLayer::NEDepthToSpaceLayer() : _kernel{} +{ +} + +NEDepthToSpaceLayer::~NEDepthToSpaceLayer() = default; + void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); @@ -47,4 +52,10 @@ Status NEDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo { return NEDepthToSpaceLayerKernel::validate(input, output, block_shape); } + +void NEDepthToSpaceLayer::run() +{ + NEScheduler::get().schedule(_kernel.get(), _kernel->get_split_dimension()); +} + } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp index d37cf4a8d0..a23db87059 100644 --- a/src/runtime/NEON/functions/NEReduceMean.cpp +++ b/src/runtime/NEON/functions/NEReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023 Arm Limited. + * Copyright (c) 2018-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,7 +63,14 @@ validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, boo TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); +#pragma GCC diagnostic pop + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); @@ -168,7 +175,14 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); +#pragma GCC diagnostic pop + for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index e3c2012d05..be588c5b52 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
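The CLReduceMean.cpp and NEReduceMean.cpp hunks above wrap std::sort in the same GCC diagnostic guard. A self-contained sketch of the pattern, with a plain std::array standing in for the library's Coordinates container:

    #include <algorithm>
    #include <array>

    int main()
    {
        std::array<int, 4> axis_local{3, 1, 2, 0};
        const int reduction_ops = 3; // only the first three entries are real reduction axes

    // GCC can emit a spurious -Warray-bounds for a sort over a sub-range of a fixed-size
    // container (GCC bug 104165), so the warning is silenced around the call only.
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Warray-bounds"
        std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
    #pragma GCC diagnostic pop

        return 0;
    }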
* * SPDX-License-Identifier: MIT * @@ -29,7 +29,6 @@ #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" -#include "src/cpu/kernels/CpuSoftmaxKernel.h" #include "src/cpu/operators/CpuSoftmax.h" namespace arm_compute @@ -37,13 +36,12 @@ namespace arm_compute template struct NESoftmaxLayerGeneric::Impl { - const ITensor *src{nullptr}; - ITensor *dst{nullptr}; - Tensor max{nullptr}; - std::unique_ptr> op{nullptr}; - MemoryGroup memory_group{}; - ITensorPack run_pack{}; - WorkspaceData workspace_tensors{}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData workspace_tensors{}; }; template @@ -67,8 +65,8 @@ void NESoftmaxLayerGeneric::configure(ITensor *input, ITensor *output, f _impl->src = input; _impl->dst = output; - _impl->op = std::make_unique>(); - _impl->op->configure(input->info(), output->info(), beta, axis); + _impl->op = std::make_unique(); + _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG); _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); @@ -79,7 +77,7 @@ Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG)); return Status{}; } diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 8d77abcfc7..7334be8456 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -56,7 +56,7 @@ struct NEWinogradConvolutionLayer::Impl NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr &memory_manager) : _impl(std::make_unique()) { - _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->memory_group = MemoryGroup(memory_manager); } NEWinogradConvolutionLayer::~NEWinogradConvolutionLayer() = default; diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp index e52fb59940..3f1e96968a 100644 --- a/src/runtime/Scheduler.cpp +++ b/src/runtime/Scheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h index 2c2509f70b..215b17ef79 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
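Combined with the CpuSoftmax.h changes earlier in this patch, the NESoftmaxLayer.cpp hunk above shows that log-softmax is now selected by a runtime is_log flag instead of a template parameter. A hedged sketch of configuring the internal operator directly, based only on the signatures shown in those hunks (shapes and names are illustrative; applications would normally go through NESoftmaxLayer or NELogSoftmaxLayer):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/cpu/operators/CpuSoftmax.h"

    using namespace arm_compute;

    void configure_log_softmax_example()
    {
        // Shapes follow the Doxygen example in the CpuSoftmax.h hunk: for a 4x5x6 input
        // and axis=1, softmax is applied to 4x6 = 24 vectors of length 5.
        TensorInfo src(TensorShape(4U, 5U, 6U), 1, DataType::F32);
        TensorInfo dst(TensorShape(4U, 5U, 6U), 1, DataType::F32);

        cpu::CpuSoftmaxGeneric softmax;
        const Status st = cpu::CpuSoftmaxGeneric::validate(&src, &dst, /*beta=*/1.0f, /*axis=*/1, /*is_log=*/true);
        if (st.error_code() == ErrorCode::OK)
        {
            // Log-softmax is a runtime flag, not a separate template instantiation.
            softmax.configure(&src, &dst, 1.0f, 1, /*is_log=*/true);
        }
    }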
*/ -#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG -#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG +#ifndef ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H #include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h" #include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h" @@ -53,6 +53,7 @@ class ClDirectConvKernelConfigurationFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -61,4 +62,4 @@ class ClDirectConvKernelConfigurationFactory final }; } // namespace cl_direct_conv } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h index 49ce6ff479..031cf1859a 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG -#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG +#ifndef ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" @@ -54,6 +54,7 @@ class ClDWCNativeKernelConfigurationFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -62,4 +63,4 @@ class ClDWCNativeKernelConfigurationFactory final }; } // namespace cl_dwc } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h index dd614e1f68..5e7ba6f8e9 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG -#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG +#ifndef ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H #include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h" #include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" @@ -50,6 +50,7 @@ class ClIndirectConvKernelConfigurationFactory final case GPUTarget::MIDGARD: case GPUTarget::BIFROST: case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -58,4 +59,4 @@ class ClIndirectConvKernelConfigurationFactory final }; } // namespace cl_indirect_conv } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp index 4b923547c4..3a02a60650 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp @@ -62,6 +62,7 @@ ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITen switch (_target) { case GPUTarget::G715: + case GPUTarget::G615: func = configs_G715.get_function(lhs->data_type()); break; case GPUTarget::G710: diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp new file mode 100644 index 0000000000..3878f698fd --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +ClMatMulNativeDefaultVariantValhall::ClMatMulNativeDefaultVariantValhall(GPUTarget gpu) + : IClMatMulNativeKernelVariant(gpu) +{ +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(rhs); + + using VariantFunctionExecutorPtr = + MatMulKernelType (ClMatMulNativeDefaultVariantValhall::*)(int k, bool act_enabled); + + ClMatMulNativeVariantArray configs_G715( + &ClMatMulNativeDefaultVariantValhall::configure_G715_float, + &ClMatMulNativeDefaultVariantValhall::configure_G715_quantized); + + ClMatMulNativeVariantArray configs_default( + &ClMatMulNativeDefaultVariantValhall::configure_default_float, + &ClMatMulNativeDefaultVariantValhall::configure_default_quantized); + + VariantFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G715: + case GPUTarget::G615: + func = configs_G715.get_function(lhs->data_type()); + break; + default: + func = configs_default.get_function(lhs->data_type()); + break; + } + + const int k = info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); + const bool act_enabled = act_info.enabled(); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native"); + return (this->*func)(k, act_enabled); +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_float(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 4 + if (!act_enabled && k % 4 == 0) + { + return MatMulKernelType::NATIVE_MMUL_FP; + } + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_quantized(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 16 + if (!act_enabled && k % 16 == 0) + { + return MatMulKernelType::NATIVE_MMUL_QUANTIZED; + } + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_float(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_quantized(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h new file mode 100644 index 0000000000..a202676e98 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H + +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +/** Valhall based OpenCL matmul configuration */ +class ClMatMulNativeDefaultVariantValhall final : public IClMatMulNativeKernelVariant +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClMatMulNativeDefaultVariantValhall(GPUTarget gpu); + + // Inherited overridden method + MatMulKernelType select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) override; + +private: + MatMulKernelType configure_G715_float(int k, bool act_enabled); + MatMulKernelType configure_G715_quantized(int k, bool act_enabled); + MatMulKernelType configure_default_float(int k, bool act_enabled); + MatMulKernelType configure_default_quantized(int k, bool act_enabled); +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h index a114fffa68..699f5fe8c1 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS -#define SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H #include "arm_compute/core/Types.h" @@ -80,4 +80,4 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, unsigned int b); } // namespace cl_matmul } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h index b10018a6d2..e7485bca81 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG -#define SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H #include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" @@ -50,6 +50,7 @@ class ClMatMulNativeKernelConfigurationFactory final case GPUTarget::MIDGARD: case GPUTarget::BIFROST: case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -58,4 +59,4 @@ class ClMatMulNativeKernelConfigurationFactory final }; } // namespace cl_matmul } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h new file mode 100644 index 0000000000..c2895b8919 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H + +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h" +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h" + +#include + +namespace arm_compute +{ +namespace cl_matmul +{ + +/** ClMatMul variant factory class */ +class ClMatMulNativeKernelVariantFactory final +{ +public: + /** Static method to call the ClMatMul configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClMatMulNativeKernelVariant + */ + static std::unique_ptr create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + case GPUTarget::BIFROST: + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h index b9b091100c..00ba3641d5 100644 --- a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h +++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG -#define SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" @@ -85,7 +85,9 @@ class ClMatMulNativeConfigArray std::array _configs; }; -/** Basic interface for the matmul native kernel configuration */ +/** Basic interface for the matmul native kernel configuration + * This is the base class that chooses architecture specific kernel configurations. +*/ class IClMatMulNativeKernelConfig { public: @@ -112,4 +114,4 @@ class IClMatMulNativeKernelConfig }; } // namespace cl_matmul } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h new file mode 100644 index 0000000000..eac41dd6a3 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
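The variant factory and the Valhall heuristic added above replace the file-local get_matmul_kernel() removed from ClMatMul.cpp earlier in this patch. A hedged sketch of driving the selector directly; pick_matmul_kernel is a hypothetical helper, not library code, and the types it uses come transitively from the new ClMatMulNativeKernelVariant.h header shown above:

    #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h"

    using namespace arm_compute;
    using namespace arm_compute::cl_matmul;

    MatMulKernelType pick_matmul_kernel(const ITensorInfo &lhs, const ITensorInfo &rhs, GPUTarget target)
    {
        const MatMulInfo          matmul_info{}; // no LHS/RHS transpose
        const ActivationLayerInfo act_info{};    // no fused activation, so the MMUL variants stay eligible

        // The factory falls back to the Valhall heuristic for the supported architectures.
        const auto selector = ClMatMulNativeKernelVariantFactory::create(target);
        return selector->select_kernel(&lhs, &rhs, matmul_info, act_info);
    }

    // With the G715/G615 heuristic above, an F16 LHS whose K dimension is 64 (a multiple of 4)
    // and no activation maps to MatMulKernelType::NATIVE_MMUL_FP; K = 30 falls back to NATIVE_FP.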
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H + +#include "arm_compute/core/CoreTypes.h" // DataType +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/function_info/MatMulInfo.h" + +#include "src/core/common/Macros.h" + +#include + +namespace arm_compute +{ +namespace cl_matmul +{ +enum class MatMulKernelType +{ + /** Native matrix multiplication for FP types */ + NATIVE_FP, + + /** Native matrix multiplication for quantized types */ + NATIVE_QUANTIZED, + + /** Native matrix multiplication using MMUL extension for FP types */ + NATIVE_MMUL_FP, + + /** Native matrix multiplication using MMUL extension for Quantized types */ + NATIVE_MMUL_QUANTIZED +}; + +/** Basic container for the OpenCL MatMul Native variant functions */ +template +class ClMatMulNativeVariantArray +{ +public: + /** Alias for Float index */ + static constexpr size_t DT_FLOAT = 0; + /** Alias for Quantized type index */ + static constexpr size_t DT_QUANTIZED = 1; + + /** Constructor + * + * @param[in] func_float Function to call for matmul native float (F32, F16) + * @param[in] func_quantized Function to call for matmul native quantized (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClMatMulNativeVariantArray(T func_float, T func_quantized) : _configs{func_float, func_quantized} + { + } + + /** Method to return the matmul native variant function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + case DataType::F16: + return _configs.at(DT_FLOAT); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_QUANTIZED); + default: + return nullptr; + } + } + +private: + std::array _configs; +}; + +/** Basic interface for the matmul native kernel variant + * This is the base class that chooses architecture specific kernel variants. 
+*/ +class IClMatMulNativeKernelVariant +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClMatMulNativeKernelVariant(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelVariant); + /** Virtual destructor */ + virtual ~IClMatMulNativeKernelVariant() = default; + /** This method returns the @ref MatMulKernelType for the given inputs + * + * @param[in] lhs LHS tensor + * @param[in] rhs RHS tensor + * @param[in] info MatMul info + * @param[in] act_info Activation layer info + */ + virtual MatMulKernelType select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h index 4d394889c3..accbb643c2 100644 --- a/support/ToolchainSupport.h +++ b/support/ToolchainSupport.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_SUPPORT_TOOLCHAINSUPPORT -#define ARM_COMPUTE_SUPPORT_TOOLCHAINSUPPORT +#ifndef ACL_SUPPORT_TOOLCHAINSUPPORT_H +#define ACL_SUPPORT_TOOLCHAINSUPPORT_H #include "support/Bfloat16.h" #include "support/Half.h" @@ -184,8 +184,7 @@ inline T nearbyint(T value) template ::value>::type> inline T round(T value) { - //Workaround Valgrind's mismatches: when running from Valgrind the call to std::round(-4.500000) == -4.000000 instead of 5.00000 - return (value < 0.f) ? static_cast(value - 0.5f) : static_cast(value + 0.5f); + return std::round(value); } /** Round floating-point value with half value rounding away from zero and cast to long @@ -331,4 +330,4 @@ inline bool signbit(bfloat16 value) } // namespace cpp11 } // namespace support } // namespace arm_compute -#endif /* ARM_COMPUTE_SUPPORT_TOOLCHAINSUPPORT */ +#endif // ACL_SUPPORT_TOOLCHAINSUPPORT_H diff --git a/tests/SConscript b/tests/SConscript index 21904899c0..305f1693d1 100644 --- a/tests/SConscript +++ b/tests/SConscript @@ -83,17 +83,12 @@ if 'macos' in test_env['os']: if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']: Import("arm_compute_a") - Import("arm_compute_core_a") Import("arm_compute_graph_a") - if env['os']=='windows': - test_env.Append(LIBS = [arm_compute_graph_a, arm_compute_a]) - else: - test_env.Append(LIBS = [arm_compute_graph_a, arm_compute_a, arm_compute_core_a]) + test_env.Append(LIBS = [arm_compute_graph_a, arm_compute_a]) arm_compute_lib = arm_compute_graph_a else: Import("arm_compute_graph_so") - Import("arm_compute_core_a") - test_env.Append(LIBS = ["arm_compute_graph", "arm_compute", "arm_compute_core"]) + test_env.Append(LIBS = ["arm_compute_graph", "arm_compute"]) arm_compute_lib = arm_compute_graph_so if env['os'] in ['bare_metal']: @@ -189,9 +184,8 @@ if env['fixed_format_kernels'] and test_env['validation_tests']: test_env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS']) if test_env['validation_tests']: - arm_compute_validation_framework = env.StaticLibrary('arm_compute_validation_framework', Glob('validation/reference/*.cpp') + Glob('validation/*.cpp'), LINKFLAGS=test_env['LINKFLAGS'], CXXFLAGS=test_env['CXXFLAGS'], LIBS= [ arm_compute_test_framework, arm_compute_core_a]) + arm_compute_validation_framework = env.StaticLibrary('arm_compute_validation_framework', Glob('validation/reference/*.cpp') + 
Glob('validation/*.cpp'), LINKFLAGS=test_env['LINKFLAGS'], CXXFLAGS=test_env['CXXFLAGS'], LIBS= [ arm_compute_test_framework ]) Depends(arm_compute_validation_framework , arm_compute_test_framework) - Depends(arm_compute_validation_framework , arm_compute_core_a) program_objects = files_validation + common_objects if test_env['os'] == 'bare_metal': diff --git a/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h b/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h index 8c90efcbdd..b0ad4879ba 100644 --- a/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h +++ b/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_GEMMLOWPOUTPUT_DATASET -#define ARM_COMPUTE_TEST_GEMMLOWPOUTPUT_DATASET +#ifndef ACL_TESTS_DATASETS_GEMMLOWPFUSEDOFFSETOUTPUTDATASET_H +#define ACL_TESTS_DATASETS_GEMMLOWPFUSEDOFFSETOUTPUTDATASET_H #include "utils/TypePrinter.h" @@ -40,21 +40,17 @@ namespace datasets class GEMMLowpFusedOffsetOutputDataset { public: - using type = std::tuple; + using type = std::tuple; struct iterator { iterator(std::vector::const_iterator a_it, std::vector::const_iterator b_it, std::vector::const_iterator c_it, - std::vector::const_iterator a_offset_it, - std::vector::const_iterator b_offset_it, - std::vector::const_iterator output_stage_it) + std::vector::const_iterator output_stage_it) : _a_it{ std::move(a_it) }, _b_it{ std::move(b_it) }, _c_it{ std::move(c_it) }, - _a_offset_it{ std::move(a_offset_it) }, - _b_offset_it{ std::move(b_offset_it) }, _output_stage_it{ std::move(output_stage_it) } { } @@ -65,33 +61,14 @@ class GEMMLowpFusedOffsetOutputDataset description << "A=" << *_a_it << ":"; description << "B=" << *_b_it << ":"; description << "C=" << *_c_it << ":"; - description << "a_offset=" << *_a_offset_it << ":"; - description << "b_offset=" << *_b_offset_it << ":"; - description << "output_type=" << string_from_gemmlowp_output_stage((*_output_stage_it).type) << ":"; - description << "output_offset=" << (*_output_stage_it).gemmlowp_offset << ":"; - description << "output_multiplier={"; - for(auto it = (*_output_stage_it).gemmlowp_multipliers.begin(); it != (*_output_stage_it).gemmlowp_multipliers.end(); ++it) - { - description << (*it) << ", "; - } - description << "}:"; - description << "output_shift={"; - - for(auto it = (*_output_stage_it).gemmlowp_shifts.begin(); it != (*_output_stage_it).gemmlowp_shifts.end(); ++it) - { - description << (*it) << ", "; - } - description << "}:"; - description << "output_min=" << (*_output_stage_it).gemmlowp_min_bound << ":"; - description << "output_max=" << (*_output_stage_it).gemmlowp_max_bound << ":"; - description << "is_quantized_per_channel=" << (*_output_stage_it).is_quantized_per_channel << ":"; + description << "output_type=" << string_from_gemmlowp_output_stage(*_output_stage_it) << ":"; return description.str(); } GEMMLowpFusedOffsetOutputDataset::type operator*() const { - return std::make_tuple(*_a_it, *_b_it, *_c_it, *_a_offset_it, *_b_offset_it, *_output_stage_it); + return std::make_tuple(*_a_it, *_b_it, *_c_it, *_output_stage_it); } iterator &operator++() @@ -99,8 +76,6 @@ class GEMMLowpFusedOffsetOutputDataset ++_a_it; ++_b_it; ++_c_it; - ++_a_offset_it; - ++_b_offset_it; ++_output_stage_it; return *this; @@ -110,45 +85,27 @@ class GEMMLowpFusedOffsetOutputDataset 
std::vector::const_iterator _a_it; std::vector::const_iterator _b_it; std::vector::const_iterator _c_it; - std::vector::const_iterator _a_offset_it; - std::vector::const_iterator _b_offset_it; - std::vector::const_iterator _output_stage_it; + std::vector::const_iterator _output_stage_it; }; iterator begin() const { - return iterator(_a_shapes.begin(), _b_shapes.begin(), _c_shapes.begin(), _a_offset.begin(), _b_offset.begin(), _output_stage.begin()); + return iterator(_a_shapes.begin(), _b_shapes.begin(), _c_shapes.begin(), _output_stage.begin()); } int size() const { - return std::min(_a_shapes.size(), std::min(_b_shapes.size(), std::min(_c_shapes.size(), std::min(_a_offset.size(), std::min(_b_offset.size(), _output_stage.size()))))); + return std::min(_a_shapes.size(), std::min(_b_shapes.size(), std::min(_c_shapes.size(), _output_stage.size()))); } - void add_config(TensorShape a, TensorShape b, TensorShape c, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) + void add_config(TensorShape a, TensorShape b, TensorShape c, GEMMLowpOutputStageType output_stage) { _a_shapes.emplace_back(std::move(a)); _b_shapes.emplace_back(std::move(b)); _c_shapes.emplace_back(std::move(c)); - _a_offset.emplace_back(std::move(a_offset)); - _b_offset.emplace_back(std::move(b_offset)); _output_stage.emplace_back(std::move(output_stage)); } - GEMMLowpOutputStageInfo OutputStageInfo(GEMMLowpOutputStageType type, int32_t offset, int32_t multiplier, int32_t shift, int32_t min, int32_t max) - { - GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(); - output_stage.type = type; - output_stage.gemmlowp_offset = offset; - output_stage.gemmlowp_multiplier = multiplier; - output_stage.gemmlowp_shift = shift; - output_stage.gemmlowp_min_bound = min; - output_stage.gemmlowp_max_bound = max; - output_stage.gemmlowp_multipliers.push_back(multiplier); - output_stage.gemmlowp_shifts.push_back(shift); - return output_stage; - } - protected: GEMMLowpFusedOffsetOutputDataset() = default; GEMMLowpFusedOffsetOutputDataset(GEMMLowpFusedOffsetOutputDataset &&) = default; @@ -157,9 +114,7 @@ class GEMMLowpFusedOffsetOutputDataset std::vector _a_shapes{}; std::vector _b_shapes{}; std::vector _c_shapes{}; - std::vector _a_offset{}; - std::vector _b_offset{}; - std::vector _output_stage{}; + std::vector _output_stage{}; }; class SmallGEMMLowpFusedOffsetOutputUint8Dataset final : public GEMMLowpFusedOffsetOutputDataset @@ -167,47 +122,28 @@ class SmallGEMMLowpFusedOffsetOutputUint8Dataset final : public GEMMLowpFusedOff public: SmallGEMMLowpFusedOffsetOutputUint8Dataset() { - add_config(TensorShape(21U, 13U), TensorShape(1U, 21U), TensorShape(1U, 13U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 2, 13, 10, 210)); - add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 18, 23, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 200, 2, 13, 10, 210)); - add_config(TensorShape(32U, 72U), TensorShape(16U, 32U), TensorShape(16U, 72U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - - add_config(TensorShape(21U, 1U), TensorShape(43U, 21U), TensorShape(43U, 1U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601600, 10, 10, 210)); - add_config(TensorShape(31U, 3U), 
TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 254601600, 10, 10, 210)); - add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 2, 254601602, 10, 10, 210)); - add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601602, 10, 10, 210)); + add_config(TensorShape(21U, 13U), TensorShape(1U, 21U), TensorShape(1U, 13U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(32U, 72U), TensorShape(16U, 32U), TensorShape(16U, 72U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(21U, 1U), TensorShape(43U, 21U), TensorShape(43U, 1U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); } }; -class SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned final : public GEMMLowpFusedOffsetOutputDataset +class SmallGEMMLowpFusedBatchedMatMulDataset final : public GEMMLowpFusedOffsetOutputDataset { public: - SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned() + SmallGEMMLowpFusedBatchedMatMulDataset() { - add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 5, 1 << 25, 5, 0, 254)); - add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 1 << 25, 3, 0, 254)); - add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U, 15U), -3, 15, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 1 << 19, 0, 20, 210)); - add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U, 17U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -30, 2, 1 << 25, 14, 210)); - add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U, 4U, 3U), -5, 12, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -20, 1 << 25, 4, 0, 127)); - add_config(TensorShape(15U, 7U, 3U), TensorShape(29U, 15U, 3U), TensorShape(29U, 7U, 3U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -10, 1 << 25, 6, 10, 210)); - add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U, 17U, 32U), -3, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 3, 10, 210)); - add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U, 256U, 32U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 6, 50, 225)); - } -}; - -class SmallGEMMLowpFusedBatchedMatMulDatasetSigned final : public GEMMLowpFusedOffsetOutputDataset -{ -public: - SmallGEMMLowpFusedBatchedMatMulDatasetSigned() - { - add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, 
OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 5, 1 << 25, 5, -128, 127)); - add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 1 << 25, 3, -128, 127)); - add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U, 15U), -3, 15, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 1 << 19, 0, -108, 127)); - add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U, 17U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -30, 2, 1 << 25, -98, 107)); - add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U, 4U, 3U), -5, 12, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -20, 1 << 25, 4, -127, 64)); - add_config(TensorShape(15U, 7U, 3U), TensorShape(29U, 15U, 3U), TensorShape(29U, 7U, 3U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -10, 1 << 25, 6, -64, 127)); - add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U, 17U, 32U), 3, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 6, -127, 110)); - add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U, 256U, 32U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 6, -77, 115)); + add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U, 15U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U, 17U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U, 4U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(15U, 7U, 3U), TensorShape(29U, 15U, 3U), TensorShape(29U, 7U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U, 17U, 32U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U, 256U, 32U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); } }; @@ -216,14 +152,12 @@ class SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset final : public GEMMLowp public: SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset() { - add_config(TensorShape(21U, 1421U, 33U), TensorShape(34U, 21U), TensorShape(34U, 7U, 203U, 33U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - add_config(TensorShape(31U, 102U, 55U), TensorShape(23U, 31U), TensorShape(23U, 1U, 102U, 55U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 2, 13, 10, 210)); - add_config(TensorShape(38U, 1200U, 77U), TensorShape(21U, 38U), TensorShape(21U, 4U, 300U, 77U), 18, 23, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 200, 2, 13, 10, 210)); - add_config(TensorShape(32U, 103U, 99U), TensorShape(17U, 32U), TensorShape(17U, 1U, 103U, 99U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - add_config(TensorShape(16U, 1600U, 111U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 111U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601600, 10, 10, - 
210)); - add_config(TensorShape(16U, 1600U, 113U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 113U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 254601600, 10, 10, - 210)); + add_config(TensorShape(21U, 1421U, 33U), TensorShape(34U, 21U), TensorShape(34U, 7U, 203U, 33U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(31U, 102U, 55U), TensorShape(23U, 31U), TensorShape(23U, 1U, 102U, 55U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(38U, 1200U, 77U), TensorShape(21U, 38U), TensorShape(21U, 4U, 300U, 77U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(32U, 103U, 99U), TensorShape(17U, 32U), TensorShape(17U, 1U, 103U, 99U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(16U, 1600U, 111U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 111U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(16U, 1600U, 113U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 113U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); } }; @@ -232,14 +166,12 @@ class SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset final : public GEM public: SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset() { - add_config(TensorShape(21U, 7U, 203U, 33U), TensorShape(34U, 21U), TensorShape(34U, 7U, 203U, 33U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - add_config(TensorShape(31U, 1U, 102U, 55U), TensorShape(23U, 31U), TensorShape(23U, 1U, 102U, 55U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 2, 13, 10, 210)); - add_config(TensorShape(38U, 4U, 300U, 77U), TensorShape(21U, 38U), TensorShape(21U, 4U, 300U, 77U), 18, 23, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 200, 2, 13, 10, 210)); - add_config(TensorShape(32U, 1U, 103U, 99U), TensorShape(17U, 32U), TensorShape(17U, 1U, 103U, 99U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - add_config(TensorShape(16U, 8U, 200U, 111U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 111U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601600, 10, 10, - 210)); - add_config(TensorShape(16U, 8U, 200U, 113U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 113U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 254601600, 10, 10, - 210)); + add_config(TensorShape(21U, 7U, 203U, 33U), TensorShape(34U, 21U), TensorShape(34U, 7U, 203U, 33U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(31U, 1U, 102U, 55U), TensorShape(23U, 31U), TensorShape(23U, 1U, 102U, 55U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(38U, 4U, 300U, 77U), TensorShape(21U, 38U), TensorShape(21U, 4U, 300U, 77U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(32U, 1U, 103U, 99U), TensorShape(17U, 32U), TensorShape(17U, 1U, 103U, 99U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(16U, 8U, 200U, 111U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 111U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(16U, 8U, 200U, 113U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 113U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); } }; @@ -248,28 +180,14 @@ class SmallGEMMLowpFusedOffsetOutputInt8Dataset final : public GEMMLowpFusedOffs public: 
SmallGEMMLowpFusedOffsetOutputInt8Dataset() { - add_config(TensorShape(21U, 1U), TensorShape(1U, 21U), TensorShape(1U, 1U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -50, 2, 13, -10, 110)); - add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, -10, 110)); - add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, -10, 110)); - add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -40, 2, 13, -10, 110)); - - add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601600, 10, -10, 110)); - add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 1, 254601600, 10, -10, 110)); - add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601602, 10, -10, 110)); - add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601602, 10, -10, 110)); - } -}; - -class SmallGEMMLowpFusedOffsetOutputPerChannelDataset final : public GEMMLowpFusedOffsetOutputDataset -{ -public: - SmallGEMMLowpFusedOffsetOutputPerChannelDataset() - { - add_config(TensorShape(21U, 1U, 6U), TensorShape(43U, 21U, 6U), TensorShape(43U, 1U, 6U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -200, 2, 13, 10, 210)); - add_config(TensorShape(21U, 13U, 3U), TensorShape(33U, 21U, 3U), TensorShape(33U, 13U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -100, 2, 13, 10, 210)); - add_config(TensorShape(31U, 3U, 2U), TensorShape(72U, 31U, 2U), TensorShape(72U, 3U, 2U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, 10, 210)); - add_config(TensorShape(52U, 13U, 7U), TensorShape(33U, 52U, 7U), TensorShape(33U, 13U, 7U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 100, 2, 13, 10, 210)); - add_config(TensorShape(52U, 26U, 8U), TensorShape(33U, 52U, 8U), TensorShape(33U, 26U, 8U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, 10, 210)); + add_config(TensorShape(21U, 1U), TensorShape(1U, 21U), TensorShape(1U, 1U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), 
GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
    }
};
@@ -278,15 +196,12 @@ class LargeGEMMLowpFusedOffsetOutputUint8Dataset final : public GEMMLowpFusedOff
 public:
    LargeGEMMLowpFusedOffsetOutputUint8Dataset()
    {
-        add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 18, 10, 210));
-        add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 2, 18, 10, 210));
-        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 200, 2, 18, 10, 210));
-        add_config(TensorShape(941U, 1011U), TensorShape(623U, 941U), TensorShape(623U, 1011U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 18, 10, 210));
+        add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(941U, 1011U), TensorShape(623U, 941U), TensorShape(623U, 1011U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
-        add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601600, 15, 10, 210));
-        add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 1, 254601600, 15, 10, 210));
-        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601602, 15, 10, 210));
-        add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601602, 15, 10, 210));
    }
};
@@ -295,18 +210,17 @@ class LargeGEMMLowpFusedOffsetOutputInt8Dataset final : public GEMMLowpFusedOffs
 public:
    LargeGEMMLowpFusedOffsetOutputInt8Dataset()
    {
-        add_config(TensorShape(923U, 1U, 15U), TensorShape(871U, 923U, 15U), TensorShape(871U, 1U, 15U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -50, 2, 18, -10, 110));
-        add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), -1, 3, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 18, -10, 110));
-        add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 18, -10, 110));
-        add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -50, 2, 18, -10, 110));
-
-        add_config(TensorShape(923U, 1U), TensorShape(871U, 923U), TensorShape(871U, 1U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601600, 15, -10, 110));
-        add_config(TensorShape(873U, 7U), TensorShape(784U, 873U),
TensorShape(784U, 7U), -1, 3, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 254601600, 15, -10, 110)); - add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 2, 254601602, 15, -10, 110)); - add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601602, 15, -10, 110)); + add_config(TensorShape(923U, 1U, 15U), TensorShape(871U, 923U, 15U), TensorShape(871U, 1U, 15U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(923U, 1U), TensorShape(871U, 923U), TensorShape(871U, 1U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); } }; } // namespace datasets } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_GEMMLOWPOUTPUT_DATASET */ +#endif // ACL_TESTS_DATASETS_GEMMLOWPFUSEDOFFSETOUTPUTDATASET_H diff --git a/tests/validate_examples/RunExample.cpp b/tests/validate_examples/RunExample.cpp index 5066e9663d..36bf587551 100644 --- a/tests/validate_examples/RunExample.cpp +++ b/tests/validate_examples/RunExample.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ #include "ValidateExample.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/Scheduler.h" +#include "arm_compute/core/Version.h" #include "tests/AssetsLibrary.h" #include "tests/Globals.h" #include "tests/framework/Framework.h" diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp index 0b057b9dce..1ae9e96626 100644 --- a/tests/validation/CL/GEMMLowp.cpp +++ b/tests/validation/CL/GEMMLowp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -44,6 +44,9 @@ namespace test { namespace validation { + +using framework::dataset::make; + namespace { constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ @@ -72,9 +75,9 @@ using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned = TEST_SUITE(BatchedMatMul) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("bool", { false }))) + combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { false }))) { validate(CLAccessor(_target), _reference, tolerance_quant); } @@ -84,9 +87,9 @@ using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture; TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetSigned(), - framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })), - framework::dataset::make("bool", { false }))) + combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), + make("DataType", { DataType::QASYMM8_SIGNED }), + make("reshape_b_only_on_first_run", { false }))) { validate(CLAccessor(_target), _reference, tolerance_quant); } @@ -96,9 +99,10 @@ TEST_SUITE_END() // BatchedMatMul TEST_SUITE(FusedOffsetOutput) TEST_SUITE(QASYMM8) using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("reshape_b_only_on_first_run", { true, false }))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::ALL, + combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); @@ -108,9 +112,9 @@ TEST_SUITE(Output3D) using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputOutput3DUint8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture; FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputOutput3DUint8Fixture, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("reshape_b_only_on_first_run", { true, false }))) + combine(datasets::SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); @@ -121,18 +125,19 @@ TEST_SUITE(InputOutput3D) using 
CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInputOutput3DUint8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture; FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInputOutput3DUint8Fixture, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("reshape_b_only_on_first_run", { true, false }))) + combine(datasets::SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); } TEST_SUITE_END() // InputOutput3D -FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("reshape_b_only_on_first_run", { true, false }))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::NIGHTLY, + combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); @@ -141,8 +146,9 @@ TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputInt8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED }))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture, framework::DatasetMode::ALL, + combine(datasets::SmallGEMMLowpFusedOffsetOutputInt8Dataset(), + make("DataType", { DataType::QASYMM8_SIGNED }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); @@ -185,24 +191,24 @@ TEST_SUITE(QuantizeDownInt32Scale) TEST_SUITE(QASYMM8) -const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, - 3) - * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true }); +const auto quantize_down_int32_to_uint8_scale_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2) * make("result_shift", 2, 3) + * make("min", 0) * make("max", 255) * make("addBias", { false, true }); -const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, - 2) - * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 173) * framework::dataset::make("addBias", { false, true }); +const auto quantize_down_int32_to_uint8_scale_relu_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2) + * make("result_shift", 2, 3) * make("min", 0, 2) * make("max", 171, 173) * make("addBias", { false, true }); using 
CLGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases)) { // Validate output validate(CLAccessor(_target), _reference); } TEST_SUITE(BoundedReLu) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) { // Validate output validate(CLAccessor(_target), _reference); @@ -213,24 +219,24 @@ TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) -const auto quantize_down_int32_to_int8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, - 3) - * framework::dataset::make("min", -128) * framework::dataset::make("max", 127) * framework::dataset::make("addBias", { false, true }); +const auto quantize_down_int32_to_int8_scale_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2) * make("result_shift", 2, 3) + * make("min", -128) * make("max", 127) * make("addBias", { false, true }); -const auto quantize_down_int32_to_int8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, - 2) - * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", -100, -98) * framework::dataset::make("max", 71, 73) * framework::dataset::make("addBias", { false, true }); +const auto quantize_down_int32_to_int8_scale_relu_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2) + * make("result_shift", 2, 3) * make("min", -100, -98) * make("max", 71, 73) * make("addBias", { false, true }); using CLGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToInt8ScaleValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_cases)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_cases)) { // Validate output validate(CLAccessor(_target), _reference); } TEST_SUITE(BoundedReLu) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_relu_cases)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_relu_cases)) { // Validate output validate(CLAccessor(_target), _reference); @@ -247,13 +253,14 @@ using CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture = GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture; FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(framework::dataset::make("DataType", DataType::QASYMM8), - 
datasets::TinyShapes()), - framework::dataset::make("result_real_multiplier", 0.33f)), - framework::dataset::make("result_offset", 2, 3)), - framework::dataset::make("min", 0)), - framework::dataset::make("max", 255)), - framework::dataset::make("addBias", { false, true }))) + combine( + make("DataType", DataType::QASYMM8), + datasets::TinyShapes(), + make("result_real_multiplier", 0.33f), + make("result_offset", 2, 3), + make("min", 0), + make("max", 255), + make("addBias", { false, true }))) { // Validate output validate(CLAccessor(_target), _reference); @@ -264,13 +271,14 @@ TEST_SUITE(QASYMM8_SIGNED) using CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture_Signed = GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture; FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture_Signed, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), - datasets::TinyShapes()), - framework::dataset::make("result_real_multiplier", 0.33f)), - framework::dataset::make("result_offset", 2, 3)), - framework::dataset::make("min", -128)), - framework::dataset::make("max", 127)), - framework::dataset::make("addBias", { false, true }))) + combine( + make("DataType", DataType::QASYMM8_SIGNED), + datasets::TinyShapes(), + make("result_real_multiplier", 0.33f), + make("result_offset", 2, 3), + make("min", -128), + make("max", 127), + make("addBias", { false, true }))) { // Validate output validate(CLAccessor(_target), _reference); diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index 46058bd148..9c4d1741eb 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,9 +50,12 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(GEMMLowp) TEST_SUITE(MatrixMultiplyCore) + using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture; using NEGEMMLowpBatchedMatMulFixture = GEMMLowpMatrixMultiplyCoreValidationFixture; +using framework::dataset::make; + DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()), shape_a, shape_b, shape_c, a_offset, b_offset) { @@ -80,26 +83,26 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::c // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( - framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4 +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4 TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Mismatching data type TensorInfo(TensorShape(20U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), }), - framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), + make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), - })), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), + }), + make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), TensorInfo(TensorShape(8U, 11U), 1, DataType::S32), TensorInfo(TensorShape(64U, 32U), 1, DataType::S32), - })), - framework::dataset::make("Expected", { true, false, false, false, true })), + }), + make("Expected", { true, false, false, false, true })), a_info, b_info, output_info, expected) { // Lock tensors @@ -231,9 +234,9 @@ using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned = TEST_SUITE(BatchedMatMul) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("bool", { false }))) + combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { false }))) { validate(Accessor(_target), _reference, tolerance_batched); } @@ -243,9 +246,9 @@ using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture; 
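A recurring pattern in these test changes is that nested dataset products such as combine(combine(combine(A, B), C), D) are collapsed into a single variadic combine(A, B, C, D), with framework::dataset::make shortened to make through a using-declaration. Both spellings enumerate the same cartesian product of test parameters. The standalone sketch below is only an illustration of that equivalence, not the library's dataset implementation: the helper names are hypothetical and plain std::vector values stand in for the framework's dataset types, with the variadic form defined as a left fold over a binary product.

    #include <iostream>
    #include <string>
    #include <vector>

    // Binary product of two parameter lists: every value of 'a' paired with every value of 'b'.
    std::vector<std::string> combine(const std::vector<std::string> &a, const std::vector<std::string> &b)
    {
        std::vector<std::string> out;
        for (const auto &x : a)
        {
            for (const auto &y : b)
            {
                out.push_back(x + ", " + y);
            }
        }
        return out;
    }

    // Variadic form: combine(a, b, c, ...) behaves like combine(combine(a, b), c, ...),
    // i.e. the flattened call is the nested call folded from the left.
    template <typename... Rest>
    std::vector<std::string> combine(const std::vector<std::string> &a, const std::vector<std::string> &b, const Rest &...rest)
    {
        return combine(combine(a, b), rest...);
    }

    int main()
    {
        const std::vector<std::string> shapes     = { "TensorShape(21U, 13U)", "TensorShape(31U, 3U)" };
        const std::vector<std::string> data_types = { "QASYMM8" };
        const std::vector<std::string> reshape_b  = { "true", "false" };

        // Prints the 2 x 1 x 2 = 4 parameter combinations a fixture test case would receive.
        for (const auto &config : combine(shapes, data_types, reshape_b))
        {
            std::cout << config << "\n";
        }
        return 0;
    }

This only models the combinatorial behaviour; the framework's combine additionally carries named values and dataset types rather than strings.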
TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetSigned(), - framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })), - framework::dataset::make("bool", { false }))) + combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), + make("DataType", { DataType::QASYMM8_SIGNED }), + make("reshape_b_only_on_first_run", { false }))) { validate(Accessor(_target), _reference, tolerance_batched); } @@ -256,15 +259,17 @@ using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMulti constexpr AbsoluteTolerance tolerance_quant(1); TEST_SUITE(FusedOffsetOutput) -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 }))) +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, + combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), + make("DataType", { DataType::QASYMM8 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_quant); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(), + make("DataType", { DataType::QASYMM8 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_quant); diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp index 49a38cd49c..8ca0bb53a7 100644 --- a/tests/validation/NEON/ReduceMean.cpp +++ b/tests/validation/NEON/ReduceMean.cpp @@ -46,8 +46,13 @@ constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value f #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC constexpr AbsoluteTolerance tolerance_f16(0.03f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */ #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef __aarch64__ constexpr AbsoluteTolerance tolerance_u8(1); /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */ +constexpr AbsoluteTolerance tolerance_s8(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */ +#else // __aarch64__ +constexpr AbsoluteTolerance tolerance_u8(2); /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */ constexpr AbsoluteTolerance tolerance_s8(2); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */ +#endif // __aarch64__ const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(2, 3), Coordinates(0, 1, 2, 3) }), framework::dataset::make("KeepDims", { true })); diff --git a/tests/validation/NEON/SoftmaxLayer.cpp 
b/tests/validation/NEON/SoftmaxLayer.cpp index b372bdf3fa..2397d81547 100644 --- a/tests/validation/NEON/SoftmaxLayer.cpp +++ b/tests/validation/NEON/SoftmaxLayer.cpp @@ -22,14 +22,12 @@ * SOFTWARE. */ #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "src/common/cpuinfo/CpuIsaInfo.h" #include "src/cpu/kernels/CpuSoftmaxKernel.h" #include "tests/NEON/Accessor.h" -#include "tests/PaddingCalculator.h" #include "tests/datasets/ShapeDatasets.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" @@ -42,6 +40,7 @@ namespace test { namespace validation { +using framework::dataset::make; namespace { /** Tolerance for float operations */ @@ -53,7 +52,7 @@ constexpr AbsoluteTolerance tolerance_qasymm8(1); constexpr AbsoluteTolerance tolerance_qasymm8_signed(1); /** CNN data types */ -const auto CNNDataTypes = framework::dataset::make("DataType", +const auto CNNDataTypes = make("DataType", { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC DataType::F16, @@ -66,53 +65,53 @@ TEST_SUITE(NEON) TEST_SUITE(SoftmaxLayer) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types - TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes - TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low - QuantizationInfo(1.f/256, 12)), - }), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), - TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), - TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - })), - framework::dataset::make("beta", { 1.0, - 2.0, - 1.0, - 2.0, - 1.0, - 1.0, - 2.0, - 1.0, - })), - framework::dataset::make("axis", { 0, - 0, - 0, - 1, - 0, - -1, - 2, - -3, - })), - framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })), - input_info, output_info, beta, axis, expected) +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types + TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + 
QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low + QuantizationInfo(1.f/256, 12)), + }), + make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), + TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + }), + make("beta", { 1.0, + 2.0, + 1.0, + 2.0, + 1.0, + 1.0, + 2.0, + 1.0, + }), + make("axis", { 0, + 0, + 0, + 1, + 0, + -1, + 2, + -3, + }), + make("Expected", { false, false, false, true, true, true, false, false })), + input_info, output_info, beta, axis, expected) { ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS); } @@ -122,54 +121,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( template using NESoftmaxLayerFixture = SoftmaxValidationFixture; -DATA_TEST_CASE(KernelSelection_max_logits, framework::DatasetMode::ALL, concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - })), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - }))), - cpu_ext, data_type) -{ - using namespace cpu::kernels; - - cpuinfo::CpuIsaInfo cpu_isa{}; - cpu_isa.neon = (cpu_ext == "NEON"); - cpu_isa.sve = (cpu_ext == "SVE"); - cpu_isa.fp16 = (data_type == DataType::F16); - - const auto *selected_impl = CpuLogits1DMaxKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); - - ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_logits_1d_max"; - std::string actual = selected_impl->name; - - ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); -} - -DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - })), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16 - }))), - combine(framework::dataset::make("CpuExt", std::string("SVE2")), - framework::dataset::make("DataType", { DataType::QASYMM8, - DataType::QASYMM8_SIGNED - }))), - cpu_ext, data_type) +DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, + concat(concat( + combine( + make("CpuExt", std::string("NEON")), + make("DataType", { DataType::F32, + DataType::F16, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED}) + ), + combine( + make("CpuExt", 
std::string("SVE")), + make("DataType", { DataType::F32, + DataType::F16})) + ), + combine( + make("CpuExt", std::string("SVE2")), + make("DataType", { DataType::QASYMM8, + DataType::QASYMM8_SIGNED})) + ), + cpu_ext, data_type) { using namespace cpu::kernels; @@ -179,11 +150,12 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca cpu_isa.sve2 = (cpu_ext == "SVE2"); cpu_isa.fp16 = (data_type == DataType::F16); - const auto *selected_impl = CpuLogits1DSoftmaxKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); + const auto *selected_impl = CpuSoftmaxKernel::get_implementation( + SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */ }, cpu::KernelSelectionType::Preferred); ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_softmax_logits_1d"; + std::string expected = "neon_" + cpu_impl_dt(data_type) + "_softmax"; std::string actual = selected_impl->name; ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); @@ -192,26 +164,32 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, 1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, 2, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 2, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); @@ -220,26 +198,30 @@ TEST_SUITE_END() //FP16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 
0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, -2, 3 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::Small4DShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -2, 3 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -252,29 +234,40 @@ using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, 1, -2 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0, 1, -2 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.0f }))), - 
framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }) + ), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -282,20 +275,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture, framew TEST_SUITE_END() //QASYMM8 TEST_SUITE(QASYMM8_SIGNED) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, 1, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, 1, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); diff --git a/tests/validation/UNIT/GPUTarget.cpp b/tests/validation/UNIT/GPUTarget.cpp index 5ec2592f00..2e64635b7a 100644 --- a/tests/validation/UNIT/GPUTarget.cpp +++ b/tests/validation/UNIT/GPUTarget.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -62,6 +62,8 @@ TEST_CASE(GetGPUTargetFromName, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G310") == GPUTarget::G310, framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G715") == GPUTarget::G715, framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G615") == GPUTarget::G615, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G720") == GPUTarget::G720, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G620") == GPUTarget::G620, framework::LogLevel::ERRORS); } TEST_CASE(GPUTargetIsIn, framework::DatasetMode::ALL) diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index 1492ac6945..a65a1e6bd8 100644 --- a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. 
+ * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE -#define ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_GEMMLOWPFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_GEMMLOWPFIXTURE_H #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/utils/quantization/AsymmHelpers.h" +#include "tests/validation/Helpers.h" #include "tests/framework/Fixture.h" #include "tests/validation/Validation.h" #include "tests/validation/reference/GEMMLowp.h" +#include +#include + namespace arm_compute { namespace test @@ -37,82 +42,46 @@ namespace validation { namespace { + template void fill(U &&tensor, int i) { - switch(tensor.data_type()) - { - case DataType::QSYMM8_PER_CHANNEL: - { - int min_bound = 128; - int max_bound = -127; - for(size_t j = 0; j < tensor.quantization_info().scale().size(); j++) - { - std::pair bounds = get_symm_quantized_per_channel_bounds(tensor.quantization_info(), -1.0f, 1.0f, i); - if(bounds.first < min_bound) - { - min_bound = bounds.first; - } - if(bounds.second > max_bound) - { - max_bound = bounds.second; - } - } - std::uniform_int_distribution distribution(min_bound, max_bound); - library->fill(tensor, distribution, i); - break; - } - case DataType::QASYMM8: - { - std::uniform_int_distribution distribution(1, 254); - library->fill(tensor, distribution, i); - break; - } - case DataType::S32: - { - std::uniform_int_distribution distribution(-20000, 20000); - library->fill(tensor, distribution, i); - break; - } - case DataType::F16: - { - arm_compute::utils::uniform_real_distribution_16bit distribution{ -1.0f, 1.0f }; - library->fill(tensor, distribution, i); - break; - } - case DataType::F32: - { - std::uniform_real_distribution distribution(-1.0f, 1.0f); - library->fill(tensor, distribution, i); - break; - } - default: - library->fill_tensor_uniform(tensor, i); - } + ARM_COMPUTE_ASSERT(is_data_type_quantized(tensor.data_type())); + library->fill_tensor_uniform(tensor, i); } +template +void fill_bias_s32(U &&tensor, int i, int32_t min, int32_t max) +{ + ARM_COMPUTE_ASSERT(tensor.data_type() == DataType::S32); + std::uniform_int_distribution distribution(min, max); + library->fill(tensor, distribution, i); +} + +/** Information about how to fill tensors */ +struct TensorFillInfo +{ + // Bias fill range. 
Default values are arbitrary + int32_t min_bias {-20000}; + int32_t max_bias {20000}; + // Optional extra hash to randomize tensor filling + int32_t hash {0}; +}; + template -TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset, - GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, - QuantizationInfo b_qinfo = QuantizationInfo(), bool reshape_b_only_on_first_run = false) +TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, + const QuantizationInfo& output_qinfo, DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, + GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo() ) { + ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a)); + ARM_COMPUTE_ASSERT(data_type_a == data_type_b); // Create tensors - DataType data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a; + const DataType data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a; - TensorType a = create_tensor(shape_a, data_type_a, 1); - TensorType b = create_tensor(shape_b, data_type_b, 1); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated - TensorType output = create_tensor(shape_output, data_type_output, 1); + TensorType a = create_tensor(shape_a, data_type_a, 1, a_qinfo); + TensorType b = create_tensor(shape_b, data_type_b, 1, b_qinfo); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated + TensorType output = create_tensor(shape_output, data_type_output, 1, output_qinfo /* output_qinfo will be ignored when output stage type is None */); - a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset)); - - if(data_type_b == DataType::QSYMM8_PER_CHANNEL) - { - b.info()->set_quantization_info(b_qinfo); - } - else - { - b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset)); - } TensorType bias; if(is_fused) { @@ -142,26 +111,26 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape ARM_COMPUTE_ASSERT(!output.info()->is_resizable()); // Fill tensors - fill(AccessorType(a), 0); - fill(AccessorType(b), 1); + fill(AccessorType(a), 0 + finfo.hash); + fill(AccessorType(b), 1 + finfo.hash); if(is_fused) { ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); bias.allocator()->allocate(); ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); - fill(AccessorType(bias), 2); + fill_bias_s32(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias); } // Run with variable inputs. 
    if(run_twice)
    {
        gemmlowp.run();
-        fill(AccessorType(a), 3); // Fill tensors with new seed after run
-        fill(AccessorType(b), 4);
+        fill(AccessorType(a), 3 + finfo.hash); // Fill tensors with new seed after run
+        fill(AccessorType(b), 4 + finfo.hash);
        if(is_fused)
        {
-            fill(AccessorType(bias), 5);
+            fill_bias_s32(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias);
        }
    }
@@ -171,9 +140,11 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape
 }
 template
-SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
-                                        DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, QuantizationInfo b_qinfo = QuantizationInfo())
+SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
+                                        DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, const TensorFillInfo& finfo = TensorFillInfo())
 {
+    ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a));
+    ARM_COMPUTE_ASSERT(data_type_a == data_type_b);
     TensorShape shape_a_to_use = shape_a;
     if(reinterpret_input_as_3d)
     {
@@ -182,8 +153,8 @@ SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, con
     }
     // Create reference
-    SimpleTensor a{ shape_a_to_use, data_type_a, 1 };
-    SimpleTensor b{ shape_b, data_type_b, 1, data_type_b == DataType::QSYMM8_PER_CHANNEL ? b_qinfo : QuantizationInfo(1.0f / 255, b_offset) };
+    SimpleTensor a{ shape_a_to_use, data_type_a, 1, a_qinfo };
+    SimpleTensor b{ shape_b, data_type_b, 1, b_qinfo };
     TensorShape shape_a_to_use_transposed{ shape_a_to_use };
     TensorShape shape_b_transposed{ shape_b };
@@ -193,12 +164,12 @@ SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, con
     shape_b_transposed.set(0, shape_b[1]);
     shape_b_transposed.set(1, shape_b[0]);
-    SimpleTensor a_transposed{ shape_a_to_use_transposed, data_type_a, 1 };
-    SimpleTensor b_transposed{ shape_b_transposed, data_type_b, 1, data_type_b == DataType::QSYMM8_PER_CHANNEL ? b_qinfo : QuantizationInfo(1.0f / 255, b_offset) };
+    SimpleTensor a_transposed{ shape_a_to_use_transposed, data_type_a, 1, a_qinfo };
+    SimpleTensor b_transposed{ shape_b_transposed, data_type_b, 1, b_qinfo };
     // Fill reference
-    fill(a, 0);
-    fill(b, 1);
+    fill(a, 0 + finfo.hash);
+    fill(b, 1 + finfo.hash);
     // Transpose reference if required
     /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M),
@@ -216,16 +187,18 @@ SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, con
     }
     // Run with variable inputs.
+    const int32_t a_offset = a_qinfo.uniform().offset;
+    const int32_t b_offset = b_qinfo.uniform().offset;
     if(run_twice)
     {
         reference::gemmlowp_matrix_multiply_core((pretranspose_A ? a_transposed : a), (pretranspose_B ? b_transposed : b), shape_output, a_offset, b_offset);
-        fill((pretranspose_A) ? a_transposed : a, 3);
-        fill((pretranspose_B) ? b_transposed : b, 4);
+        fill((pretranspose_A) ? a_transposed : a, 3 + finfo.hash);
+        fill((pretranspose_B) ? b_transposed : b, 4 + finfo.hash);
     }
     return reference::gemmlowp_matrix_multiply_core((pretranspose_A ? a_transposed : a), (pretranspose_B ? b_transposed : b), shape_output, a_offset, b_offset);
 }
-}
+} // namespace
 template
 class GEMMLowpMatrixMultiplyCoreValidationFixture : public framework::Fixture
@@ -233,20 +206,22 @@ class GEMMLowpMatrixMultiplyCoreValidationFixture : public framework::Fixture
 public:
     void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
     {
-        _target    = compute_target(shape_a, shape_b, shape_output, a_offset, b_offset);
-        _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, b_offset);
+        const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset);
+        const auto b_qinfo = QuantizationInfo(1.0f / 255, b_offset);
+        _target    = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo);
+        _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo);
     }
 protected:
-    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo)
     {
-        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_offset,
-                                       b_offset);
+        const auto output_qinfo = QuantizationInfo(); // No output stage
+        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo);
     }
-    SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
+    SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo)
     {
-        return compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_offset, b_offset);
+        return compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo);
     }
     TensorType _target{};
@@ -257,54 +232,138 @@ template
+    static void setup_quantization(DataType data_type, const TensorShape& shape_a, const TensorShape& shape_b, QuantizationInfo& a_qinfo, QuantizationInfo& b_qinfo, QuantizationInfo& output_qinfo, TensorFillInfo& finfo)
+    {
+        // This hash is used by the random generators. There may be hash collisions, but
+        // this is intentional as it is a very easy way to make the random generation
+        // process differ across test configurations which previously all used the
+        // same set of values.
+        finfo.hash = shape_a[0] + shape_a[1] + shape_b[0] + shape_b[1];
+
+        const int32_t t_max = static_cast(std::numeric_limits::max());
+        const int32_t t_min = static_cast(std::numeric_limits::min());
+
+        std::mt19937 generator(library->seed() + finfo.hash);
+        std::uniform_real_distribution distribution_float(-5.0f, 3.0f);
+        std::uniform_int_distribution distribution_t(t_min, t_max);
+
+        const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+        const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+
+        const int32_t offset_lhs = distribution_t(generator);
+        const int32_t offset_rhs = distribution_t(generator);
+
+        a_qinfo = QuantizationInfo(scale_lhs, offset_lhs);
+        b_qinfo = QuantizationInfo(scale_rhs, offset_rhs);
+
+        // reinterpret_input_as_3d or reinterpret_output_as_3d can be ignored, as the underlying gemm / matmul computation
+        // is equivalent to a standard 2D one with m-n-k dimensions
+        const int m = shape_a.y();
+        const int n = shape_b.x();
+        const int k = shape_a.x();
+
+        const float bias_fraction = 0.5f; // is_fused is enabled in compute_gemmlowp_target below, so the bias is included
+
+        QuantizationHint q_hint = suggest_matmul_dst_q_info_and_bias(a_qinfo, b_qinfo, m, n, k, data_type, bias_fraction);
+        output_qinfo   = q_hint.q_info;
+        finfo.min_bias = q_hint.bias_min;
+        finfo.max_bias = q_hint.bias_max;
+
+        // Both the target and the reference implementation use negated offsets, i.e.
+        //     float_val = (int_val + offset) * scale
+        // instead of the usual
+        //     float_val = (int_val - offset) * scale
+        // Therefore, after calculating the output quantization above, we negate the
+        // inputs' offsets.
+        a_qinfo = QuantizationInfo(scale_lhs, -offset_lhs);
+        b_qinfo = QuantizationInfo(scale_rhs, -offset_rhs);
+    }
+
+    /** Initialize output stage info from quantization info */
+    static Status init_gemmlowp_output_stage_info(
+        DataType                data_type,
+        const QuantizationInfo& a_qinfo,
+        const QuantizationInfo& b_qinfo,
+        const QuantizationInfo& output_qinfo,
+        GEMMLowpOutputStageType type,
+        GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_quantized_asymmetric(data_type));
+
+        const UniformQuantizationInfo aq_unif = a_qinfo.uniform();
+        const UniformQuantizationInfo bq_unif = b_qinfo.uniform();
+        const UniformQuantizationInfo oq_unif = output_qinfo.uniform();
+
+        float   multiplier = (aq_unif.scale * bq_unif.scale) / oq_unif.scale;
+        int32_t int_multiplier;
+        int32_t shift;
+
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &int_multiplier, &shift));
+
+        int32_t type_min = 0;
+        int32_t type_max = 0;
+        std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(output_qinfo, ActivationLayerInfo(), data_type);
+
+        gemmlowp_output_stage_info.gemmlowp_real_multiplier = multiplier;
+        gemmlowp_output_stage_info.gemmlowp_multiplier      = int_multiplier;
+        gemmlowp_output_stage_info.gemmlowp_multipliers     = { int_multiplier };
+        gemmlowp_output_stage_info.gemmlowp_shift           = shift;
+        gemmlowp_output_stage_info.gemmlowp_shifts          = { shift };
+        gemmlowp_output_stage_info.gemmlowp_offset          = oq_unif.offset;
+        gemmlowp_output_stage_info.type                     = type;
+        gemmlowp_output_stage_info.gemmlowp_min_bound       = type_min;
+        gemmlowp_output_stage_info.gemmlowp_max_bound       = type_max;
+
+        return Status{};
+    }
+
+    /** Currently this fixture only tests the following data type configurations:
+     *
+     *  1. a and b are of the same data type
+     *  2. The data type is quantized asymmetric
+     *
+     */
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type, bool reshape_b_only_on_first_run)
     {
-        ARM_COMPUTE_ASSERT(output_stage.type != GEMMLowpOutputStageType::NONE);
-        DataType data_type_a = data_type_b == DataType::QASYMM8_SIGNED ? DataType::QASYMM8_SIGNED : DataType::QASYMM8;
+        ARM_COMPUTE_ASSERT(output_stage_type != GEMMLowpOutputStageType::NONE);
+        ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type));
-        if(data_type_b == DataType::QSYMM8_PER_CHANNEL)
-        {
-            output_stage.is_quantized_per_channel = true;
-            const size_t num_channels = shape_b[0];
-            std::vector scales(num_channels);
-            std::uniform_real_distribution distribution(0.f, 1.f);
-            library->fill(scales, distribution, 0);
-            output_stage.gemmlowp_multipliers.resize(num_channels);
-            output_stage.gemmlowp_shifts.resize(num_channels);
-            for(size_t i = 0; i < num_channels; ++i)
-            {
-                quantization::calculate_quantized_multiplier(scales[i], &output_stage.gemmlowp_multipliers[i], &output_stage.gemmlowp_shifts[i]);
-            }
+        // Randomized dynamic quantization: randomize the quantization info in a way that avoids
+        // result saturation most of the time
+        QuantizationInfo a_qinfo;
+        QuantizationInfo b_qinfo;
+        QuantizationInfo output_qinfo;
+        TensorFillInfo   finfo;
+        setup_quantization(data_type, shape_a, shape_b, a_qinfo, b_qinfo, output_qinfo, finfo);
-            _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, 0, output_stage, data_type_a, data_type_b, QuantizationInfo(scales));
-            _target    = compute_target(shape_a, shape_b, shape_output, a_offset, 0, output_stage, data_type_a, data_type_b, QuantizationInfo(scales), reshape_b_only_on_first_run);
-        }
-        else
-        {
-            _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage, data_type_a, data_type_b, QuantizationInfo());
-            _target    = compute_target(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage, data_type_a, data_type_b, QuantizationInfo(), reshape_b_only_on_first_run);
-        }
+        GEMMLowpOutputStageInfo output_stage;
+        init_gemmlowp_output_stage_info(data_type, a_qinfo, b_qinfo, output_qinfo, output_stage_type, output_stage);
+
+        _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type, data_type, output_stage, finfo);
+        _target    = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, data_type, data_type, output_stage, reshape_b_only_on_first_run, finfo);
     }
 protected:
-    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage,
-                              DataType data_type_a, DataType data_type_b, QuantizationInfo b_qinfo, bool reshape_b_only_on_first_run = false)
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const QuantizationInfo& output_qinfo,
+                              DataType data_type_a, DataType data_type_b, const GEMMLowpOutputStageInfo& output_stage, bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo())
     {
-        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_offset,
-                                       b_offset,
-                                       output_stage, data_type_a, data_type_b, b_qinfo, reshape_b_only_on_first_run);
+        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_qinfo,
+                                       b_qinfo, output_qinfo,
+                                       data_type_a, data_type_b, output_stage, reshape_b_only_on_first_run, finfo);
     }
-    SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
-                                   GEMMLowpOutputStageInfo output_stage, DataType data_type_a, DataType data_type_b, QuantizationInfo b_qinfo)
+    SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
+                                   DataType data_type_a, DataType data_type_b, const GEMMLowpOutputStageInfo& output_stage, const TensorFillInfo& finfo = TensorFillInfo())
     {
-        SimpleTensor output = compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_offset, b_offset, data_type_a, data_type_b,
-                                                         b_qinfo);
+        SimpleTensor output = compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type_a, data_type_b, finfo);
         TensorShape bias_shape(shape_b[0]);
         SimpleTensor bias{ bias_shape, DataType::S32, 1 };
-        (run_twice) ? fill(bias, 5) : fill(bias, 2); // Fill bias with same seed as last run of gemmlowp_target
+        (run_twice) ? fill_bias_s32(bias, 5 + finfo.hash, finfo.min_bias, finfo.max_bias) : fill_bias_s32(bias, 2 + finfo.hash, finfo.min_bias, finfo.max_bias); // Fill bias with same seed as last run of gemmlowp_target
         switch(output_stage.type)
         {
@@ -330,10 +389,10 @@ class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture
 {
 public:
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage, DataType data_type_b)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type)
     {
         GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture::setup(shape_a, shape_b,
-                                                                                   shape_output, a_offset, b_offset, output_stage, data_type_b, false);
+                                                                                   shape_output, output_stage_type, data_type, false /* reshape_b_only_on_first_run */);
     }
 };
@@ -2076,4 +2135,4 @@ class GEMMLowpMatrixMultiplyNative3DValidationFixture : public framework::Fixtur
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_GEMMLOWPFIXTURE_H
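Reviewer note (not part of the patch): init_gemmlowp_output_stage_info() above folds the real requantization multiplier (a_scale * b_scale) / out_scale into an integer multiplier plus shift via quantization::calculate_quantized_multiplier(). The sketch below only illustrates that decomposition under the assumption of a Q0.31 fixed-point representation; the helper name decompose_multiplier is made up and this is not the library's implementation.

// Illustrative sketch: split a real multiplier m into a Q0.31 fixed-point
// multiplier and a shift such that m ~= fixed_point * 2^-31 * 2^-shift.
#include <cmath>
#include <cstdint>
#include <utility>

std::pair<int32_t, int32_t> decompose_multiplier(float m)
{
    int   exponent = 0;
    float mantissa = std::frexp(m, &exponent); // m = mantissa * 2^exponent, mantissa in [0.5, 1)

    auto fixed_point = static_cast<int64_t>(std::round(mantissa * (1ll << 31)));
    if(fixed_point == (1ll << 31)) // rounding can push the mantissa up to exactly 1.0
    {
        fixed_point /= 2;
        ++exponent;
    }
    // A positive shift denotes a right shift after the fixed-point multiplication.
    return { static_cast<int32_t>(fixed_point), -exponent };
}

For example, decompose_multiplier(0.25f) yields {1 << 30, 1}: multiply by 0.5 in Q0.31, then shift right by one.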
diff --git a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
index 1e8820492a..20b678b36c 100644
--- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, 2023 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE
-#define ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_WINOGRADCONVOLUTIONLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_WINOGRADCONVOLUTIONLAYERFIXTURE_H
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
@@ -229,7 +229,7 @@ class WinogradConvolutionLayerFastMathValidationFixture : public framework::Fixt
         SimpleTensor filter_transform_out = reference::winograd_filter_transform(weights_t1, filter_transform_shape, winograd_info);
         SimpleTensor batched_gemm         = reference::gemm(input_transform_out, filter_transform_out, dummy_c, 1.0f, 0.0f);
         SimpleTensor conv_out             = reference::winograd_output_transform(batched_gemm, bias_t1, output_transform_shape, winograd_info);
-        SimpleTensor conv_out_t(std::move(copy_tensor(conv_out)));
+        SimpleTensor conv_out_t(copy_tensor(conv_out));
         return (act_info.enabled()) ? reference::activation_layer(conv_out_t, act_info) : conv_out_t;
     }
@@ -584,4 +584,4 @@ class WinogradOutputTransformValidationFixture : public framework::Fixture
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE */
\ No newline at end of file
+#endif // ACL_TESTS_VALIDATION_FIXTURES_WINOGRADCONVOLUTIONLAYERFIXTURE_H
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 4f14d985af..e8831a354c 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -2230,6 +2230,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GPUTarget &gpu_targe
         case GPUTarget::VALHALL:
             os << "VALHALL";
             break;
+        case GPUTarget::FIFTHGEN:
+            os << "FIFTHGEN";
+            break;
         case GPUTarget::T600:
             os << "T600";
             break;
@@ -2299,6 +2302,12 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GPUTarget &gpu_targe
         case GPUTarget::G615:
             os << "G615";
             break;
+        case GPUTarget::G720:
+            os << "G720";
+            break;
+        case GPUTarget::G620:
+            os << "G620";
+            break;
         default:
             ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
     }
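Reviewer note (not part of the patch): the operator<< cases added above make the new fifth-generation targets printable wherever a GPUTarget is streamed. A minimal usage sketch, assuming the header is reachable as utils/TypePrinter.h on the include path (the helper name print_target is illustrative):

#include <sstream>
#include <string>

#include "utils/TypePrinter.h"

// Convert a GPUTarget to its printable name via the streaming operator above,
// e.g. GPUTarget::G720 -> "G720".
inline std::string print_target(arm_compute::GPUTarget target)
{
    std::stringstream ss;
    ss << target;
    return ss.str();
}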