diff --git a/Android.bp b/Android.bp index 7681205770..f7d4d257d6 100644 --- a/Android.bp +++ b/Android.bp @@ -392,6 +392,7 @@ cc_library_static { "src/core/Utils.cpp", "src/core/Validate.cpp", "src/core/Version.cpp", + "src/core/helpers/LUTManager.cpp", "src/core/helpers/SoftmaxHelpers.cpp", "src/core/helpers/Utils.cpp", "src/core/helpers/WindowHelpers.cpp", @@ -488,6 +489,8 @@ cc_library_static { "src/cpu/kernels/crop/generic/neon/fp16.cpp", "src/cpu/kernels/crop/generic/neon/fp32.cpp", "src/cpu/kernels/crop/generic/neon/integer.cpp", + "src/cpu/kernels/depth_to_space/nchw/any/impl.cpp", + "src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp", @@ -515,6 +518,8 @@ cc_library_static { "src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp", "src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp", "src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp", + "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp", + "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp", "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp", "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp", "src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp", @@ -543,6 +548,10 @@ cc_library_static { "src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp", "src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp", "src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp", + "src/cpu/kernels/mul/generic/neon/fp16.cpp", + "src/cpu/kernels/mul/generic/neon/fp32.cpp", + "src/cpu/kernels/norm_layer/generic/neon/fp16.cpp", + "src/cpu/kernels/norm_layer/generic/neon/fp32.cpp", "src/cpu/kernels/pool2d/neon/fp16.cpp", "src/cpu/kernels/pool2d/neon/fp32.cpp", "src/cpu/kernels/pool2d/neon/nchw/all.cpp", @@ -1033,6 +1042,7 @@ cc_library_static { "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp", "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp", "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp", + "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp", "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp", "utils/CommonGraphOptions.cpp", "utils/GraphUtils.cpp", diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dd3e2cef7..35b6ca2b7f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Arm Limited. +# Copyright (c) 2023-2024 Arm Limited. 
# # SPDX-License-Identifier: MIT # @@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute) project( ArmCompute - VERSION 33.0.0 + VERSION 34.0.0 DESCRIPTION "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures" LANGUAGES C CXX ASM) diff --git a/LICENSE b/LICENSE index 0d2cb83aaa..781685ab31 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2017-2023 Arm Limited +Copyright (c) 2017-2024 Arm Limited Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 9b06dbeabf..71a6518594 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,8 @@ -> **⚠ Important** -> From release 22.05: 'master' branch has been replaced with 'main' following our inclusive language update, more information [here](https://arm-software.github.io/ComputeLibrary/latest/contribution_guidelines.xhtml#S5_0_inc_lang). - -> **⚠ Important** -> From release 22.08: armv7a with Android build will no longer be tested or maintained. +> **⚠ Deprecation Notice** +> 24.01 announcement: NCHW data format specific optimizations will gradually be removed from the code base in +> future releases. The implication of this is that the user is expected to translate NCHW models into NHWC in +> order to benefit from the optimizations. > **⚠ Important** > From release 23.02: The 23.02 release introduces a change to the default tensor extend padding behavior. @@ -16,7 +15,7 @@

-# Compute Library ![](https://img.shields.io/badge/latest_release-23.11-green) +# Compute Library ![](https://img.shields.io/badge/latest_release-24.01-green) The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.
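The NCHW deprecation notice above implies that callers should hand the library NHWC tensors to keep the optimized paths. A minimal sketch of one way to do that translation at runtime with the library's existing NEPermute function follows; it assumes that PermutationVector(2U, 0U, 1U) is the NCHW to NHWC mapping in the library's innermost-first dimension ordering and that configure() auto-initializes the destination tensor info.

```cpp
// Sketch: convert an NCHW tensor to NHWC with NEPermute before running
// NHWC-optimized operators. Assumes PermutationVector(2U, 0U, 1U) is the
// NCHW -> NHWC mapping and that configure() auto-initializes `dst`.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // NCHW source: the shape is stored innermost-first, i.e. (W, H, C, N).
    Tensor     src, dst;
    TensorInfo src_info(TensorShape(224U, 224U, 3U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NCHW);
    src.allocator()->init(src_info);

    NEPermute permute;
    permute.configure(&src, &dst, PermutationVector(2U, 0U, 1U));

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src with image data here ...

    permute.run(); // dst now holds the same data in NHWC order
    return 0;
}
```

If the model can be exported in NHWC in the first place, the runtime permute can be skipped entirely.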
@@ -44,7 +43,7 @@ Key Features:
## Documentation -[![Documentation](https://img.shields.io/badge/documentation-23.11-green)](https://arm-software.github.io/ComputeLibrary/latest) +[![Documentation](https://img.shields.io/badge/documentation-24.01-green)](https://arm-software.github.io/ComputeLibrary/latest) > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc. @@ -57,24 +56,24 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C | Platform | Operating System | Release archive (Download) | | -------------- | ---------------- | -------------------------- | -| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon.tar.gz) | -| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) | -| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) | -| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-neon.tar.gz) | +| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) | +| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) 
[![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon-cl.tar.gz) |
| Architecture | Operating System | Release archive (Download) | | ------------ | ---------------- | -------------------------- | -| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-armv7a-neon-cl.tar.gz) | -| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8a-neon-cl.tar.gz) | -| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8a-neon-cl.tar.gz) | -| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-android-arm64-v8.2-a-neon-cl.tar.gz) | -| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v23.11/arm_compute-v23.11-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) | +| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-cl.tar.gz) 
[![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-armv7a-neon-cl.tar.gz) | +| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8a-neon-cl.tar.gz) | +| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8a-neon-cl.tar.gz) | +| arm64-v8.2-a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-android-arm64-v8.2-a-neon-cl.tar.gz) | +| arm64-v8.2-a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8.2-a-neon.tar.gz) [![](https://img.shields.io/badge/build-opencl-blue)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8.2-a-cl.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.01/arm_compute-v24.01-bin-linux-arm64-v8.2-a-neon-cl.tar.gz) |
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v23.11-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v23.11) +Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.01-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.01) Pre-build binaries are generated with the following security / good coding practices related flags: > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong diff --git a/SConscript b/SConscript index 099ff706ad..96b3bdc689 100644 --- a/SConscript +++ b/SConscript @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -# Copyright (c) 2016-2023 Arm Limited. +# Copyright (c) 2016-2024 Arm Limited. # # SPDX-License-Identifier: MIT # @@ -31,15 +31,8 @@ import zlib import json import codecs -from SCons.Warnings import warn, DeprecatedWarning - -warn(DeprecatedWarning, - "DEPRECATION NOTICE: Legacy libarm_compute_core has been deprecated and is scheduled for removal in 24.02 release." - " Link your application only to libarm_compute for core library functionality" - ) - -VERSION = "v23.11" -LIBRARY_VERSION_MAJOR = 33 +VERSION = "v24.01" +LIBRARY_VERSION_MAJOR = 34 LIBRARY_VERSION_MINOR = 0 LIBRARY_VERSION_PATCH = 0 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH) @@ -89,31 +82,42 @@ def build_obj_list(arch_info, sources, static=False): # A list of static objects # A list of shared objects -def build_lib_objects(): +def build_multiisa_lib_objects(): lib_static_objs = [] # static objects lib_shared_objs = [] # shared objects + # note that ARM_COMPUTE_ENABLE_FP16 is enabled in update_data_type_layout_flags() to make + # sure the environment is progated to the validation suite arm_compute_env.Append(CPPDEFINES = ['ENABLE_NEON', 'ARM_COMPUTE_ENABLE_NEON', - 'ENABLE_SVE', 'ARM_COMPUTE_ENABLE_SVE', - 'ARM_COMPUTE_ENABLE_FP16', 'ARM_COMPUTE_ENABLE_BF16', + 'ENABLE_SVE', 'ARM_COMPUTE_ENABLE_SVE','ARM_COMPUTE_ENABLE_BF16', 'ARM_COMPUTE_ENABLE_I8MM', 'ARM_COMPUTE_ENABLE_SVEF32MM']) # Build all the common files for the base architecture if env['arch'] == 'armv8a': - lib_static_objs += build_obj_list(filedefs["armv8-a"], lib_files, static=True) - lib_shared_objs += build_obj_list(filedefs["armv8-a"], lib_files, static=False) + lib_static_objs += build_obj_list(filedefs["armv8-a"], misa_lib_files, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8-a"], misa_lib_files, static=False) else: - lib_static_objs += build_obj_list(filedefs["armv8.2-a"], lib_files, static=True) - lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], lib_files, static=False) + lib_static_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files, static=False) + + # Build the FP16 specific files + lib_static_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files_neon_fp16, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files_neon_fp16, static=False) # Build the SVE specific files - lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], lib_files_sve, static=True) - lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], lib_files_sve, static=False) + lib_static_objs += 
build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve, static=False) + lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve_fp16, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve_fp16, static=False) + # Build the SVE2 specific files arm_compute_env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_SVE2']) - lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], lib_files_sve2, static=True) - lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], lib_files_sve2, static=False) + lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2, static=False) + lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2_fp16, static=True) + lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2_fp16, static=False) + return lib_static_objs, lib_shared_objs @@ -291,29 +295,29 @@ def get_attrs_list(env, data_types, data_layouts): return attrs -def get_operator_backend_files(filelist, operators, backend='', techs=[], attrs=[]): +def get_operator_backend_files(filelist, operators, backend='', techs=[], attrs=[], include_common=True): files = { "common" : [] } - # Early return if filelist is empty if backend not in filelist: return files - # Iterate over operators and create the file lists to compiler for operator in operators: if operator in filelist[backend]['operators']: - files['common'] += filelist[backend]['operators'][operator]["files"]["common"] + if include_common : + files['common'] += filelist[backend]['operators'][operator]["files"]["common"] for tech in techs: if tech in filelist[backend]['operators'][operator]["files"]: # Add tech as a key to dictionary if not there if tech not in files: files[tech] = [] - # Add tech files to the tech file list tech_files = filelist[backend]['operators'][operator]["files"][tech] - files[tech] += tech_files.get('common', []) + if include_common: + files[tech] += tech_files.get('common', []) for attr in attrs: files[tech] += tech_files.get(attr, []) + # Remove duplicates if they exist return {k: list(set(v)) for k,v in files.items()} @@ -615,6 +619,17 @@ if env['opencl']: lib_files_sve = [] lib_files_sve2 = [] +# the variables below are used for the multi_isa builds +# please note that the variables names without the _fp16 suffix +# do not hold any fp16 files. 
+ +misa_lib_files = lib_files +misa_lib_files_sve = [] +misa_lib_files_sve2 = [] +misa_lib_files_neon_fp16 = [] +misa_lib_files_sve_fp16 = [] +misa_lib_files_sve2_fp16 = [] + if env['neon']: # build winograd/depthwise sources for either v7a / v8a arm_compute_env.Append(CPPPATH = ["src/core/NEON/kernels/arm_gemm", @@ -627,8 +642,6 @@ if env['neon']: "arm_compute/core/NEON/kernels/assembly/", "src/cpu/kernels/assembly/"]) - lib_files += filelist['cpu']['common'] - # Setup SIMD file list to include simd = ['neon'] if env['multi_isa']: @@ -643,7 +656,6 @@ if env['neon']: else: attrs = get_attrs_list(env, env['data_type_support'], env['data_layout_support']) - if env['fixed_format_kernels']: attrs.append("fixed_format_kernels") @@ -651,19 +663,46 @@ if env['neon']: cpu_operators = custom_operators if use_custom_ops else filelist['cpu']['operators'].keys() cpu_ops_to_build = resolve_operator_dependencies(filelist, cpu_operators, 'cpu') - cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs) + if env['multi_isa']: + misa_lib_files += filelist['cpu']['common'] - # Shared among ALL CPU files - lib_files += cpu_files.get('common', []) + # For multi_isa builds we need to build fp16 files for armv8.2-a+fp16 so we filter them out of cpu_files removing the attribute fp16 + attrs.remove('fp16') + cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs) - # Arm® Neon™ specific files - lib_files += cpu_files.get('neon', []) + # Shared among ALL CPU files + misa_lib_files += cpu_files.get('common', []) - # SVE files only - lib_files_sve = cpu_files.get('sve', []) + # Arm® Neon™ specific files + misa_lib_files += cpu_files.get('neon', []) - # SVE2 files only - lib_files_sve2 = cpu_files.get('sve2', []) + # Get all the fp16 files + fp16_cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, ['fp16'],False) + + misa_lib_files_neon_fp16 = fp16_cpu_files.get('neon',[]) + misa_lib_files_sve_fp16 = fp16_cpu_files.get('sve',[]) + misa_lib_files_sve2_fp16 = fp16_cpu_files.get('sve2',[]) + + # SVE files only minus FP16 + misa_lib_files_sve = cpu_files.get('sve', []) + + # SVE2 files only minus FP16 + misa_lib_files_sve2 = cpu_files.get('sve2', []) + else: + lib_files += filelist['cpu']['common'] + + # Non multi_isa build + cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs) + + # Shared among ALL CPU files + lib_files += cpu_files.get('common', []) + + # Arm® Neon™ specific files + lib_files += cpu_files.get('neon', []) + + lib_files_sve = cpu_files.get('sve', []) + + lib_files_sve2 = cpu_files.get('sve2', []) graph_files += Glob('src/graph/backends/NEON/*.cpp') @@ -681,7 +720,7 @@ Export('bootcode_o') if (env['multi_isa']): - lib_static_objs, lib_shared_objs = build_lib_objects() + lib_static_objs, lib_shared_objs = build_multiisa_lib_objects() # STATIC library build. 
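The SConscript changes above split the FP16-specific sources into dedicated multi_isa file lists so they can be compiled with an FP16-capable -march while the rest of the library stays on the baseline architecture. Below is an illustrative sketch (not a file from the library) of the kind of guard that keeps such a translation unit safe in that scheme; the macro names are the ones used by these build scripts, while example::add_f16 is a hypothetical kernel.

```cpp
// Illustrative only: an fp16-only translation unit guarded so it compiles to
// nothing unless the build enables FP16 kernels and the compiler advertises
// FP16 vector arithmetic (e.g. -march=armv8.2-a+fp16).
#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

#include <arm_neon.h>

namespace example
{
// Hypothetical helper: add two fp16 buffers with NEON fp16 intrinsics.
void add_f16(const float16_t *a, const float16_t *b, float16_t *dst, int n)
{
    int i = 0;
    for (; i + 8 <= n; i += 8)
    {
        vst1q_f16(dst + i, vaddq_f16(vld1q_f16(a + i), vld1q_f16(b + i)));
    }
    for (; i < n; ++i)
    {
        dst[i] = a[i] + b[i]; // scalar tail
    }
}
} // namespace example

#endif // ENABLE_FP16_KERNELS && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
```

Because the guarded code only exists when the compiler flags allow it, the same source tree can be built either as a single-architecture library or as the multi_isa variant configured above.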
@@ -708,18 +747,6 @@ if env['os'] != 'bare_metal' and not env['standalone']: Export('arm_compute_so') -# Generate dummy core lib for backwards compatibility -if env['os'] == 'macos': - # macos static library archiver fails if given an empty list of files - arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, lib_files, static=True) -else: - arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, [], static=True) - -Export('arm_compute_core_a') - -if env['os'] != 'bare_metal' and not env['standalone']: - arm_compute_core_a_so = build_library('arm_compute_core', arm_compute_env, [], static=False) - Export('arm_compute_core_a_so') arm_compute_graph_env = arm_compute_env.Clone() diff --git a/SConstruct b/SConstruct index 68c518a4a0..cf8fb52bd6 100644 --- a/SConstruct +++ b/SConstruct @@ -62,8 +62,14 @@ def read_build_config_json(build_config): def update_data_type_layout_flags(env, data_types, data_layouts): # Manage data-types - if any(i in data_types for i in ['all', 'fp16']): - env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS']) + if env['multi_isa']: + if any(i in data_types for i in ['all', 'fp16']): + env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS', '-DARM_COMPUTE_ENABLE_FP16']) + else: + if not 'v8a' in env['arch'] and not 'v7a' in env['arch'] and not 'armv8r64' in env['arch']: + if any(i in data_types for i in ['all', 'fp16']): + env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS','-DARM_COMPUTE_ENABLE_FP16']) + if any(i in data_types for i in ['all', 'fp32']): env.Append(CXXFLAGS = ['-DENABLE_FP32_KERNELS']) if any(i in data_types for i in ['all', 'qasymm8']): @@ -112,7 +118,7 @@ vars.AddVariables( BoolVariable("exceptions", "Enable/disable C++ exception support", True), BoolVariable("high_priority", "Generate a library containing only the high priority operators", False), PathVariable("linker_script", "Use an external linker script", "", PathVariable.PathAccept), - PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure: + PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure: EXTERNAL_TESTS_DIR: └── tests ├── benchmark @@ -240,7 +246,6 @@ env.Append(CXXFLAGS = ['-DARCH_ARM', if not 'windows' in env['os']: env.Append(CXXFLAGS = ['-Wall','-std=c++14', '-pedantic' ]) -env.Append(CPPDEFINES = ['_GLIBCXX_USE_NANOSLEEP']) cpp_tool = {'linux': 'g++', 'android' : 'clang++', 'tizen': 'g++', 'macos':'clang++', @@ -312,8 +317,7 @@ if env['multi_isa']: Exit(1) if 'v8a' in env['arch']: - print("INFO: multi_isa armv8-a architecture build doesn't enable __ARM_FEATURE_FP16_VECTOR_ARITHMETIC. 
Use armv8.2-a or beyond to enable FP16 vector arithmetic support") - env.Append(CXXFLAGS = ['-march=armv8-a']) # note: this will disable fp16 extension __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + env.Append(CXXFLAGS = ['-march=armv8-a']) else: if 'v8.6-a' in env['arch']: if "disable_mmla_fp" not in env['custom_options']: @@ -536,7 +540,7 @@ if env['standalone']: if not 'windows' in env['os']: env.Append(CXXFLAGS = ['-fPIC']) env.Append(LINKFLAGS = ['-static-libgcc','-static-libstdc++']) - + if env['Werror']: env.Append(CXXFLAGS = ['-Werror']) @@ -597,7 +601,7 @@ if env['debug']: else: env.Append(CXXFLAGS = ['-Z7','-MTd','-fms-compatibility','-fdelayed-template-parsing']) env.Append(LINKFLAGS = ['-DEBUG']) - + env.Append(CPPDEFINES = ['ARM_COMPUTE_DEBUG_ENABLED']) else: if not 'windows' in env['os']: diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index a5c4e39df2..8b5bf97099 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_OPENCL_H -#define ARM_COMPUTE_OPENCL_H +#ifndef ACL_ARM_COMPUTE_CORE_CL_OPENCL_H +#define ACL_ARM_COMPUTE_CORE_CL_OPENCL_H #include #include @@ -139,6 +139,7 @@ class CLSymbols final DECLARE_FUNCTION_PTR(clWaitForEvents); DECLARE_FUNCTION_PTR(clCreateImage); DECLARE_FUNCTION_PTR(clSetKernelExecInfo); + DECLARE_FUNCTION_PTR(clGetExtensionFunctionAddressForPlatform); // Command buffer and mutable dispatch command buffer extensions DECLARE_FUNCTION_PTR(clCreateCommandBufferKHR); @@ -159,4 +160,4 @@ class CLSymbols final std::pair _loaded; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_OPENCL_H */ +#endif // ACL_ARM_COMPUTE_CORE_CL_OPENCL_H diff --git a/arm_compute/core/GPUTarget.h b/arm_compute/core/GPUTarget.h index affa79a89e..b107a52d9f 100644 --- a/arm_compute/core/GPUTarget.h +++ b/arm_compute/core/GPUTarget.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_GPUTARGET_H -#define ARM_COMPUTE_GPUTARGET_H +#ifndef ACL_ARM_COMPUTE_CORE_GPUTARGET_H +#define ACL_ARM_COMPUTE_CORE_GPUTARGET_H #include "support/Traits.h" @@ -39,6 +39,7 @@ enum class GPUTarget MIDGARD = 0x100, BIFROST = 0x200, VALHALL = 0x300, + FIFTHGEN = 0X400, T600 = 0x110, T700 = 0x120, T800 = 0x130, @@ -62,6 +63,8 @@ enum class GPUTarget G310 = 0x343, G715 = 0x350, G615 = 0x351, + G720 = 0x410, + G620 = 0X411 }; /** Enable bitwise operations on GPUTarget enumerations */ @@ -114,4 +117,4 @@ inline bool gpu_target_is_in(GPUTarget target_to_check, GPUTarget target) return target_to_check == target; } } // namespace arm_compute -#endif /* ARM_COMPUTE_GPUTARGET_H */ +#endif // ACL_ARM_COMPUTE_CORE_GPUTARGET_H diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 86dcfdc3d0..e97d81390e 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -60,7 +60,14 @@ inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordin { // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); +#pragma GCC diagnostic pop + for (int i = 0; i < reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h index 195b67cf99..9390d0c54f 100644 --- a/arm_compute/function_info/ActivationLayerInfo.h +++ b/arm_compute/function_info/ActivationLayerInfo.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO -#define ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO +#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H +#define ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H #include "arm_compute/core/CoreTypes.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/QuantizationInfo.h" #include <array> +#include <memory> + +#ifdef __aarch64__ +#include <arm_neon.h> +#endif // __arch64__ namespace arm_compute { @@ -58,7 +64,10 @@ class ActivationLayerInfo typedef arm_compute::ActivationFunction ActivationFunction; /** Lookup table */ - using LookupTable256 = std::array<qasymm8_t, 256>; +#ifdef __aarch64__ + using LookupTable256 = std::array<qasymm8_t, 256>; + using LookupTable65536 = std::array<float16_t, 65536>; +#endif // __aarch64__ ActivationLayerInfo() = default; /** Default Constructor @@ -101,6 +110,16 @@ class ActivationLayerInfo { _lut = std::move(lut); } + + const LookupTable65536 &lut_fp16() const + { + ARM_COMPUTE_ERROR_ON(_lut_fp16 == nullptr); + return *_lut_fp16; + } + void setLookupTable65536(std::shared_ptr<LookupTable65536> lut) + { + _lut_fp16 = lut; + } #endif // __aarch64__ private: ActivationFunction _act = {ActivationLayerInfo::ActivationFunction::IDENTITY}; @@ -109,8 +128,9 @@ class ActivationLayerInfo bool _enabled = {false}; #ifdef __aarch64__ - LookupTable256 _lut = {}; + LookupTable256 _lut = {}; + std::shared_ptr<LookupTable65536> _lut_fp16{nullptr}; #endif // __aarch64__ }; } // namespace arm_compute -#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO */ +#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h index c7df29a704..d27369670e 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,25 +21,27 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H -#define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include namespace arm_compute { // Forward declarations class ITensor; class ITensorInfo; +class NEDepthToSpaceLayerKernel; /** Basic function to run @ref NEDepthToSpaceLayerKernel. */ -class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder +class NEDepthToSpaceLayer : public IFunction { public: /** Constructor */ - NEDepthToSpaceLayer() = default; + NEDepthToSpaceLayer(); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDepthToSpaceLayer(const NEDepthToSpaceLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -49,7 +51,7 @@ class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder /** Prevent instances of this class from being moved (As this class contains non movable objects) */ NEDepthToSpaceLayer &operator=(NEDepthToSpaceLayer &&) = delete; /** Default destructor */ - ~NEDepthToSpaceLayer() = default; + ~NEDepthToSpaceLayer(); /** Set the input and output tensors. * * Valid data layouts: @@ -75,6 +77,11 @@ class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + + void run() override; + +private: + std::unique_ptr _kernel; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h index bd29cbb31f..7c83f86caa 100644 --- a/arm_compute/runtime/Scheduler.h +++ b/arm_compute/runtime/Scheduler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2019, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_SCHEDULER_H -#define ARM_COMPUTE_SCHEDULER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H +#define ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H #include "arm_compute/runtime/IScheduler.h" @@ -81,4 +81,4 @@ class Scheduler Scheduler(); }; } // namespace arm_compute -#endif /* ARM_COMPUTE_SCHEDULER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H diff --git a/cmake/Options.cmake b/cmake/Options.cmake index bc51cbbc0d..e5c8cb8efe 100644 --- a/cmake/Options.cmake +++ b/cmake/Options.cmake @@ -116,4 +116,4 @@ endif() if(ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS) add_definitions(-DARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS) endif() -add_definitions(-D_GLIBCXX_USE_NANOSLEEP) \ No newline at end of file +add_definitions(-D_GLIBCXX_USE_NANOSLEEP) diff --git a/docs/Doxyfile b/docs/Doxyfile index 0b2f32ad1a..0d8654944d 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "Compute Library" # could be handy for archiving the generated documentation or if some version # control system is used. 
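The LookupTable65536 type added to ActivationLayerInfo earlier in this patch backs the LUT-based FP16 sigmoid called out in the changelog: every possible fp16 bit pattern gets a precomputed result, so applying the activation becomes a single table read per element. The snippet below is a rough, self-contained sketch of how such a table could be populated; it is aarch64-only because of float16_t, build_sigmoid_lut() is not a library API, and the library's own table construction appears to go through the new LUTManager helper added in this patch.

```cpp
// Sketch of the LUT idea: precompute sigmoid for all 2^16 fp16 bit patterns.
#include <arm_neon.h> // float16_t (aarch64)
#include <array>
#include <cmath>
#include <cstdint>
#include <cstring>

using LookupTable65536 = std::array<float16_t, 65536>;

LookupTable65536 build_sigmoid_lut() // hypothetical helper, not a library call
{
    LookupTable65536 lut{};
    for (uint32_t i = 0; i < 65536; ++i)
    {
        const uint16_t bits = static_cast<uint16_t>(i);
        float16_t      in;
        std::memcpy(&in, &bits, sizeof(bits)); // reinterpret the bit pattern as fp16
        const float x = static_cast<float>(in); // widen for the math
        lut[i]        = static_cast<float16_t>(1.0f / (1.0f + std::exp(-x)));
    }
    return lut;
}
// A kernel can then map each input element to lut[bit_pattern(element)].
```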
-PROJECT_NUMBER = 23.11 +PROJECT_NUMBER = 24.01 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/user_guide/how_to_build_and_run_examples.dox b/docs/user_guide/how_to_build_and_run_examples.dox index 4da26d31bc..775cb6abbe 100644 --- a/docs/user_guide/how_to_build_and_run_examples.dox +++ b/docs/user_guide/how_to_build_and_run_examples.dox @@ -76,21 +76,21 @@ The examples get automatically built by scons as part of the build process of th To cross compile a Arm® Neon™ example for Linux 32bit: - arm-linux-gnueabihf-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o neon_cnn + arm-linux-gnueabihf-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -o neon_cnn To cross compile a Arm® Neon™ example for Linux 64bit: - aarch64-linux-gnu-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o neon_cnn + aarch64-linux-gnu-g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -o neon_cnn (notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different) To cross compile an OpenCL example for Linux 32bit: - arm-linux-gnueabihf-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL + arm-linux-gnueabihf-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute -o cl_sgemm -DARM_COMPUTE_CL To cross compile an OpenCL example for Linux 64bit: - aarch64-linux-gnu-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL + aarch64-linux-gnu-g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -L. -larm_compute -o cl_sgemm -DARM_COMPUTE_CL (notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different) @@ -98,43 +98,43 @@ To cross compile the examples with the Graph API, such as graph_lenet.cpp, you n i.e. to cross compile the "graph_lenet" example for Linux 32bit: - arm-linux-gnueabihf-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet + arm-linux-gnueabihf-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet i.e. to cross compile the "graph_lenet" example for Linux 64bit: - aarch64-linux-gnu-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet + aarch64-linux-gnu-g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. 
-larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet (notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different) -@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute, arm_compute_core +@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute To compile natively (i.e directly on an Arm device) for Arm® Neon™ for Linux 32bit: - g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -larm_compute_core -o neon_cnn + g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -mfpu=neon -larm_compute -o neon_cnn To compile natively (i.e directly on an Arm device) for Arm® Neon™ for Linux 64bit: - g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o neon_cnn + g++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -o neon_cnn (notice the only difference with the 32 bit command is that we don't need the -mfpu option) To compile natively (i.e directly on an Arm device) for OpenCL for Linux 32bit or Linux 64bit: - g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -larm_compute_core -o cl_sgemm -DARM_COMPUTE_CL + g++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute -o cl_sgemm -DARM_COMPUTE_CL To compile natively the examples with the Graph API, such as graph_lenet.cpp, you need to link the examples against arm_compute_graph.so too. i.e. to natively compile the "graph_lenet" example for Linux 32bit: - g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet + g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -mfpu=neon -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet i.e. to natively compile the "graph_lenet" example for Linux 64bit: - g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet + g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -L. -larm_compute_graph -larm_compute -Wl,--allow-shlib-undefined -o graph_lenet (notice the only difference with the 32 bit command is that we don't need the -mfpu option) -@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute, arm_compute_core +@note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute @note These two commands assume libarm_compute.so is available in your library path, if not add the path to it using -L (e.g. -Llib/linux-armv8a-neon-cl-asserts/) @note You might need to export the path to OpenCL library as well in your LD_LIBRARY_PATH if Compute Library was built with OpenCL enabled. @@ -265,23 +265,23 @@ Once you've got your Android standalone toolchain built and added to your path y To cross compile a Arm® Neon™ example: #32 bit: - arm-linux-androideabi-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. 
-Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_cnn_arm -static-libstdc++ -pie + arm-linux-androideabi-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o neon_cnn_arm -static-libstdc++ -pie #64 bit: - aarch64-linux-android-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o neon_cnn_aarch64 -static-libstdc++ -pie + aarch64-linux-android-clang++ examples/neon_cnn.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o neon_cnn_aarch64 -static-libstdc++ -pie To cross compile an OpenCL example: #32 bit: - arm-linux-androideabi-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_sgemm_arm -static-libstdc++ -pie -DARM_COMPUTE_CL + arm-linux-androideabi-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o cl_sgemm_arm -static-libstdc++ -pie -DARM_COMPUTE_CL #64 bit: - aarch64-linux-android-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -larm_compute_core-static -L. -o cl_sgemm_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL + aarch64-linux-android-clang++ examples/cl_sgemm.cpp utils/Utils.cpp -I. -Iinclude -std=c++14 -larm_compute-static -L. -o cl_sgemm_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL To cross compile the examples with the Graph API, such as graph_lenet.cpp, you need to link the library arm_compute_graph also. #32 bit: - arm-linux-androideabi-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_arm -static-libstdc++ -pie -DARM_COMPUTE_CL + arm-linux-androideabi-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -L. -o graph_lenet_arm -static-libstdc++ -pie -DARM_COMPUTE_CL #64 bit: - aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL + aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++14 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL @note Due to some issues in older versions of the Arm® Mali™ OpenCL DDK (<= r13p0), we recommend to link arm_compute statically on Android. 
@note When linked statically the arm_compute_graph library currently needs the --whole-archive linker flag in order to work properly diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index 11731e5a33..40ad09fd84 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -41,6 +41,18 @@ If there is more than one release in a month then an extra sequential number is @section S2_2_changelog Changelog +v24.01 Public major release + - Remove the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose. + You should link only to the main `libarm_compute` library for core functionality. + - Expand GPUTarget list with Mali™ G720 and G620. + - Optimize CPU activation functions using LUT-based implementation: + - Sigmoid function for FP16. + - New features + - Add support for FP16 in all multi_isa builds. + - Performance optimizations: + - Optimize @ref NESoftmaxLayer + - Optimize @ref NEDepthToSpaceLayer. + v23.11 Public major release - New features - Add support for input data type U64/S64 in CLCast and NECast. @@ -432,8 +444,8 @@ v21.02 Public major release - @ref NEActivationLayer - @ref NEArithmeticAddition - @ref NEBatchNormalizationLayerKernel - - @ref cpu::kernels::CpuLogits1DSoftmaxKernel - - @ref cpu::kernels::CpuLogits1DMaxKernel + - cpu::kernels::CpuLogits1DSoftmaxKernel + - cpu::kernels::CpuLogits1DMaxKernel - @ref cpu::kernels::CpuElementwiseUnaryKernel - Remove padding from OpenCL kernels: - CLDirectConvolutionLayerKernel diff --git a/examples/SConscript b/examples/SConscript index bfac9deb2b..16f31d93d4 100644 --- a/examples/SConscript +++ b/examples/SConscript @@ -38,15 +38,14 @@ utils = examples_env.Object("../utils/Utils.cpp") if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']: Import('arm_compute_graph_a') Import('arm_compute_a') - Import('arm_compute_core_a') - arm_compute_libs = [ arm_compute_a, arm_compute_core_a ] + arm_compute_libs = [ arm_compute_a ] arm_compute_graph_libs = arm_compute_libs # The graph library needs to be linked separately with --whole-archive arm_compute_dependency = arm_compute_a graph_dependency = [arm_compute_graph_a] else: Import('arm_compute_graph_so') Import('arm_compute_so') - arm_compute_libs = ["arm_compute", "arm_compute_core"] + arm_compute_libs = ["arm_compute"] arm_compute_graph_libs = [ "arm_compute_graph" ] + arm_compute_libs arm_compute_dependency = arm_compute_so graph_dependency = [arm_compute_graph_so] diff --git a/examples/graph_ssd_mobilenet.cpp b/examples/graph_ssd_mobilenet.cpp index 5162fe6890..6218d47dd6 100644 --- a/examples/graph_ssd_mobilenet.cpp +++ b/examples/graph_ssd_mobilenet.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/graph.h" #include "support/ToolchainSupport.h" @@ -757,7 +758,8 @@ class GraphSSDMobilenetExample : public Example std::move(conv_16_2_class_pre), std::move(conv_17_2_class_pre)) .set_name("ClassPrediction/concat"); - const QuantizationInfo logistic_out_qinfo = QuantizationInfo(0.00390625f, 0); + const QuantizationInfo logistic_out_qinfo = QuantizationInfo( + 0.00390625f, quantization::get_min_max_values_from_quantized_data_type(common_params.data_type).first); class_pred << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC), logistic_out_qinfo) .set_name("ClassPrediction/logistic"); diff --git a/filelist.json b/filelist.json index 5bca2419d6..7c530f3f33 100644 --- a/filelist.json +++ b/filelist.json @@ -14,6 +14,7 @@ "src/core/Error.cpp", "src/core/GPUTarget.cpp", "src/core/Helpers.cpp", + "src/core/helpers/LUTManager.cpp", "src/core/IAccessWindow.cpp", "src/core/IKernel.cpp", "src/core/ITensor.cpp", @@ -532,7 +533,8 @@ "src/gpu/cl/operators/ClMatMul.cpp", "src/runtime/CL/functions/CLMatMul.cpp", "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp", - "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp" + "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp", + "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp" ] } }, @@ -982,12 +984,15 @@ "fp16": [ "src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp", "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp", - "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp" + "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp", + "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp" + ], "fp32": [ "src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp", "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp", - "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp" + "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp", + "src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp" ] }, "sve": { @@ -1122,27 +1127,20 @@ "src/core/NEON/kernels/convolution/common/qasymm8.cpp", "src/core/NEON/kernels/convolution/common/qsymm8.cpp", "src/core/NEON/kernels/convolution/common/utils.cpp", - "src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp", "src/core/NEON/kernels/convolution/winograd/input_transforms_fp32.cpp", - "src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms_fp32.cpp", - "src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp", "src/core/NEON/kernels/convolution/winograd/weight_transforms_fp32.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_fp16.cpp", "src/core/NEON/kernels/convolution/winograd/winograd_fp32.cpp", - "src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp", "src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp32_6x6.cpp", "src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_1x8.cpp", "src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_4x4.cpp", "src/core/NEON/kernels/convolution/winograd/input_transforms/arm_fp32_6x6.cpp", - "src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x2_1x7.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x4_1x5.cpp", 
"src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_1x6_1x3.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_3x3.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_2x2_5x5.cpp", "src/core/NEON/kernels/convolution/winograd/output_transforms/arm_fp32_4x4_3x3.cpp", - "src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp", "src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_3x3.cpp", "src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_2x2_5x5.cpp", "src/core/NEON/kernels/convolution/winograd/weight_transforms/arm_fp32_4x4_3x3.cpp", @@ -1159,6 +1157,13 @@ ], "fp16": [ "src/cpu/kernels/directconv2d/nchw/fp16.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_fp16.cpp", + "src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp", + "src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp", + "src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp", + "src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp", + "src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp", + "src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp", "src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp" ] }, @@ -1214,7 +1219,9 @@ "files": { "common": [ "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp", - "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp" + "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp", + "src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp", + "src/cpu/kernels/depth_to_space/nchw/any/impl.cpp" ] } }, @@ -1241,7 +1248,6 @@ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp", "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp", "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", @@ -1252,18 +1258,6 @@ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp", - 
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", @@ -1300,7 +1294,22 @@ "src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp", "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp" ], - "fp16":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp"], + "fp16":[ + "src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp" + ], "fp32":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp"], "qasymm8":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp"], "qasymm8_signed":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp"] @@ -1820,6 +1829,11 @@ "qasymm8": ["src/cpu/kernels/lut/generic/neon/u8.cpp"], "qasymm8_signed": ["src/cpu/kernels/lut/generic/neon/u8.cpp"] }, + "sve": { + "fp16": ["src/cpu/kernels/lut/generic/sve/u16.cpp"], + "qasymm16": ["src/cpu/kernels/lut/generic/sve/u16.cpp"], + "qasymm16_signed": ["src/cpu/kernels/lut/generic/sve/u16.cpp"] + }, "sve2": { "qasymm8": ["src/cpu/kernels/lut/generic/sve2/u8.cpp"], "qasymm8_signed": ["src/cpu/kernels/lut/generic/sve2/u8.cpp"] @@ 
-1902,7 +1916,11 @@ "src/cpu/operators/CpuMul.cpp", "src/cpu/kernels/CpuMulKernel.cpp", "src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp" - ] + ], + "neon":{ + "fp16":["src/cpu/kernels/mul/generic/neon/fp16.cpp"], + "fp32":["src/cpu/kernels/mul/generic/neon/fp32.cpp"] + } } }, "Normalize": { @@ -1911,7 +1929,11 @@ "common": [ "src/core/NEON/kernels/NENormalizationLayerKernel.cpp", "src/runtime/NEON/functions/NENormalizationLayer.cpp" - ] + ], + "neon":{ + "fp16":["src/cpu/kernels/norm_layer/generic/neon/fp16.cpp"], + "fp32":["src/cpu/kernels/norm_layer/generic/neon/fp32.cpp"] + } } }, "Pad": { @@ -1943,16 +1965,11 @@ "neon": { "common": [ "src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", @@ -1969,7 +1986,14 @@ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp" ], "nchw": [ "src/cpu/kernels/pool2d/neon/nchw/all.cpp" ], - "fp16": [ "src/cpu/kernels/pool2d/neon/fp16.cpp" ], + "fp16": [ + "src/cpu/kernels/pool2d/neon/fp16.cpp", + "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp" + ], "fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ], "qasymm8":[ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ], "qasymm8_signed":["src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp"] @@ -2198,16 +2222,10 @@ "qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"] }, "sve": { - "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ], - "fp32": ["src/cpu/kernels/softmax/generic/sve/fp32.cpp"], - "fp16": ["src/cpu/kernels/softmax/generic/sve/fp16.cpp"], - "qasymm8": ["src/cpu/kernels/softmax/generic/sve/qasymm8.cpp" ], - "qasymm8_signed": ["src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] + "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ] }, "sve2":{ - "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"], - "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"], - "qasymm8_signed":["src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] + "common" 
:["src/cpu/kernels/softmax/generic/sve2/impl.cpp"] } } }, diff --git a/scripts/arm_compute_library_nn_driver.go b/scripts/arm_compute_library_nn_driver.go index dda77b55df..2aab2d3fe7 100644 --- a/scripts/arm_compute_library_nn_driver.go +++ b/scripts/arm_compute_library_nn_driver.go @@ -46,6 +46,7 @@ func globalFlags(ctx android.BaseContext) []string { if theArch == "armv8-2a" { cppflags = append(cppflags, "-march=armv8.2-a+fp16") cppflags = append(cppflags, "-DARM_COMPUTE_ENABLE_FP16") + cppflags = append(cppflags, "-DENABLE_FP16_KERNELS") } } } @@ -74,9 +75,6 @@ func globalFlags(ctx android.BaseContext) []string { if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "QSYMM16" { cppflags = append(cppflags, "-DENABLE_QSYMM16_KERNELS") } - if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "FP16" { - cppflags = append(cppflags, "-DENABLE_FP16_KERNELS") - } if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "FP32" { cppflags = append(cppflags, "-DENABLE_FP32_KERNELS") } diff --git a/scripts/format_code.py b/scripts/format_code.py index f1ee7a731c..29dbea7f0d 100755 --- a/scripts/format_code.py +++ b/scripts/format_code.py @@ -262,6 +262,9 @@ def run(self): self.shell.prepend_env("PATH","%s/../bin" % this_dir) for f in self.files: + if not self.skip_copyright: + check_copyright(f) + skip_this_file = False for e in exceptions: if e in f: @@ -272,8 +275,6 @@ def run(self): continue logger.info("Formatting %s" % f) - if not self.skip_copyright: - check_copyright(f) check_license("LICENSE") diff --git a/src/BUILD.bazel b/src/BUILD.bazel index a22632e1f5..9d5ae63484 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -117,9 +117,7 @@ filegroup( "cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp", "cpu/kernels/elementwise_unary/generic/sve2/q8.cpp", "cpu/kernels/lut/generic/sve2/u8.cpp", - "cpu/kernels/softmax/generic/sve2/impl.cpp", - "cpu/kernels/softmax/generic/sve2/qasymm8.cpp", - "cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] + + "cpu/kernels/softmax/generic/sve2/impl.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), @@ -337,16 +335,13 @@ filegroup( "cpu/kernels/elementwise_unary/generic/sve/fp32.cpp", "cpu/kernels/elementwise_unary/generic/sve/impl.cpp", "cpu/kernels/elementwise_unary/generic/sve/integer.cpp", + "cpu/kernels/lut/generic/sve/u16.cpp", "cpu/kernels/scale/sve/fp16.cpp", "cpu/kernels/scale/sve/fp32.cpp", "cpu/kernels/scale/sve/integer.cpp", "cpu/kernels/scale/sve/qasymm8.cpp", "cpu/kernels/scale/sve/qasymm8_signed.cpp", - "cpu/kernels/softmax/generic/sve/fp16.cpp", - "cpu/kernels/softmax/generic/sve/fp32.cpp", - "cpu/kernels/softmax/generic/sve/impl.cpp", - "cpu/kernels/softmax/generic/sve/qasymm8.cpp", - "cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] + + "cpu/kernels/softmax/generic/sve/impl.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), @@ -643,6 +638,7 @@ filegroup( "core/Utils.cpp", "core/Validate.cpp", "core/Version.cpp", + "core/helpers/LUTManager.cpp", "core/helpers/SoftmaxHelpers.cpp", "core/helpers/Utils.cpp", "core/helpers/WindowHelpers.cpp", @@ -739,6 +735,8 @@ filegroup( "cpu/kernels/crop/generic/neon/fp16.cpp", "cpu/kernels/crop/generic/neon/fp32.cpp", "cpu/kernels/crop/generic/neon/integer.cpp", + "cpu/kernels/depth_to_space/nchw/any/impl.cpp", + "cpu/kernels/depth_to_space/nhwc/any/impl.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp", "cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp", @@ -766,6 +764,8 @@ filegroup( 
"cpu/kernels/fuse_batch_normalization/generic/fp16.cpp", "cpu/kernels/fuse_batch_normalization/generic/fp32.cpp", "cpu/kernels/fuse_batch_normalization/nchw/all.cpp", + "cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp", + "cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp", "cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp", "cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp", "cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp", @@ -794,6 +794,10 @@ filegroup( "cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp", "cpu/kernels/meanstddevnorm/generic/neon/impl.cpp", "cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp", + "cpu/kernels/mul/generic/neon/fp16.cpp", + "cpu/kernels/mul/generic/neon/fp32.cpp", + "cpu/kernels/norm_layer/generic/neon/fp16.cpp", + "cpu/kernels/norm_layer/generic/neon/fp32.cpp", "cpu/kernels/pool2d/neon/fp16.cpp", "cpu/kernels/pool2d/neon/fp32.cpp", "cpu/kernels/pool2d/neon/nchw/all.cpp", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 37599cdadd..be7a6ef188 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -312,16 +312,13 @@ target_sources( cpu/kernels/elementwise_unary/generic/sve/fp32.cpp cpu/kernels/elementwise_unary/generic/sve/impl.cpp cpu/kernels/elementwise_unary/generic/sve/integer.cpp + cpu/kernels/lut/generic/sve/u16.cpp cpu/kernels/scale/sve/fp16.cpp cpu/kernels/scale/sve/fp32.cpp cpu/kernels/scale/sve/integer.cpp cpu/kernels/scale/sve/qasymm8.cpp cpu/kernels/scale/sve/qasymm8_signed.cpp - cpu/kernels/softmax/generic/sve/fp16.cpp - cpu/kernels/softmax/generic/sve/fp32.cpp cpu/kernels/softmax/generic/sve/impl.cpp - cpu/kernels/softmax/generic/sve/qasymm8.cpp - cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp ) target_sources( @@ -339,8 +336,6 @@ target_sources( cpu/kernels/elementwise_unary/generic/sve2/q8.cpp cpu/kernels/lut/generic/sve2/u8.cpp cpu/kernels/softmax/generic/sve2/impl.cpp - cpu/kernels/softmax/generic/sve2/qasymm8.cpp - cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp ) target_sources( @@ -634,6 +629,7 @@ target_sources( core/Utils.cpp core/Validate.cpp core/Version.cpp + core/helpers/LUTManager.cpp core/helpers/SoftmaxHelpers.cpp core/helpers/Utils.cpp core/helpers/WindowHelpers.cpp @@ -730,6 +726,8 @@ target_sources( cpu/kernels/crop/generic/neon/fp16.cpp cpu/kernels/crop/generic/neon/fp32.cpp cpu/kernels/crop/generic/neon/integer.cpp + cpu/kernels/depth_to_space/nchw/any/impl.cpp + cpu/kernels/depth_to_space/nhwc/any/impl.cpp cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp @@ -757,6 +755,8 @@ target_sources( cpu/kernels/fuse_batch_normalization/generic/fp16.cpp cpu/kernels/fuse_batch_normalization/generic/fp32.cpp cpu/kernels/fuse_batch_normalization/nchw/all.cpp + cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp + cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp @@ -785,6 +785,10 @@ target_sources( cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp cpu/kernels/meanstddevnorm/generic/neon/impl.cpp cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp + cpu/kernels/mul/generic/neon/fp16.cpp + cpu/kernels/mul/generic/neon/fp32.cpp + cpu/kernels/norm_layer/generic/neon/fp16.cpp + cpu/kernels/norm_layer/generic/neon/fp32.cpp cpu/kernels/pool2d/neon/fp16.cpp cpu/kernels/pool2d/neon/fp32.cpp 
cpu/kernels/pool2d/neon/nchw/all.cpp diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp index 05b351fc25..0e078d8416 100644 --- a/src/core/CL/CLMutableCommandBuffer.cpp +++ b/src/core/CL/CLMutableCommandBuffer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" +#include "src/common/utils/Log.h" #include "src/core/CL/CLUtils.h" namespace arm_compute @@ -48,7 +49,11 @@ CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLComma CLMutableCommandBuffer::~CLMutableCommandBuffer() { const auto status = clReleaseCommandBufferKHR(_cb); - handle_cl_error("clReleaseCommandBufferKHR", status); + if (status != CL_SUCCESS) + { + const std::string error_message = "clReleaseCommandBufferKHR - Error code: " + std::to_string(status); + ARM_COMPUTE_LOG_ERROR_ACL(error_message); + } } void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index 35421d025e..07baa5e7fb 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -132,6 +132,10 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u func_name##_ptr = reinterpret_cast(dlsym(handle, #func_name)); #endif /* __ANDROID__ */ +#define LOAD_EXTENSION_FUNCTION_PTR(func_name, platform_id) \ + func_name##_ptr = \ + reinterpret_cast(clGetExtensionFunctionAddressForPlatform(platform_id, #func_name)); + LOAD_FUNCTION_PTR(clCreateContext, handle); LOAD_FUNCTION_PTR(clCreateContextFromType, handle); LOAD_FUNCTION_PTR(clCreateCommandQueue, handle); @@ -181,8 +185,27 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u LOAD_FUNCTION_PTR(clWaitForEvents, handle); LOAD_FUNCTION_PTR(clCreateImage, handle); LOAD_FUNCTION_PTR(clSetKernelExecInfo, handle); + LOAD_FUNCTION_PTR(clGetExtensionFunctionAddressForPlatform, handle); + + // Load Extensions + + // Number of platforms is assumed to be 1. For this to be greater than 1, + // the system must have more than one OpenCL implementation provided by + // different vendors. This is not our use case. Besides, the library + // already assumes one implementation as it uses one handle to load core + // functions. + constexpr unsigned int num_platforms = 1U; + std::vector platform_ids(num_platforms); + clGetPlatformIDs(num_platforms, platform_ids.data(), nullptr); // Command buffer and mutable dispatch command buffer extensions + /// TODO: (COMPMID-6742) Load Command Buffer extensions in a Portable way + /// using clGetExtensionFunctionAddressForPlatform(). + /// The details can be found here: + /// https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Ext.html#getting-opencl-api-extension-function-pointers + /// + /// @note: There are some problems reported while loading these extensions in the recommended way. + /// For details, please see COMPUTE-16545 LOAD_FUNCTION_PTR(clCreateCommandBufferKHR, handle); LOAD_FUNCTION_PTR(clRetainCommandBufferKHR, handle); LOAD_FUNCTION_PTR(clReleaseCommandBufferKHR, handle); @@ -193,9 +216,10 @@ bool CLSymbols::load(const std::vector &libraries_filenames, bool u LOAD_FUNCTION_PTR(clUpdateMutableCommandsKHR, handle); // Third-party extensions - LOAD_FUNCTION_PTR(clImportMemoryARM, handle); + LOAD_EXTENSION_FUNCTION_PTR(clImportMemoryARM, platform_ids[0]); #undef LOAD_FUNCTION_PTR +#undef LOAD_EXTENSION_FUNCTION_PTR //Don't call dlclose(handle) or all the symbols will be unloaded ! 
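The OpenCL.cpp hunk above stops resolving third-party extension symbols (here clImportMemoryARM) through dlsym() and instead queries a single platform ID and asks it for the entry point via clGetExtensionFunctionAddressForPlatform(), as the new LOAD_EXTENSION_FUNCTION_PTR macro does. As a rough, standalone illustration of that pattern (not part of this patch; the single-platform assumption mirrors the comment in the hunk, and error handling is minimal):

```cpp
// Minimal sketch: resolve an OpenCL extension entry point through the platform,
// the way LOAD_EXTENSION_FUNCTION_PTR does, instead of dlsym() on the library handle.
#include <CL/cl.h>
#include <cstdio>

int main()
{
    cl_platform_id platform = nullptr;
    if (clGetPlatformIDs(1, &platform, nullptr) != CL_SUCCESS || platform == nullptr)
    {
        std::puts("No OpenCL platform found");
        return 1;
    }

    // The spec-mandated way to obtain extension entry points: ask the platform,
    // not the loader, so the ICD can hand back the right vendor implementation.
    void *fn = clGetExtensionFunctionAddressForPlatform(platform, "clImportMemoryARM");
    std::printf("clImportMemoryARM is %s on this platform\n",
                fn != nullptr ? "exposed" : "not exposed");

    // In real code the returned pointer would then be cast to the extension's
    // function-pointer type before being called, as the macro above does.
    return 0;
}
```

A null return simply means the platform does not expose the extension, which is why the patch keeps the pointer check at call sites rather than treating a missing symbol as a load failure.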
@@ -1063,6 +1087,19 @@ clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t par } } +void *clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, const char *funcname) +{ + arm_compute::CLSymbols::get().load_default(); + const auto func = arm_compute::CLSymbols::get().clGetExtensionFunctionAddressForPlatform_ptr; + + if (func != nullptr) + { + return func(platform, funcname); + } + + return nullptr; +} + cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint num_queues, const cl_command_queue *queues, const cl_command_buffer_properties_khr *properties, diff --git a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl index 8919023d4c..09b8956b68 100644 --- a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl +++ b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -117,9 +117,23 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul( uint rhs_y = block_id; // Compute LHS/RHS/DST matrix address +#ifdef REINTERPRET_INPUT_AS_3D + lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y; +#else // REINTERPRET_INPUT_AS_3D lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z; +#endif // REINTERPRET_INPUT_AS_3D + +#ifdef BATCHED_RHS rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z; +#else // BATCHED_RHS + rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y; +#endif // BATCHED_RHS + +#ifdef REINTERPRET_OUTPUT_AS_3D + dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y; +#else // REINTERPRET_OUTPUT_AS_3D dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z; +#endif // REINTERPRET_OUTPUT_AS_3D // Note: If RHS derives from the weights of convolution 2d layer, RHS will always be 2D and rhs_stride_z will always be equal to 0 for // not sliding the tensor @@ -367,11 +381,25 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture( // Starting RHS coordinates uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0; + +#ifdef BATCHED_RHS uint rhs_y = block_id + z * rhs_h; +#else // BATCHED_RHS + uint rhs_y = block_id; +#endif // BATCHED_RHS // Compute LHS/RHS/DST matrix address +#ifdef REINTERPRET_INPUT_AS_3D + lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y; +#else // REINTERPRET_INPUT_AS_3D lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z; +#endif // REINTERPRET_INPUT_AS_3D + +#ifdef REINTERPRET_OUTPUT_AS_3D + dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y; +#else // REINTERPRET_OUTPUT_AS_3D dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z; +#endif // REINTERPRET_OUTPUT_AS_3D // Initialize the accumulators // MMUL extension accumulate the result in F32 for both F32 and F16 @@ -525,4 +553,4 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture( #undef RHS_OFFSET_X #undef RHS_STEP_X } -#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE) \ No newline at end of file +#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE) diff --git 
a/src/core/CL/cl_kernels/common/generate_proposals.cl b/src/core/CL/cl_kernels/common/generate_proposals.cl index 5b8502072a..bfe1922ac2 100644 --- a/src/core/CL/cl_kernels/common/generate_proposals.cl +++ b/src/core/CL/cl_kernels/common/generate_proposals.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,18 +59,16 @@ __kernel void generate_proposals_compute_all_anchors( Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors); Vector rois = CONVERT_TO_VECTOR_STRUCT(rois); - const size_t idx = get_global_id(0); + const unsigned int idx = get_global_id(0); // Find the index of the anchor - const size_t anchor_idx = idx % NUM_ANCHORS; + const unsigned int anchor_idx = idx % NUM_ANCHORS; // Find which shift is this thread using - const size_t shift_idx = idx / NUM_ANCHORS; + const unsigned int shift_idx = idx / NUM_ANCHORS; // Compute the shift on the X and Y direction (the shift depends exclusively by the index thread id) - const DATA_TYPE - shift_x = (DATA_TYPE)(shift_idx % WIDTH) * STRIDE; - const DATA_TYPE - shift_y = (DATA_TYPE)(shift_idx / WIDTH) * STRIDE; + const float shift_x = (float)(shift_idx % WIDTH) * STRIDE; + const float shift_y = (float)(shift_idx / WIDTH) * STRIDE; const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS) shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y); diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp index 2d1a13cb33..5904e1a06f 100644 --- a/src/core/GPUTarget.cpp +++ b/src/core/GPUTarget.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,23 @@ namespace { + +arm_compute::GPUTarget get_fifth_gen_target(const std::string &version) +{ + if (version.find("G720") != std::string::npos) + { + return arm_compute::GPUTarget::G720; + } + else if (version.find("G620") != std::string::npos) + { + return arm_compute::GPUTarget::G620; + } + else + { + return arm_compute::GPUTarget::UNKNOWN; + } +} + arm_compute::GPUTarget get_valhall_target(const std::string &version) { if (version.find("G77") != std::string::npos) @@ -152,16 +169,18 @@ namespace arm_compute const std::string &string_from_target(GPUTarget target) { static std::map gpu_target_map = { - {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"}, - {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"}, - {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"}, - {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"}, - {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, "g52lit"}, - {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"}, - {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"}, - {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"}, - {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"}, - }; + {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"}, + {GPUTarget::FIFTHGEN, "fifthgen"}, + + {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"}, + {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"}, + {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"}, + {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, 
"g52lit"}, + {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"}, + {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"}, + {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"}, + {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"}, {GPUTarget::G720, "g720"}, + {GPUTarget::G620, "g620"}}; return gpu_target_map[target]; } @@ -188,8 +207,13 @@ GPUTarget get_target_from_name(const std::string &device_name) GPUTarget gpu_target; if (target == 'G' || is_future_gpu) { - // Check for Valhall or Bifrost - gpu_target = get_valhall_target(version); + // Check for Valhall, Bifrost or 5-th Gen + gpu_target = get_fifth_gen_target(version); + if (gpu_target == GPUTarget::UNKNOWN) + { + gpu_target = get_valhall_target(version); + } + if (gpu_target == GPUTarget::UNKNOWN) { gpu_target = get_bifrost_target(version); diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index deb89996a9..717fd11485 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -151,128 +151,15 @@ Status validate_arguments(const ITensorInfo *input, } } //namespace -template -void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_to_use = window; - win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(_input, win_to_use); - Iterator output(_output, win_to_use); - - F activation_functor(_act_info); - - // Hold information about the current feature map we are iterating. - // Only compute denominator and constants once per feature map. - int slice = -1; - - const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = - (_gamma != nullptr) ? reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = - (_beta != nullptr) ? 
reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; - - T mean = static_cast(0); - T var = static_cast(0); - T gamma = static_cast(1); - T beta = static_cast(0); - T denominator = static_cast(0); - - auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - auto var_vec = wrapper::vdup_n(var, ExactTagType{}); - auto gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - auto beta_vec = wrapper::vdup_n(beta, ExactTagType{}); - auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{}); - const auto epsilon_vec = wrapper::vdup_n(static_cast(_epsilon), ExactTagType{}); - execute_window_loop( - win_to_use, - [&](const Coordinates &id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - if (slice != id.z()) - { - mean = input_mean[id.z()]; - var = input_var[id.z()]; - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - if (input_gamma != nullptr) - { - gamma = input_gamma[id.z()]; - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - } - if (input_beta != nullptr) - { - beta = input_beta[id.z()]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); - } - - // Calculate denominator - denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - denominator = wrapper::vgetlane(denominator_vec, 0); - slice = id.z(); - } - - // Perform core calculations using vector operations - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator_vec); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if (fused_activation) - { - activation_functor(res); - } - - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const T numerator = input_ptr[x] - mean; - const T x_bar = numerator * denominator; - T res = beta + x_bar * gamma; - - // Perform fused activation - if (fused_activation) - { - activation_functor(res); - } - - // Store results - *(output_ptr + x) = res; - } - }, - input, output); -} - void NEBatchNormalizationLayerKernel::configure_non_fused() { switch (_input->info()->data_type()) { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw>; + _func = REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused); break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw>; + _func = REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused); break; default: ARM_COMPUTE_ERROR("Element size not supported"); @@ -285,29 +172,26 @@ void NEBatchNormalizationLayerKernel::configure_fused() // NCHW Fused Batched Normalization with activation functions : FP32 static std::map bn_fused_map_f32_nchw = { {ActivationLayerInfo::ActivationFunction::RELU, - &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_relu)}, {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_brelu)}, {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - 
&NEBatchNormalizationLayerKernel::batch_normalization_nchw>}}; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + REGISTER_FP32_NEON(cpu::fp32_batch_normalization_nchw_non_fused_lubrelu)}}; + // NCHW Fused Batched Normalization with activation functions : FP16 static std::map bn_fused_map_f16_nchw = { {ActivationLayerInfo::ActivationFunction::RELU, - &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_relu)}, {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_brelu)}, {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}}; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + REGISTER_FP16_NEON(cpu::fp16_batch_normalization_nchw_non_fused_lubrelu)}}; switch (_input->info()->data_type()) { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: _func = bn_fused_map_f16_nchw[_act_info.activation()]; break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: _func = bn_fused_map_f32_nchw[_act_info.activation()]; break; @@ -409,7 +293,7 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW; if (is_nchw) { - (this->*_func)(window); + (*_func)(window, _input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info); } else { diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 2e8ff0dc9a..679ade0fae 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H -#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" @@ -110,31 +110,19 @@ class NEBatchNormalizationLayerKernel : public INEKernel /** Configure execution function in case of fused activation **/ void configure_fused(); - /** Template function to run batch normalization on fp32 - * - * @tparam T Specialization data type - * @tparam fused_activation Boolean that flags if its a fused activation or not - * @tparam F Activation function functor to run - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void batch_normalization_nchw(const Window &window); - /** Template function to run batch normalization on fp32 on tensors with NHWC format - * - * @tparam T Specialization data type - * @tparam fused_activation Boolean that flags if its a fused activation or not - * @tparam F Activation function functor to run - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void batch_normalization_nhwc(const Window &window); /** Common signature for all the batch normalization functions * * @param[in] window Region on which to execute the kernel. 
*/ - using BatchNormFunctionPtr = void (NEBatchNormalizationLayerKernel::*)(const Window &window); + using BatchNormFunctionPtr = void (*)(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info); private: BatchNormFunctionPtr _func; @@ -148,4 +136,4 @@ class NEBatchNormalizationLayerKernel : public INEKernel ActivationLayerInfo _act_info; }; } // namespace arm_compute -#endif /*ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NEBATCHNORMALIZATIONLAYERKERNEL_H diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp index de0079ee60..e0eb5cf202 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" +#include "arm_compute/core/CoreTypes.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" @@ -31,13 +32,10 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/depth_to_space/list.h" -#include #include -using namespace arm_compute::misc::shape_calculator; - namespace arm_compute { namespace @@ -70,15 +68,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } // namespace NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN) + : _input(nullptr), + _output(nullptr), + _block_shape(), + _data_layout(DataLayout::UNKNOWN), + _split_dimension(Window::DimY) { } void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = - compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = misc::shape_calculator::compute_depth_to_space_shape( + input->info()->tensor_shape(), input->info()->data_layout(), block_shape); // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); @@ -90,9 +92,31 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, _block_shape = block_shape; _data_layout = input->info()->data_layout(); + constexpr size_t dim_b = 3; + const auto dim_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const auto dim_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const auto dim_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_ERROR_ON(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES) != dim_b); + // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); + Steps steps; + steps.set(dim_h, block_shape); + steps.set(dim_w, block_shape); + steps.set(dim_c, output->info()->dimension(dim_c)); + + Window win = calculate_max_window(*output->info(), steps); ICPPKernel::configure(win); + + const auto num_batches = 
input->info()->tensor_shape().total_size_upper(dim_b); + if (num_batches > 1) + { + _split_dimension = dim_b; + } + else + { + _split_dimension = dim_h; + } } Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) @@ -102,68 +126,80 @@ Status NEDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITens return Status{}; } +size_t NEDepthToSpaceLayerKernel::get_split_dimension() const +{ + return _split_dimension; +} + void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int depth_size = _input->info()->dimension(idx_channel); - const int r = (depth_size / (_block_shape * _block_shape)); - const int element_size = _input->info()->element_size(); + const auto *input_info = _input->info(); + const auto *output_info = _output->info(); + + const auto element_size = input_info->element_size(); + const auto &input_strides = input_info->strides_in_bytes(); + const auto &output_strides = output_info->strides_in_bytes(); + + const auto &input_shape = input_info->tensor_shape(); - Window slice_out = window.first_slice_window_3D(); + const uintptr_t k_input_strides[] = {input_strides[0], input_strides[1], input_strides[2], input_strides[3]}; + const uintptr_t k_output_strides[] = {output_strides[0], output_strides[1], output_strides[2], output_strides[3]}; - // The slice_out slice does not move - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + const uint8_t *k_input_ptr = _input->buffer(); + uint8_t *k_output_ptr = // + _output->buffer() + // + window[3].start() * output_strides[3] + // + window[2].start() * output_strides[2] + // + window[1].start() * output_strides[1] + // + window[0].start() * output_strides[0]; - // Main loop for NCHW and NHWC if (_data_layout == DataLayout::NCHW) { - Window slice_in = window.first_slice_window_2D(); - do - { - Iterator in(_input, slice_in); - execute_window_loop( - slice_in, - [&](const Coordinates &id) - { - const int x = id.x(); - const int y = id.y(); - - const int z = id.z() % r; - const int out_x = x * _block_shape + (id.z() / r) % _block_shape; - const int out_y = y * _block_shape + (id.z() / r) / _block_shape; - Coordinates output_coords{out_x, out_y, z, id[3]}; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } while (window.slide_window_slice_2D(slice_in)); + ARM_COMPUTE_ERROR_ON_MSG(window[2].start() != 0 || window[2].end() != window[2].step(), + "The window cannot be splitted in channel dimension"); + + const uintptr_t k_input_shape[] = { + window.num_iterations(0), // + window.num_iterations(1), // + input_shape[2], // The window cannot be splitted in channel dimension. 
+ window.num_iterations(3) // + }; + + k_input_ptr += window[3].start() * input_strides[3] + // + window[2].start() * _block_shape * _block_shape * input_strides[2] + // + (window[1].start() / _block_shape) * input_strides[1] + // + (window[0].start() / _block_shape) * input_strides[0]; + + cpu::depth_to_space_nchw_any( // + k_input_ptr, k_output_ptr, // + k_input_shape, k_input_strides, k_output_strides, // + element_size, _block_shape); } else { - Window slice_in = window.first_slice_window_3D(); - do - { - Iterator in(_input, slice_in); - execute_window_loop( - slice_in, - [&](const Coordinates &id) - { - const int x = id.y(); - const int y = id.z(); - - const int z = id.x() % r; - const int out_x = x * _block_shape + (id.x() / r) % _block_shape; - const int out_y = y * _block_shape + (id.x() / r) / _block_shape; - Coordinates output_coords{z, out_x, out_y, id[3]}; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } while (window.slide_window_slice_3D(slice_in)); + ARM_COMPUTE_ERROR_ON_MSG(window[0].start() != 0 || window[0].end() != window[0].step(), + "The window cannot be splitted in channel dimension"); + + const uintptr_t k_input_shape[] = { + input_shape[0], // The window cannot be splitted in channel dimension. + window.num_iterations(1), // + window.num_iterations(2), // + window.num_iterations(3) // + }; + + k_input_ptr += window[3].start() * input_strides[3] + // + (window[2].start() / _block_shape) * input_strides[2] + // + (window[1].start() / _block_shape) * input_strides[1] + // + window[0].start() * _block_shape * _block_shape * input_strides[0]; + + cpu::depth_to_space_nhwc_any( // + k_input_ptr, k_output_ptr, // + k_input_shape, k_input_strides, k_output_strides, // + element_size, _block_shape); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h index 7e18dd88b8..ca431ec5fe 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H -#define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H #include "src/core/NEON/INEKernel.h" @@ -68,14 +68,18 @@ class NEDepthToSpaceLayerKernel : public INEKernel */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + /** Get the dimension the scheduler should use to split. */ + size_t get_split_dimension() const; + // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: - const ITensor *_input; /**< Source tensor */ - ITensor *_output; /**< Destination tensor */ - int32_t _block_shape; /**< Block shape */ - DataLayout _data_layout; /**< Data layout of the operation */ + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ + int32_t _block_shape; /**< Block shape */ + DataLayout _data_layout; /**< Data layout of the operation */ + size_t _split_dimension; /**< The dimension the scheduler should use to split the workload. 
*/ }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NEDEPTHTOSPACELAYERKERNEL_H diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index 2c61bda147..8399c6c49d 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/NormalizationHelpers.h" @@ -37,6 +38,8 @@ #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/norm_layer/generic/neon/impl.h" +#include "src/cpu/kernels/norm_layer/generic/neon/list.h" namespace arm_compute { @@ -91,7 +94,6 @@ void NENormalizationLayerKernel::configure(const ITensor *input, _input_squared = input_squared; _output = output; _norm_info = norm_info; - switch (_input->info()->data_type()) { case DataType::F32: @@ -102,33 +104,33 @@ void NENormalizationLayerKernel::configure(const ITensor *input, { if (norm_info.type() == NormType::IN_MAP_2D) { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_0_2D); } else { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_0); } break; } case 1: if (norm_info.type() == NormType::IN_MAP_2D) { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_1_2D); } else { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_1); } break; case 2: - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP32_NEON(cpu::neon_normalize_float32_4_2); break; default: break; } break; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ARM_COMPUTE_ENABLE_FP16 case DataType::F16: { switch (norm_idx) @@ -137,33 +139,33 @@ void NENormalizationLayerKernel::configure(const ITensor *input, { if (norm_info.type() == NormType::IN_MAP_2D) { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_0_2D); } else { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_0); } break; } case 1: if (norm_info.type() == NormType::IN_MAP_2D) { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_1_2D); } else { - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_1); } break; case 2: - _func = &NENormalizationLayerKernel::normalize_float; + _func = REGISTER_FP16_NEON(cpu::neon_normalize_float16_8_2); break; default: break; } break; } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* ARM_COMPUTE_ENABLE_FP16 */ default: ARM_COMPUTE_ERROR("NOT SUPPORTED!"); } @@ -173,124 +175,6 @@ void NENormalizationLayerKernel::configure(const ITensor *input, INEKernel::configure(win); } -template -void NENormalizationLayerKernel::normalize_float(const Window &window) -{ - /** SIMD vector tag 
type. */ - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = S; - - Iterator input(_input, win); - Iterator input_squared(_input_squared, win); - Iterator output(_output, win); - - const int dim_y = _input->info()->data_layout() == DataLayout::NCHW ? 1 : 2; - const int radius = _norm_info.norm_size() / 2; - const int input_squared_stride_x = _input_squared->info()->strides_in_bytes()[0]; - const int input_squared_stride_slice = _input_squared->info()->strides_in_bytes()[dim]; - const int input_squared_stride_row = _input_squared->info()->strides_in_bytes()[dim_y]; - - const int max_right = _input->info()->dimension(dim) - 1; - const int max_bottom = _input->info()->dimension(dim_y) - 1; - - const auto coeff_vec = wrapper::vdup_n(static_cast(_norm_info.scale_coeff()), ExactTagType{}); - const auto beta_vec = wrapper::vdup_n(static_cast(_norm_info.beta()), ExactTagType{}); - const auto kappa_vec = wrapper::vdup_n(static_cast(_norm_info.kappa()), ExactTagType{}); - - auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row, - const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr, - T *output_ptr) - { - const int current_slice = dim == 0 ? x : id[dim]; - const int first_slice = std::max(current_slice - radius, 0); - const int last_slice = std::min(current_slice + radius, max_right); - - const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x; - // Accumulate 2D In-Map values - auto accu = static_cast(0.f); - for (int j = first_row; j <= last_row; ++j) - { - // Compute row displacement - const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for (int i = first_slice; i <= last_slice; ++i) - { - accu += - *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); - } - } - - // Normalize - const auto normalized = std::pow( - accu * static_cast(_norm_info.scale_coeff()) + static_cast(_norm_info.kappa()), _norm_info.beta()); - const auto normalized_pixel = (*(input_ptr + x)) / normalized; - *(output_ptr + x) = normalized_pixel; - }; - - execute_window_loop( - win, - [&](const Coordinates &id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - - // Get range to normalize - const int current_row = do_2D_norm ? id[dim_y] : 0; - const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; - const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; - - int x = window_start_x; - // Compute serially starting elements for the case x dimension is width - for (; x < radius && x < window_end_x && dim == 0; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), - output_ptr); - } - - // Compute vectorized - for (; x <= window_end_x - window_step_x - radius; x += window_step_x) - { - const int current_slice = dim == 0 ? 
x : id[dim]; - const int first_slice = std::max(current_slice - radius, 0); - const int last_slice = std::min(current_slice + radius, max_right); - - const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; - // Accumulate 2D In-Map values - auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - for (int j = first_row; j <= last_row; ++j) - { - // Compute row displacement - const uint8_t *const input_squared_ptr = - input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for (int i = first_slice; i <= last_slice; ++i) - { - accu = wrapper::vadd( - accu, wrapper::vloadq(reinterpret_cast( - input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); - } - } - - // Normalize - const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); - const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); - wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), - output_ptr); - } - }, - input, input_squared, output); -} - Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, @@ -309,6 +193,6 @@ void NENormalizationLayerKernel::run(const Window &window, const ThreadInfo &inf ARM_COMPUTE_ERROR_ON(_func == nullptr); // Run function - (this->*_func)(window); + (*_func)(window, _input, _input_squared, _output, _norm_info); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h index 2d8d9f3d60..5ba4c3edca 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H -#define ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H +#define ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H #include "src/core/NEON/INEKernel.h" @@ -82,24 +82,12 @@ class NENormalizationLayerKernel : public INEKernel void run(const Window &window, const ThreadInfo &info) override; private: - /** Function to perform normalization depending on the given template - * dimension. The second template parameter specifies whether the - * normalization has to be 1D or 2D. - * - * @note Only supported normalizations are: - * - 1D over X or Z - * - 2D over X and Y - * - * @param[in] window Region on which to execute the kernel. - */ - template - void normalize_float(const Window &window); - /** Common signature for all the specialised normalization functions * * @param[in] window Region on which to execute the kernel. 
*/ - using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window); + using NormalizationFunction = void (*)( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo); private: NormalizationFunction _func; @@ -109,4 +97,4 @@ class NENormalizationLayerKernel : public INEKernel NormalizationLayerInfo _norm_info; }; } // namespace arm_compute -#endif /*ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_NENORMALIZATIONLAYERKERNEL_H diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp index 6c2c987eb7..f5bea3e163 100644 --- a/src/core/NEON/kernels/NEReorderKernel.cpp +++ b/src/core/NEON/kernels/NEReorderKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Arm Limited. + * Copyright (c) 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -54,17 +54,41 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info) { case WeightFormat::OHWIo4: { - arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>( - reinterpret_cast(_output->buffer()) + jump_rows, - reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + switch (_output->info()->data_type()) + { + case DataType::F32: + arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + break; + case DataType::BFLOAT16: + arm_gemm::Transform<4, 4, true, arm_gemm::VLType::None>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type!"); + } break; } #if defined(ARM_COMPUTE_ENABLE_SVE) case WeightFormat::OHWIo8: { - arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>( - reinterpret_cast(_output->buffer()) + jump_rows, - reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + switch (_output->info()->data_type()) + { + case DataType::F32: + arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + break; + case DataType::BFLOAT16: + arm_gemm::Transform<2, 4, true, arm_gemm::VLType::SVE>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type!"); + } break; } #endif /* ARM_COMPUTE_ENABLE_SVE */ @@ -175,7 +199,8 @@ Status NEReorderKernel::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); if (output->tensor_shape().total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != DataType::F32 && output->data_type() != DataType::BFLOAT16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); // Only input WeightFormat OHWI supported ARM_COMPUTE_RETURN_ERROR_ON(input_wf != arm_compute::WeightFormat::OHWI); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp index 72b0fac96a..5c08e6137d 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp @@ -206,30 +206,6 @@ GemmImplementation::with_estimate( [](const GemmArgs &args) { 
return GemmInterleavedFixedFormat::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } ), -GemmImplementation::with_estimate( - GemmMethod::GEMM_INTERLEAVED, - "a64_ffinterleaved_bf16fp32_mmla_8x12", - KernelWeightFormat::VL256_BL64, - [](const GemmArgs &args) { return args._ci->has_bf16(); }, - [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); }, - [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } -), -GemmImplementation::with_estimate( - GemmMethod::GEMM_INTERLEAVED, - "a64_ffhybrid_bf16fp32_mmla_6x16", - KernelWeightFormat::VL256_BL64, - [](const GemmArgs &args) { return args._ci->has_bf16(); }, - [](const GemmArgs &args) { return GemmHybridIndirectFixedFormat::estimate_cycles(args); }, - [](const GemmArgs &args) { return new GemmHybridIndirectFixedFormat(args); } -), -GemmImplementation::with_estimate( - GemmMethod::GEMM_INTERLEAVED, - "a64_ffinterleaved_bf16fp32_dot_8x12", - KernelWeightFormat::VL128_BL32, - [](const GemmArgs &args) { return args._ci->has_bf16(); }, - [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); }, - [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } -), #endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp new file mode 100644 index 0000000000..98200c50c5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_2x4_fp32bf16.hpp @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#if defined(__aarch64__) + +namespace { + +void a64_transpose_interleave_4_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height) +{ + float *pad_row = reinterpret_cast(alloca(width * sizeof(float))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(float)); + } + + size_t out_stride = 4 * roundup(height, 4) * sizeof(bfloat16); + + __asm__ __volatile__( + "cmp %x[height], #0x8\n" + "blt 8f\n" + "1:" // Main row loop: Head + "mov x9, %x[in]\n" + "mov x28, %x[width]\n" + "mov x27, %x[out]\n" + "sub %x[height], %x[height], #0x8\n" + "add x26, x9, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "cmp x28, #0x8\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ldr q19, [x9], #0x10\n" + "ldr q18, [x26], #0x10\n" + "sub x28, x28, #0x8\n" + "ldr q17, [x25], #0x10\n" + "ldr q16, [x24], #0x10\n" + "cmp x28, #0x8\n" + "ldr q1, [x23], #0x10\n" + "ldr q0, [x22], #0x10\n" + "ldr q31, [x21], #0x10\n" + "ldr q24, [x20], #0x10\n" + "ldr q23, [x9], #0x10\n" + "ldr q22, [x26], #0x10\n" + "zip1 v30.4s, v19.4s, v17.4s\n" + "zip1 v29.4s, v18.4s, v16.4s\n" + "ldr q21, [x25], #0x10\n" + "ldr q20, [x24], #0x10\n" + "zip2 v28.4s, v19.4s, v17.4s\n" + "zip2 v27.4s, v18.4s, v16.4s\n" + "ldr q19, [x23], #0x10\n" + "ldr q18, [x22], #0x10\n" + "zip1 v26.4s, v1.4s, v31.4s\n" + "zip1 v25.4s, v0.4s, v24.4s\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "zip2 v8.4s, v1.4s, v31.4s\n" + "zip2 v24.4s, v0.4s, v24.4s\n" + "zip1 v7.4s, v23.4s, v21.4s\n" + "zip1 v6.4s, v22.4s, v20.4s\n" + "zip2 v5.4s, v23.4s, v21.4s\n" + "zip2 v4.4s, v22.4s, v20.4s\n" + "zip1 v3.4s, v19.4s, v17.4s\n" + "zip1 v2.4s, v18.4s, v16.4s\n" + "zip2 v1.4s, v19.4s, v17.4s\n" + "zip2 v0.4s, v18.4s, v16.4s\n" + "zip1 v23.4s, v30.4s, v29.4s\n" + "zip1 v22.4s, v28.4s, v27.4s\n" + "zip1 v21.4s, v26.4s, v25.4s\n" + "zip1 v20.4s, v8.4s, v24.4s\n" + "zip1 v19.4s, v7.4s, v6.4s\n" + "zip1 v18.4s, v5.4s, v4.4s\n" + "zip1 v17.4s, v3.4s, v2.4s\n" + "zip1 v16.4s, v1.4s, v0.4s\n" + ".inst 0x0ea16aff // bfcvtn v31.4h, v23.4s\n" + "zip2 v30.4s, v30.4s, v29.4s\n" + ".inst 0x0ea16add // bfcvtn v29.4h, v22.4s\n" + "zip2 v28.4s, v28.4s, v27.4s\n" + ".inst 0x0ea16abb // bfcvtn v27.4h, v21.4s\n" + "zip2 v26.4s, v26.4s, v25.4s\n" + ".inst 0x0ea16a99 // bfcvtn v25.4h, v20.4s\n" + "zip2 v24.4s, v8.4s, v24.4s\n" + ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n" + "zip2 v22.4s, v7.4s, v6.4s\n" + ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n" + "zip2 v20.4s, v5.4s, v4.4s\n" + ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n" + "zip2 v18.4s, v3.4s, v2.4s\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v1.4s, v0.4s\n" + ".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n" + ".inst 0x4ea16b9d // bfcvtn2 v29.8h, v28.4s\n" + ".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n" + ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n" + ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n" + ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n" + "str q31, [x27, #0x0]\n" + "str q29, [x27, #0x10]\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q27, [x27, #0x20]\n" + "str q25, [x27, #0x30]\n" + "add x27, x27, %x[out_stride]\n" + "str q23, [x27, #0x0]\n" + "str q21, [x27, #0x10]\n" + "str q19, [x27, #0x20]\n" + "str q17, [x27, #0x30]\n" + "add x27, x27, 
%x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cmp x28, #0x4\n" + "blt 5f\n" + "4:" // Main row loop: Column loop + "ldr q25, [x9], #0x10\n" + "ldr q24, [x26], #0x10\n" + "sub x28, x28, #0x4\n" + "ldr q21, [x25], #0x10\n" + "ldr q20, [x24], #0x10\n" + "cmp x28, #0x4\n" + "ldr q23, [x23], #0x10\n" + "ldr q19, [x22], #0x10\n" + "ldr q18, [x21], #0x10\n" + "ldr q17, [x20], #0x10\n" + "zip1 v22.4s, v25.4s, v21.4s\n" + "zip1 v16.4s, v24.4s, v20.4s\n" + "zip2 v21.4s, v25.4s, v21.4s\n" + "zip2 v20.4s, v24.4s, v20.4s\n" + "zip1 v27.4s, v23.4s, v18.4s\n" + "zip1 v26.4s, v19.4s, v17.4s\n" + "zip2 v25.4s, v23.4s, v18.4s\n" + "zip2 v24.4s, v19.4s, v17.4s\n" + "zip1 v19.4s, v22.4s, v16.4s\n" + "zip1 v18.4s, v21.4s, v20.4s\n" + "zip1 v17.4s, v27.4s, v26.4s\n" + "zip2 v23.4s, v22.4s, v16.4s\n" + "zip1 v16.4s, v25.4s, v24.4s\n" + "zip2 v22.4s, v21.4s, v20.4s\n" + ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n" + ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n" + ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n" + "zip2 v18.4s, v27.4s, v26.4s\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v25.4s, v24.4s\n" + ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n" + ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q21, [x27, #0x0]\n" + "str q20, [x27, #0x10]\n" + "str q19, [x27, #0x20]\n" + "str q17, [x27, #0x30]\n" + "add x27, x27, %x[out_stride]\n" + "bge 4b\n" + "5:" // Main row loop: Column loop skip + "cbz x28, 7f\n" + "movi v16.16b, #0x0\n" + "str q16, [x27, #0x0]\n" + "str q16, [x27, #0x10]\n" + "str q16, [x27, #0x20]\n" + "str q16, [x27, #0x30]\n" + "6:" // Main row loop: width 1 loop: loop + "ldr s23, [x9], #0x4\n" + "ldr s22, [x26], #0x4\n" + "sub x28, x28, #0x1\n" + "ldr s19, [x25], #0x4\n" + "ldr s17, [x24], #0x4\n" + "cmp x28, #0x1\n" + "ldr s21, [x23], #0x4\n" + "ldr s20, [x22], #0x4\n" + "ldr s18, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" + "zip1 v19.4s, v23.4s, v19.4s\n" + "zip1 v17.4s, v22.4s, v17.4s\n" + "zip1 v18.4s, v21.4s, v18.4s\n" + "zip1 v16.4s, v20.4s, v16.4s\n" + "zip1 v17.4s, v19.4s, v17.4s\n" + "zip1 v16.4s, v18.4s, v16.4s\n" + ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str d17, [x27, #0x0]\n" + "str d16, [x27, #0x20]\n" + "add x27, x27, #0x8\n" + "bge 6b\n" + "7:" // Main row loop: odd col skip + "cmp %x[height], #0x8\n" + "add %x[out], %x[out], #0x40\n" + "bge 1b\n" + "cbz %x[height], 16f\n" + "8:" // Main loop skip + "9:" // Tail row loop: Head + "mov x9, %x[in]\n" + "mov x20, %x[width]\n" + "cmp %x[height], #0x3\n" + "mov x27, %x[out]\n" + "add x26, x9, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "csel x25, x25, %x[pad_row], GE\n" + "add %x[in], x24, %x[in_stride]\n" + "csel x24, x24, %x[pad_row], GT\n" + "cmp %x[height], #0x1\n" + "sub %x[height], %x[height], #0x4\n" + "csel x26, x26, %x[pad_row], GT\n" + "cmp x20, #0x8\n" + "blt 11f\n" + "10:" // Tail row loop: Unroll column loop + "ldr q25, [x9], #0x10\n" + "ldr q24, [x26], #0x10\n" + "sub x20, x20, #0x8\n" + "ldr q21, [x25], #0x10\n" + "ldr q20, [x24], #0x10\n" + "cmp x20, #0x8\n" + "ldr q23, [x9], #0x10\n" + "ldr q19, [x26], #0x10\n" + "ldr q18, [x25], #0x10\n" + "ldr q17, [x24], #0x10\n" + "zip1 v22.4s, v25.4s, v21.4s\n" + "zip1 v16.4s, v24.4s, v20.4s\n" + "zip2 v21.4s, v25.4s, v21.4s\n" + "zip2 v20.4s, v24.4s, v20.4s\n" + "zip1 v27.4s, v23.4s, v18.4s\n" + "zip1 v26.4s, v19.4s, 
v17.4s\n" + "zip2 v25.4s, v23.4s, v18.4s\n" + "zip2 v24.4s, v19.4s, v17.4s\n" + "zip1 v19.4s, v22.4s, v16.4s\n" + "zip1 v18.4s, v21.4s, v20.4s\n" + "zip1 v17.4s, v27.4s, v26.4s\n" + "zip2 v23.4s, v22.4s, v16.4s\n" + "zip1 v16.4s, v25.4s, v24.4s\n" + "zip2 v22.4s, v21.4s, v20.4s\n" + ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n" + ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n" + ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n" + "zip2 v18.4s, v27.4s, v26.4s\n" + ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n" + "zip2 v16.4s, v25.4s, v24.4s\n" + ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n" + ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n" + ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n" + ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n" + "str q21, [x27, #0x0]\n" + "str q20, [x27, #0x10]\n" + "add x27, x27, %x[out_stride]\n" + "str q19, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "add x27, x27, %x[out_stride]\n" + "bge 10b\n" + "11:" // Tail row loop: Unroll column loop skip + "cmp x20, #0x4\n" + "blt 13f\n" + "12:" // Tail row loop: Column loop + "ldr q21, [x9], #0x10\n" + "ldr q20, [x26], #0x10\n" + "sub x20, x20, #0x4\n" + "ldr q19, [x25], #0x10\n" + "ldr q17, [x24], #0x10\n" + "cmp x20, #0x4\n" + "zip1 v18.4s, v21.4s, v19.4s\n" + "zip1 v16.4s, v20.4s, v17.4s\n" + "zip2 v21.4s, v21.4s, v19.4s\n" + "zip2 v20.4s, v20.4s, v17.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "zip2 v19.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v21.4s, v20.4s\n" + ".inst 0x0ea16a32 // bfcvtn v18.4h, v17.4s\n" + "zip2 v17.4s, v21.4s, v20.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n" + ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n" + "str q18, [x27, #0x0]\n" + "str q16, [x27, #0x10]\n" + "add x27, x27, %x[out_stride]\n" + "bge 12b\n" + "13:" // Tail row loop: Column loop skip + "cbz x20, 15f\n" + "movi v16.16b, #0x0\n" + "str q16, [x27, #0x0]\n" + "str q16, [x27, #0x10]\n" + "14:" // Tail row loop: width 1 loop: loop + "ldr s19, [x9], #0x4\n" + "ldr s18, [x26], #0x4\n" + "sub x20, x20, #0x1\n" + "ldr s17, [x25], #0x4\n" + "ldr s16, [x24], #0x4\n" + "cmp x20, #0x1\n" + "zip1 v17.4s, v19.4s, v17.4s\n" + "zip1 v16.4s, v18.4s, v16.4s\n" + "zip1 v16.4s, v17.4s, v16.4s\n" + ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" + "str d16, [x27, #0x0]\n" + "add x27, x27, #0x8\n" + "bge 14b\n" + "15:" // Tail row loop: odd col skip + "cmp %x[height], #0x1\n" + "add %x[out], %x[out], #0x20\n" + "bge 9b\n" + "16:" // Done + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // anonymous namespace +template<> +void Transform<4, 4, true, VLType::None>( + bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + a64_transpose_interleave_4_2x4_fp32bf16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(float), + (kmax-k0) + ); +} + + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp index c066c01bab..1e6c3d35f4 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023,2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,6 +42,7 @@ #include "sve_transpose_interleave_12VL_2x4_fp32bf16.hpp" #include "sve_transpose_interleave_1VL_1x4.hpp" #include "sve_transpose_interleave_1VL.hpp" +#include "sve_transpose_interleave_2VL_2x4_fp32bf16.hpp" #include "sve_transpose_interleave_3VL_1x4.hpp" #include "sve_transpose_interleave_3VL_2x2.hpp" #include "sve_transpose_interleave_3VL.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp index adbaa6cf2f..1ce319efee 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020,2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,7 @@ #include "a64_transpose_interleave_32_2x2.hpp" #include "a64_transpose_interleave_4_1x16.hpp" #include "a64_transpose_interleave_4_1x4.hpp" +#include "a64_transpose_interleave_4_2x4_fp32bf16.hpp" #include "a64_transpose_interleave_48.hpp" #include "a64_transpose_interleave_64.hpp" #include "a64_transpose_interleave_96.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp new file mode 100644 index 0000000000..f66fcdc994 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_2VL_2x4_fp32bf16.hpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
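Both the AArch64 transform above and the SVE variant that follows narrow pairs of fp32 rows into interleaved bf16 via bfcvtn/bfcvt. As a point of reference, a scalar sketch of the fp32-to-bf16 narrowing itself; plain truncation is shown here, while the hardware instructions apply their own rounding:

// bfloat16 keeps the sign, 8 exponent bits and the top 7 mantissa bits of fp32.
#include <cstdint>
#include <cstring>

static inline uint16_t fp32_to_bf16_truncate(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits)); // reinterpret fp32 as raw bits
    return static_cast<uint16_t>(bits >> 16); // keep the upper half
}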
+ */ + +#pragma once + +#if defined(ARM_COMPUTE_ENABLE_SVE) + +namespace { + +void sve_transpose_interleave_2VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height) +{ + float *pad_row = reinterpret_cast(alloca(width * sizeof(float))); + + if (height % 4) { + memset(pad_row, 0, width * sizeof(float)); + } + + size_t out_stride = 2 * roundup(height, 4) * get_vector_length(); + + __asm__ __volatile__( + "ptrue p1.b\n" + "1:" // Main row loop: Head + "mov x26, %x[in]\n" + "mov x25, %x[width]\n" + "cnth x24\n" + "cmp %x[height], #0x3\n" + "mov x23, %x[out]\n" + "add x22, x26, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "csel x20, x20, %x[pad_row], GT\n" + "csel x21, x21, %x[pad_row], GE\n" + "cmp %x[height], #0x1\n" + "csel x22, x22, %x[pad_row], GT\n" + "cmp x25, x24\n" + "sub %x[height], %x[height], #0x4\n" + "blt 3f\n" + "2:" // Main row loop: Unroll column loop + "ld1w { z18.s }, p1/Z, [x26]\n" + "ld1w { z17.s }, p1/Z, [x21]\n" + "sub x25, x25, x24\n" + "ld1w { z21.s }, p1/Z, [x26, #1, MUL VL]\n" + "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n" + "cmp x25, x24\n" + "addvl x26, x26, #2\n" + "ld1w { z26.s }, p1/Z, [x22]\n" + "ld1w { z20.s }, p1/Z, [x20]\n" + "addvl x21, x21, #2\n" + "zip1 z19.s, z18.s, z17.s\n" + "zip2 z18.s, z18.s, z17.s\n" + "ld1w { z25.s }, p1/Z, [x22, #1, MUL VL]\n" + "ld1w { z24.s }, p1/Z, [x20, #1, MUL VL]\n" + "addvl x22, x22, #2\n" + "zip1 z17.s, z21.s, z16.s\n" + "zip2 z16.s, z21.s, z16.s\n" + "addvl x20, x20, #2\n" + ".inst 0x658aa677 // bfcvt z23.h, p1/M, z19.s\n" + "zip1 z22.s, z26.s, z20.s\n" + ".inst 0x658aa655 // bfcvt z21.h, p1/M, z18.s\n" + "zip2 z20.s, z26.s, z20.s\n" + ".inst 0x658aa633 // bfcvt z19.h, p1/M, z17.s\n" + "zip1 z18.s, z25.s, z24.s\n" + ".inst 0x658aa611 // bfcvt z17.h, p1/M, z16.s\n" + "zip2 z16.s, z25.s, z24.s\n" + ".inst 0x648aa6d7 // bfcvtnt z23.h, p1/M, z22.s\n" + ".inst 0x648aa695 // bfcvtnt z21.h, p1/M, z20.s\n" + ".inst 0x648aa653 // bfcvtnt z19.h, p1/M, z18.s\n" + ".inst 0x648aa611 // bfcvtnt z17.h, p1/M, z16.s\n" + "st1h { z23.h }, p1, [x23]\n" + "st1h { z21.h }, p1, [x23, #1, MUL VL]\n" + "add x23, x23, %x[out_stride]\n" + "st1h { z19.h }, p1, [x23]\n" + "st1h { z17.h }, p1, [x23, #1, MUL VL]\n" + "add x23, x23, %x[out_stride]\n" + "bge 2b\n" + "3:" // Main row loop: Unroll column loop skip + "cbz x25, 5f\n" + "4:" // Main row loop: Column loop + "whilelt p0.s, XZR, x25\n" + "decd x25, ALL, MUL #2\n" + "ld1w { z19.s }, p0/Z, [x26]\n" + "addvl x26, x26, #1\n" + "ld1w { z16.s }, p0/Z, [x21]\n" + "addvl x21, x21, #1\n" + "ld1w { z20.s }, p0/Z, [x22]\n" + "addvl x22, x22, #1\n" + "ld1w { z18.s }, p0/Z, [x20]\n" + "addvl x20, x20, #1\n" + "cmp x25, #0x0\n" + "zip1 z17.s, z19.s, z16.s\n" + "zip2 z16.s, z19.s, z16.s\n" + "zip1 z19.s, z20.s, z18.s\n" + "zip2 z18.s, z20.s, z18.s\n" + ".inst 0x658aa631 // bfcvt z17.h, p1/M, z17.s\n" + ".inst 0x658aa610 // bfcvt z16.h, p1/M, z16.s\n" + ".inst 0x648aa671 // bfcvtnt z17.h, p1/M, z19.s\n" + ".inst 0x648aa650 // bfcvtnt z16.h, p1/M, z18.s\n" + "st1h { z17.h }, p1, [x23]\n" + "st1h { z16.h }, p1, [x23, #1, MUL VL]\n" + "add x23, x23, %x[out_stride]\n" + "bgt 4b\n" + "5:" // Main row loop: Column loop skip + "cmp %x[height], #0x1\n" + "addvl %x[out], %x[out], #2\n" + "bge 1b\n" + : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out) + : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width) + : "cc", "memory", "p0", "p1", "x20", 
"x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26" + ); +} + +} // anonymous namespace +template<> +void Transform<2, 4, true, VLType::SVE>( + bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax) +{ + sve_transpose_interleave_2VL_2x4_fp32bf16( + out, + in + k0 * stride + x0, + (xmax-x0), + stride * sizeof(float), + (kmax-k0) + ); +} + + +#endif // defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h index cbf540bd71..c619788125 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/list.h +++ b/src/core/NEON/kernels/batchnormalization/impl/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H -#define SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H +#ifndef ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H +#define ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H namespace arm_compute { @@ -37,8 +37,23 @@ DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization); DECLARE_BATCH_NORMALIZATION_KERNEL(fp32_neon_batch_normalization); DECLARE_BATCH_NORMALIZATION_KERNEL(fp32_sve_batch_normalization); -#undef DECLARE_ACTIVATION_KERNEL +#define DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(func_name) \ + void func_name(const Window &window, ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, \ + const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo act_info) + +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_relu); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_brelu); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp16_batch_normalization_nchw_non_fused_lubrelu); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_relu); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_brelu); +DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL(fp32_batch_normalization_nchw_non_fused_lubrelu); + +#undef DECLARE_BATCH_NORMALIZATION_KERNEL +#undef DECLARE_BATCH_NORMALIZATION_NCHW_KERNEL + } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_BATCH_NORMALIZATION_LIST_H */ +#endif // ACL_SRC_CORE_NEON_KERNELS_BATCHNORMALIZATION_IMPL_LIST_H diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h index cec437d171..32d38a856c 100644 --- a/src/core/NEON/wrapper/intrinsics/max.h +++ b/src/core/NEON/wrapper/intrinsics/max.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_WRAPPER_MAX_H -#define ARM_COMPUTE_WRAPPER_MAX_H +#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H +#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H #include @@ -59,6 +59,39 @@ VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VMAX_IMPL + +#if defined(__aarch64__) +// VMAXV: Across vector max +#define VMAXV_IMPL(stype, vtype, prefix, postfix) \ + inline stype vmaxv(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VMAXV_IMPL(uint8_t, uint8x8_t, vmaxv, u8) +VMAXV_IMPL(int8_t, int8x8_t, vmaxv, s8) +VMAXV_IMPL(uint16_t, uint16x4_t, vmaxv, u16) +VMAXV_IMPL(int16_t, int16x4_t, vmaxv, s16) +VMAXV_IMPL(uint32_t, uint32x2_t, vmaxv, u32) +VMAXV_IMPL(int32_t, int32x2_t, vmaxv, s32) +VMAXV_IMPL(float, float32x2_t, vmaxv, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMAXV_IMPL(float16_t, float16x4_t, vmaxv, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VMAXV_IMPL(uint8_t, uint8x16_t, vmaxvq, u8) +VMAXV_IMPL(int8_t, int8x16_t, vmaxvq, s8) +VMAXV_IMPL(uint16_t, uint16x8_t, vmaxvq, u16) +VMAXV_IMPL(int16_t, int16x8_t, vmaxvq, s16) +VMAXV_IMPL(uint32_t, uint32x4_t, vmaxvq, u32) +VMAXV_IMPL(int32_t, int32x4_t, vmaxvq, s32) +VMAXV_IMPL(float, float32x4_t, vmaxvq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMAXV_IMPL(float16_t, float16x8_t, vmaxvq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VMAXV_IMPL +#endif // defined(__aarch64__) } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MAX_H */ +#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 90a7ac32c0..532d08de92 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -450,8 +450,9 @@ std::pair get_quantized_activation_min_max(const ActivationLay const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info); const auto type_max_value = std::get<1>(get_min_max(data_type)).get(); - const int32_t min_activation = - act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int; + const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU + ? std::min(oq_info.offset, type_max_value) + : b_int; const int32_t max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int; diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h index 686304b8d7..50b3fc1284 100644 --- a/src/core/common/Registrars.h +++ b/src/core/common/Registrars.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022 Arm Limited. + * Copyright (c) 2020-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
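A small usage sketch for the across-vector max wrapper added above: for a float32x4_t, wrapper::vmaxv resolves to vmaxvq_f32, the horizontal maximum of the four lanes (aarch64 only).

#include <arm_neon.h>

inline float max_of_four(const float *ptr) // assumes four valid elements at ptr
{
    const float32x4_t v = vld1q_f32(ptr);
    return vmaxvq_f32(v); // equivalently: arm_compute::wrapper::vmaxv(v)
}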
*/ -#ifndef SRC_CORE_COMMON_REGISTRARS_H -#define SRC_CORE_COMMON_REGISTRARS_H +#ifndef ACL_SRC_CORE_COMMON_REGISTRARS_H +#define ACL_SRC_CORE_COMMON_REGISTRARS_H #if defined(ENABLE_FP16_KERNELS) @@ -38,11 +38,11 @@ #define REGISTER_FP16_SVE2(func_name) nullptr #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(ARM_COMPUTE_ENABLE_NEON) #define REGISTER_FP16_NEON(func_name) &(func_name) #else /* !defined(ARM_COMPUTE_ENABLE_NEON) */ #define REGISTER_FP16_NEON(func_name) nullptr -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ #else /* !defined(ENABLE_FP16_KERNELS) */ #define REGISTER_FP16_NEON(func_name) nullptr @@ -179,4 +179,4 @@ #define REGISTER_BF16_NEON(func_name) nullptr #endif /* defined(ARM_COMPUTE_ENABLE_BF16)*/ -#endif /* SRC_CORE_COMMON_REGISTRARS_H */ +#endif // ACL_SRC_CORE_COMMON_REGISTRARS_H diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp new file mode 100644 index 0000000000..06e35eed8c --- /dev/null +++ b/src/core/helpers/LUTManager.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/helpers/LUTManager.h" + +namespace arm_compute +{ +#ifdef __aarch64__ +namespace +{ + +void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut) +{ + union Element + { + uint16_t i = 0; + float16_t fp; + } item; + // Fill lut by iterating over all 16 bit values using the union. + while (true) + { + (*lut)[item.i] = 1.f / (1.f + std::exp(-item.fp)); + if (item.i == 65535) + break; + item.i++; + } +} +} // namespace + +std::shared_ptr LUTManager::get_lut_table(LUTInfo info) +{ + const auto itr = map_fp16.find(info); + auto s_ptr = (itr != map_fp16.end()) ? itr->second.lock() : nullptr; // nullptr if invalid or not found. 
+ if (s_ptr != nullptr) + { + // Found and valid + return s_ptr; // Return weak ptr as shared ptr + } + else + { + // Not found, or pointer not valid + // We do not use make_shared to prevent the weak_ptr keeping the control block alive + std::shared_ptr ptr(new ActivationLayerInfo::LookupTable65536); + init_lut_fp16(ptr.get()); + map_fp16[info] = ptr; + return ptr; + } +} +#endif // __aarch64__ + +// Static function to get LutManager instance +LUTManager &LUTManager::get_instance() +{ + static auto inst_ = std::make_unique(); // The one, single instance. + return *inst_; +} + +} // namespace arm_compute diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h new file mode 100644 index 0000000000..4e13ead7e3 --- /dev/null +++ b/src/core/helpers/LUTManager.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CORE_HELPERS_LUTMANAGER_H +#define ACL_SRC_CORE_HELPERS_LUTMANAGER_H + +#include "arm_compute/core/CoreTypes.h" +#include "arm_compute/core/QuantizationInfo.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include +#include + +namespace arm_compute +{ + +struct LUTInfo +{ + ActivationLayerInfo::ActivationFunction act; + DataType dt; + QuantizationInfo qinfo; + // Operators enable use of map with Lutinfo as key + friend bool operator<(const LUTInfo &l, const LUTInfo &r) + { + return (l.act < r.act) || ((l.act == r.act) && (l.dt < r.dt)) || + ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() < r.qinfo.scale())) || + ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() == r.qinfo.scale()) && + (l.qinfo.offset() < l.qinfo.offset())); + } + bool operator==(const LUTInfo &l) + { + return this->act == l.act && this->dt == l.dt && this->qinfo == l.qinfo; + } +}; + +/* Class to handle getting look up table */ +class LUTManager +{ +public: + LUTManager() = default; + + static LUTManager &get_instance(); +#ifdef __aarch64__ + std::shared_ptr get_lut_table(LUTInfo info); + +private: + std::map> map_fp16{}; +#endif // __aarch64__ +}; + +} // namespace arm_compute +#endif // ACL_SRC_CORE_HELPERS_LUTMANAGER_H diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index 50bf672d3c..7cfa39b286 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. 
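One detail worth noting in the LUTInfo ordering above: the final clause compares l.qinfo.offset() with itself, which is presumably intended to be r.qinfo.offset(). A tuple-based lexicographic comparison is an equivalent, less error-prone sketch, assuming the scale() and offset() accessors are comparable with operator< as the original code already relies on:

#include <tuple>

inline bool lut_info_less(const LUTInfo &l, const LUTInfo &r)
{
    // Compare (act, dt, scale, offset) lexicographically, left against right.
    return std::make_tuple(l.act, l.dt, l.qinfo.scale(), l.qinfo.offset()) <
           std::make_tuple(r.act, r.dt, r.qinfo.scale(), r.qinfo.offset());
}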
+ * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,7 +49,8 @@ static const std::vector available_kernel [](const ActivationDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && - data.cpumodel == CPUModel::A510 && data.isa.sve2; + data.cpumodel == CPUModel::A510 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::RELU; }, REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, #endif // ARM_COMPUTE_ENABLE_SVE @@ -57,7 +58,10 @@ static const std::vector available_kernel {// Neon LUT implementantion takes precedence "neon_q8_activation_lut", [](const ActivationDataTypeISASelectorData &data) - { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, + { + return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && + data.f != ActivationLayerInfo::ActivationFunction::RELU; + }, REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, #endif // __aarch64__ {"sve2_qu8_activation", @@ -79,6 +83,13 @@ static const std::vector available_kernel data.f != ActivationLayerInfo::ActivationFunction::GELU; }, REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, + {"sve_fp16_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve && + data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC; + }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)}, {"sve_fp16_activation", [](const ActivationDataTypeISASelectorData &data) { @@ -214,9 +225,6 @@ void init_lut(ActivationLayerInfo::ActivationFunction act_func, case ActivationLayerInfo::ActivationFunction::LINEAR: tmp_f = a * tmp_f + b; break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp_f = std::max<>(0.f, tmp_f); - break; case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: tmp_f = std::min<>(a, std::max(0.f, tmp_f)); break; @@ -278,7 +286,11 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac _name = std::string("CpuActivationKernel").append("/").append(uk->name); #ifdef __aarch64__ - if (src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) + // Initialise lut_manager + LUTManager &lut_manager = LUTManager::get_instance(); + + if ((src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) && + activation_info.activation() != ActivationFunction::RELU) { ActivationLayerInfo::LookupTable256 tmp_lut; init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), @@ -286,6 +298,13 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac activation_info.a(), activation_info.b()); activation_info.setLookupTable256(tmp_lut); } + + if (src->data_type() == DataType::F16 && + activation_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + const LUTInfo info = {activation_info.activation(), src->data_type(), src->quantization_info()}; + activation_info.setLookupTable65536((lut_manager.get_lut_table(info))); + } #endif // __aarch64__ _act_info = activation_info; diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h index 4bad9fb3e8..c1487499d6 100644 --- a/src/cpu/kernels/CpuActivationKernel.h +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
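With the activation changes above, fp16 LOGISTIC is served by the shared 65536-entry table obtained from the LUTManager. A hypothetical helper (name and table element type are assumptions) showing how such a table is indexed at run time, every fp16 bit pattern being a valid index:

#include <cstdint>
#include <cstring>

inline float16_t logistic_via_lut(const ActivationLayerInfo::LookupTable65536 &lut, float16_t x)
{
    uint16_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // raw fp16 bit pattern = table index
    return lut[bits];                     // replaces exp() and the division with one load
}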
* * SPDX-License-Identifier: MIT * @@ -21,12 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H -#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" #include "src/core/common/Macros.h" +#include "src/core/helpers/LUTManager.h" #include "src/cpu/ICpuKernel.h" namespace arm_compute @@ -103,4 +104,4 @@ class CpuActivationKernel : public ICpuKernel } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index b7daa4d583..45ebeec394 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H -#define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H +#ifndef ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H +#define ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H #include "arm_compute/core/Types.h" @@ -99,6 +99,13 @@ struct ScaleKernelDataTypeISASelectorData InterpolationPolicy interpolation_policy; }; +struct SoftmaxKernelDataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; + bool is_log; +}; + // Selector pointer types using DataTypeISASelectorPtr = std::add_pointer::type; using DataTypeDataLayoutSelectorPtr = std::add_pointer::type; @@ -113,9 +120,10 @@ using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer::type; using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer::type; - +using SoftmaxKernelDataTypeISASelectorDataPtr = + std::add_pointer::type; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif // ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H +#endif // ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp index ba086e3ac6..8001482154 100644 --- a/src/cpu/kernels/CpuMulKernel.cpp +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,12 +26,14 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/mul/generic/neon/list.h" #include @@ -1170,108 +1172,6 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con } } -void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(float); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - if (is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto ta1 = wrapper::vloadq(input1_ptr + x); - const auto ta2 = wrapper::vloadq(input2_ptr + x); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); - } -} - void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window) { // Create input windows @@ -1409,115 +1309,6 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, } } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != 
src2->info()->tensor_shape().x(); - if (is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float16x8x2_t broadcast_value_vec = {{ - vdupq_n_f16(broadcast_value), - vdupq_n_f16(broadcast_value), - }}; - const auto scale_vec = vdupq_n_f16(scale); - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t non_broadcast_v = {{ - vld1q_f16(non_broadcast_input_ptr + x), - vld1q_f16(non_broadcast_input_ptr + x + 8), - }}; - const float16x8x2_t result = {{ - vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), - vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), - }}; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t ta1 = {{ - vld1q_f16(input1_ptr + x), - vld1q_f16(input1_ptr + x + 8), - }}; - const float16x8x2_t ta2 = {{ - vld1q_f16(input2_ptr + x), - vld1q_f16(input2_ptr + x + 8), - }}; - const float16x8_t scale_vec = vdupq_n_f16(scale); - const float16x8x2_t result = {{ - vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), - vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), - }}; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); - } -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - template void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) { @@ -1857,13 +1648,11 @@ void 
CpuMulKernel::configure(ITensorInfo *src1, } } break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func_float = &mul_F16_F16_F16; + _func_float = REGISTER_FP16_NEON(cpu::mul_F16_F16_F16); break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: - _func_float = &mul_F32_F32_F32; + _func_float = REGISTER_FP32_NEON(cpu::mul_F32_F32_F32); break; default: ARM_COMPUTE_ERROR("You called with the wrong img formats"); diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp index 9308d860d1..2c9627bdee 100644 --- a/src/cpu/kernels/CpuPool2dKernel.cpp +++ b/src/cpu/kernels/CpuPool2dKernel.cpp @@ -271,11 +271,9 @@ std::pair validate_and_configure_window(ITensorInfo * break; } break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: num_elems_processed_per_iteration = 1; break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: num_elems_processed_per_iteration = 1; break; diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp index ce144351f8..486f55e2c1 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -34,9 +34,12 @@ #include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/Utils.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/softmax/list.h" +#include + namespace arm_compute { namespace cpu @@ -45,136 +48,40 @@ namespace kernels { namespace { -/* Softmax Logits 1D Max - identifying the max value of 1D Logits */ -static const std::vector available_kernels_max_logits = { - {"sve_fp32_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_logits)}, - {"sve_fp16_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_logits)}, - {"sve_qu8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, - REGISTER_QASYMM8_SVE(sve_qasymm8_logits)}, - {"sve_qs8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, - REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)}, - {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_logits)}, - {"neon_fp16_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_logits)}, - {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(neon_qasymm8_logits)}, - {"neon_qs8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)}, +/* Softmax */ +static const std::vector available_kernels = { + {"neon_fp32_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax)}, + {"neon_fp16_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax)}, + {"neon_qu8_softmax", + [](const 
SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, + {"neon_qs8_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, + {"neon_fp32_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax)}, + {"neon_fp16_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (data.is_log && data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax)}, + {"neon_qu8_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, + {"neon_qs8_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (data.is_log && data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, }; -Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::F16, DataType::F32); - - // Validate in case of configured output - if (output.total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), - TensorShape(input.tensor_shape()).set(0, 1)); - } - - return Status{}; -} -} //namespace -const std::vector &CpuLogits1DMaxKernel::get_available_kernels() -{ - return available_kernels_max_logits; -} - -void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst)); - - // Softmax across the x dimension - const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1); - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - - const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); - ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - _run_method = uk->ukernel; - _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name); - - Window win = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win); -} - -Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst)); - - return Status{}; -} - -void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src, dst, window); 
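The kernels registered above fuse the former max and softmax passes into one ukernel per data type, selected by is_log. A scalar reference of what each kernel computes per row along the reduction axis (standard max-subtracted softmax; illustration only):

#include <algorithm>
#include <cmath>

void softmax_row_reference(const float *src, float *dst, int len, float beta, bool is_log)
{
    // Assumes len >= 1. The running max is now found inside the kernel itself.
    const float max_val = *std::max_element(src, src + len);
    float sum = 0.f;
    for (int i = 0; i < len; ++i)
    {
        dst[i] = std::exp((src[i] - max_val) * beta); // shift by max for numerical stability
        sum += dst[i];
    }
    for (int i = 0; i < len; ++i)
    {
        dst[i] = is_log ? (src[i] - max_val) * beta - std::log(sum) : dst[i] / sum;
    }
}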
-} - -const char *CpuLogits1DMaxKernel::name() const -{ - return _name.c_str(); -} - -/* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */ -template -static const std::vector::SoftmaxLogits1DKernel> available_kernels_logits = { - {"sve2_qu8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)}, - {"sve2_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)}, - {"sve_fp32_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_softmax)}, - {"sve_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_softmax)}, - - {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_softmax)}, - {"neon_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_softmax)}, - {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, - {"neon_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, -}; -namespace -{ -Status validate_arguments_logits_softmax(const ITensorInfo &src, - const ITensorInfo &max, - const ITensorInfo &dst, - const float beta, - const ITensorInfo &tmp, - bool is_log) +Status validate_arguments_softmax( + const ITensorInfo &src, const ITensorInfo &dst, float beta, const ITensorInfo &tmp, bool is_log) { ARM_COMPUTE_UNUSED(beta); // Check input @@ -184,11 +91,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); - // Check max - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); - // Check output if configured if (dst.total_size() != 0) { @@ -203,8 +105,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, // Check tmp if configured if (tmp.total_size() != 0) { - const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type(); - ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); + // We have temporary storage only if src data type is quantized. + // Therefore, tmp data type must be F32 + ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(!is_quantized_asymmetric); + // We could potentially reduce tmp memory if we could predict or make an assumption // on the maximum number of threads that will run in parallel. 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp); @@ -214,91 +119,97 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, } } // namespace -template -const std::vector::SoftmaxLogits1DKernel> & -CpuLogits1DSoftmaxKernel::get_available_kernels() +const std::vector &CpuSoftmaxKernel::get_available_kernels() { - return available_kernels_logits; + return available_kernels; } -template -void CpuLogits1DSoftmaxKernel::configure( - const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) +void CpuSoftmaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log)); // Configure kernel window const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); // Output auto initialization if not yet initialized const QuantizationInfo output_quantization = - is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), is_log) : dst->quantization_info(); auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); - // Tmp auto initialization if not yet initialized - const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); - auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); + // Tmp auto initialization if not yet initialized and src is quantized + if (is_quantized_asymmetric) + { + const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); + auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); + } - const auto *uk = CpuLogits1DSoftmaxKernel::get_implementation( - DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); + const auto *uk = CpuSoftmaxKernel::get_implementation( + SoftmaxKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), is_log}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - std::string kernel_name = - IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); + std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel"); _beta = beta; _run_method = uk->ukernel; _name = kernel_name.append("/").append(uk->name); - // Configure kernel window - Window win = calculate_max_window(*max, Steps()); + Window win = calculate_max_window(*dst, Steps()); + + /// TODO: Check dimensions > 0 for holes only. For this, we need + /// a utility function checking if there are holes after some dimension. 
+ if (!has_holes(*dst, dst->num_dimensions() - 1)) + { + win = win.collapse(win, Window::DimY); + } - ICpuKernel>::configure(win); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); // First dimension is the reduction axis + + ICpuKernel::configure(win); } -template -Status CpuLogits1DSoftmaxKernel::validate( - const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) +Status CpuSoftmaxKernel::validate( + const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log)); return Status{}; } -template -void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { - ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel>::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto max = tensors.get_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); - const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); - const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; + if (is_data_type_quantized_asymmetric(src->info()->data_type())) + { + auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); + + const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); + const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; - ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); + ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); - void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); - _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window); + void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); + _run_method(src, tmp_for_thread, dst, _beta, window); + } + else + { + _run_method(src, nullptr, dst, _beta, window); + } } -template -const char *CpuLogits1DSoftmaxKernel::name() const +const char *CpuSoftmaxKernel::name() const { return _name.c_str(); } -template class CpuLogits1DSoftmaxKernel; -template class CpuLogits1DSoftmaxKernel; - } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index 5d288179fd..3db1f3d0ef 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H -#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -33,102 +33,55 @@ namespace cpu { namespace kernels { -/** Interface for the identifying the max value of 1D Logits */ -class CpuLogits1DMaxKernel : public ICpuKernel +/** Interface for softmax computation */ +class CpuSoftmaxKernel : public ICpuKernel { private: - using SoftmaxLogits1DMaxKernelPtr = std::add_pointer::type; + using SoftmaxKernelPtr = + std::add_pointer::type; public: - CpuLogits1DMaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel); - /** Set the input and output tensors. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p input - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuLogits1DMaxKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - - struct SoftmaxLogits1DMaxKernel - { - const char *name; - const DataTypeISASelectorPtr is_selected; - SoftmaxLogits1DMaxKernelPtr ukernel; - }; - - static const std::vector &get_available_kernels(); - -private: - SoftmaxLogits1DMaxKernelPtr _run_method{nullptr}; - std::string _name{}; -}; - -/** Interface for softmax computation for QASYMM8 with pre-computed max. */ -template -class CpuLogits1DSoftmaxKernel : public ICpuKernel> -{ -private: - using SoftmaxLogits1DKernelPtr = std::add_pointer::type; - -public: - CpuLogits1DSoftmaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel); + CpuSoftmaxKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSoftmaxKernel); /** Set the input and output tensors. * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. - * Data types supported: same as @p input. - * @param[out] dst Destination tensor info. Data types supported: same as @p input. - * @param[in] beta A scaling factor for the exponent. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p input. + * @param[in] beta A scaling factor for the exponent. + * @param[in] is_log True if the operation is log-softmax * * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. 
*/ - void - configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp); /** Static function to check if given info will lead to a valid configuration * - * Similar to CpuLogits1DSoftmaxKernel::configure() + * Similar to CpuSoftmaxKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, - const ITensorInfo *max, - const ITensorInfo *dst, - const float beta, - const ITensorInfo *tmp); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; - struct SoftmaxLogits1DKernel + struct SoftmaxKernel { - const char *name; - const DataTypeISASelectorPtr is_selected; - SoftmaxLogits1DKernelPtr ukernel; + const char *name; + const SoftmaxKernelDataTypeISASelectorDataPtr is_selected; + SoftmaxKernelPtr ukernel; }; - static const std::vector &get_available_kernels(); + static const std::vector &get_available_kernels(); private: - float _beta{1.0f}; - SoftmaxLogits1DKernelPtr _run_method{nullptr}; - std::string _name{}; + float _beta{1.0f}; + SoftmaxKernelPtr _run_method{nullptr}; + std::string _name{}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp index f289c80d4b..ddd186f9cb 100644 --- a/src/cpu/kernels/activation/generic/neon/lut.cpp +++ b/src/cpu/kernels/activation/generic/neon/lut.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Arm Limited. + * Copyright (c) 2022-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,8 +34,9 @@ namespace cpu #ifdef __aarch64__ void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && - src->info()->data_type() != DataType::QASYMM8_SIGNED); + ARM_COMPUTE_ERROR_ON( // LUT does not provide any performance benefit for ReLU as it's a single max() operation + (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) || + act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU); const auto window_end_x = window.x().end(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp index 97399e01e0..19d9126556 100644 --- a/src/cpu/kernels/activation/generic/sve/fp16.cpp +++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023 Arm Limited. + * Copyright (c) 2020-2024 Arm Limited. 
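For context on the softmax rework earlier in this patch: the separate logits-1D-max kernel and the max argument are gone, and the reduction now happens along the first window dimension inside a single CpuSoftmaxKernel. Below is a minimal configuration sketch of the new internal signature; the shapes, quantization values, helper function name and the status check are illustrative assumptions only and are not part of this patch.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"

#include "src/cpu/kernels/CpuSoftmaxKernel.h"

using namespace arm_compute;
using cpu::kernels::CpuSoftmaxKernel;

// Hypothetical quantized use case: for QASYMM8/QASYMM8_SIGNED inputs the kernel now
// expects an F32 tmp tensor with the same shape as the input; for float inputs tmp is unused.
void configure_softmax_example()
{
    TensorInfo src(TensorShape(128U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 10));
    TensorInfo dst{}; // auto-initialized by configure()
    TensorInfo tmp(TensorShape(128U, 32U), 1, DataType::F32);

    CpuSoftmaxKernel kernel;
    if (CpuSoftmaxKernel::validate(&src, &dst, /* beta */ 1.0f, /* is_log */ false, &tmp).error_code() == ErrorCode::OK)
    {
        kernel.configure(&src, &dst, 1.0f, false, &tmp);
    }
}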
* * SPDX-License-Identifier: MIT * @@ -30,6 +30,7 @@ #include "arm_compute/function_info/ActivationLayerInfo.h" #include "src/core/NEON/SVEMath.h" +#include "src/cpu/kernels/lut/list.h" #include #include @@ -141,6 +142,32 @@ void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayer }, input, output); } + +void sve_fp16_activation_lut(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) +{ + ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::F16); + const auto window_start_x = window.x().start(); + const auto window_end_x = window.x().end(); + const auto size = window_end_x - window_start_x; + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + lut_u16_sve(reinterpret_cast(act_info.lut_fp16().data()), 1U /* num_strings (UNUSED) */, + size, input_ptr + window_start_x, output_ptr + window_start_x); + }, + input, output); +} } // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp index 2ed667debf..5db8595a75 100644 --- a/src/cpu/kernels/activation/generic/sve2/lut.cpp +++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Arm Limited. + * Copyright (c) 2022-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,8 +34,9 @@ namespace cpu #ifdef __aarch64__ void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && - src->info()->data_type() != DataType::QASYMM8_SIGNED); + ARM_COMPUTE_ERROR_ON( // LUT does not provide any performance benefit for ReLU as it's a single max() operation + (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) || + act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU); const auto window_end_x = window.x().end(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h index 6550ddfeca..8c24adc3fe 100644 --- a/src/cpu/kernels/activation/list.h +++ b/src/cpu/kernels/activation/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023 Arm Limited. + * Copyright (c) 2020-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H -#define SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H +#define ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H namespace arm_compute { @@ -42,6 +42,7 @@ DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_signed_activation); DECLARE_ACTIVATION_KERNEL(neon_qsymm16_activation); DECLARE_ACTIVATION_KERNEL(sve2_qsymm16_activation); DECLARE_ACTIVATION_KERNEL(sve_fp16_activation); +DECLARE_ACTIVATION_KERNEL(sve_fp16_activation_lut); DECLARE_ACTIVATION_KERNEL(sve_fp32_activation); DECLARE_ACTIVATION_KERNEL(neon_fp16_activation); DECLARE_ACTIVATION_KERNEL(neon_fp32_activation); @@ -50,4 +51,4 @@ DECLARE_ACTIVATION_KERNEL(neon_fp32_activation); } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H */ +#endif // ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/depth_to_space/list.h similarity index 63% rename from src/cpu/kernels/softmax/generic/sve/fp32.cpp rename to src/cpu/kernels/depth_to_space/list.h index d692cc2477..9d0cd1e740 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp +++ b/src/cpu/kernels/depth_to_space/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,28 +22,26 @@ * SOFTWARE. */ -#include "arm_compute/core/Helpers.h" +#ifndef ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H +#define ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H -#include "src/cpu/kernels/softmax/generic/sve/impl.h" +#include namespace arm_compute { namespace cpu { -void sve_fp32_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); -} -void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max(in, out, window); -} +#define DECLARE_DEPTHTOSPACE_KERNEL(func_name) \ + void func_name(const uint8_t *src, uint8_t *dst, const uintptr_t src_shape[4], const uintptr_t src_strides[4], \ + const uintptr_t dst_strides[4], uintptr_t element_size, uintptr_t block_size) + +DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nhwc_any); +DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nchw_any); + +#undef DECLARE_DEPTHTOSPACE_KERNEL + } // namespace cpu } // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H diff --git a/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp new file mode 100644 index 0000000000..0277690112 --- /dev/null +++ b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Error.h"
+
+#include <cstdint>
+#include <cstring>
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+void depth_to_space_nchw_any( //
+    const uint8_t *src,
+    uint8_t *dst,
+    const uintptr_t src_shape[4],
+    const uintptr_t src_strides[4],
+    const uintptr_t dst_strides[4],
+    uintptr_t element_size,
+    uintptr_t block_size)
+{
+    ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size);
+    ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size);
+
+    const auto dst_channels = src_shape[2] / (block_size * block_size);
+    const auto src_block_col_stride = dst_channels * src_strides[2];
+    const auto src_block_row_stride = block_size * dst_channels * src_strides[2];
+
+    auto *src_batch_ptr = src;
+    auto *dst_batch_ptr = dst;
+
+    for (uintptr_t batch = 0; batch < src_shape[3]; ++batch)
+    {
+        auto *src_channel_ptr = src_batch_ptr;
+        auto *dst_channel_ptr = dst_batch_ptr;
+
+        for (uintptr_t channel = 0; channel < dst_channels; ++channel)
+        {
+            auto *src_height_block_ptr = src_channel_ptr;
+            auto *dst_row_ptr = dst_channel_ptr;
+
+            for (uintptr_t height_block = 0; height_block < src_shape[1]; ++height_block)
+            {
+                auto *src_block_row_ptr = src_height_block_ptr;
+
+                for (uintptr_t block_row = 0; block_row < block_size; ++block_row)
+                {
+                    auto *src_width_block_ptr = src_block_row_ptr;
+                    auto *dst_col_ptr = dst_row_ptr;
+
+                    for (uintptr_t width_block = 0; width_block < src_shape[0]; ++width_block)
+                    {
+                        auto *src_block_col_ptr = src_width_block_ptr;
+
+                        for (uintptr_t block_col = 0; block_col < block_size; ++block_col)
+                        {
+                            // The source pointer is accumulated as:
+                            //
+                            // src_block_col_ptr =
+                            //     src +
+                            //     batch * src_strides[3] +
+                            //     (channel + (block_row * block_size + block_col) * dst_channels) * src_strides[2] +
+                            //     height_block * src_strides[1] +
+                            //     width_block * element_size;
+                            //
+                            // The destination pointer is accumulated as:
+                            //
+                            // dst_col_ptr =
+                            //     dst +
+                            //     batch * dst_strides[3] +
+                            //     channel * dst_strides[2] +
+                            //     (height_block * block_size + block_row) * dst_strides[1] +
+                            //     (width_block * block_size + block_col) * element_size
+
+                            std::memcpy(dst_col_ptr, src_block_col_ptr, element_size);
+
+                            src_block_col_ptr += src_block_col_stride;
+                            dst_col_ptr += element_size;
+                        }
+
+                        src_width_block_ptr += element_size;
+                    }
+
+                    src_block_row_ptr += src_block_row_stride;
+                    dst_row_ptr += dst_strides[1];
+                }
+
+                src_height_block_ptr += src_strides[1];
+            }
+
+            src_channel_ptr += src_strides[2];
+            dst_channel_ptr += dst_strides[2];
+        }
+
+        src_batch_ptr += src_strides[3];
+        dst_batch_ptr += dst_strides[3];
+    }
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp
new file mode 100644
index 0000000000..b1c84599dc
--- /dev/null
+++ b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
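To make the stride conventions of depth_to_space_nchw_any concrete, here is a hypothetical driver for a small contiguous FP32 NCHW tensor; the sizes and the surrounding scaffolding are illustrative only. Shapes are passed innermost-first as {W, H, C, N}, strides are in bytes, and both src_strides[0] and dst_strides[0] must equal the element size.

#include <cstdint>
#include <vector>

#include "src/cpu/kernels/depth_to_space/list.h"

void depth_to_space_example()
{
    // Illustrative sizes: N = 1, C = 4, H = 2, W = 3, block_size = 2, so the output is N = 1, C = 1, H = 4, W = 6.
    const uintptr_t W = 3, H = 2, C = 4, N = 1, bs = 2, es = sizeof(float);

    std::vector<float> src(N * C * H * W), dst(N * C * H * W);

    const uintptr_t src_shape[4]   = {W, H, C, N};
    const uintptr_t src_strides[4] = {es, W * es, W * H * es, W * H * C * es};
    // Contiguous NCHW destination of shape {W * bs, H * bs, C / (bs * bs), N}.
    const uintptr_t dst_strides[4] = {es, W * bs * es, W * bs * H * bs * es, W * bs * H * bs * (C / (bs * bs)) * es};

    arm_compute::cpu::depth_to_space_nchw_any(reinterpret_cast<const uint8_t *>(src.data()),
                                              reinterpret_cast<uint8_t *>(dst.data()), src_shape, src_strides,
                                              dst_strides, es, bs);
}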
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Error.h" + +#include +#include + +namespace arm_compute +{ +namespace cpu +{ + +void depth_to_space_nhwc_any( // + const uint8_t *src, + uint8_t *dst, + const uintptr_t src_shape[4], + const uintptr_t src_strides[4], + const uintptr_t dst_strides[4], + uintptr_t element_size, + uintptr_t block_size) +{ + ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size); + ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size); + + const auto src_block_row_stride = (src_shape[0] / block_size) * element_size; + const auto dst_width_block_stride = block_size * dst_strides[1]; + + auto *src_batch_ptr = src; + auto *dst_batch_ptr = dst; + + for (uintptr_t batch = 0; batch < src_shape[3]; ++batch) + { + auto *src_height_block_ptr = src_batch_ptr; + auto *dst_row_ptr = dst_batch_ptr; + + for (uintptr_t height_block = 0; height_block < src_shape[2]; ++height_block) + { + auto *src_block_row_ptr = src_height_block_ptr; + + for (uintptr_t block_row = 0; block_row < block_size; ++block_row) + { + auto *src_width_block_ptr = src_block_row_ptr; + auto *dst_width_block_ptr = dst_row_ptr; + + for (uintptr_t width_block = 0; width_block < src_shape[1]; ++width_block) + { + // The source pointer is accumulated as: + // + // src_width_block_ptr = + // src + + // batch * src_strides[3] + + // height_block * src_strides[2] + + // width_block * src_strides[1] + + // block_row * (src_shape[0] / block_size) * element_size; + // + // The destination pointer is accumulated as: + // + // dst_width_block_ptr = + // dst + + // batch * dst_strides[3] + + // (height_block * block_size + block_row) * dst_strides[2] + + // width_block * block_size * dst_strides[1]; + + std::memcpy(dst_width_block_ptr, src_width_block_ptr, src_block_row_stride); + + src_width_block_ptr += src_strides[1]; + dst_width_block_ptr += dst_width_block_stride; + } + + src_block_row_ptr += src_block_row_stride; + dst_row_ptr += dst_strides[2]; + } + + src_height_block_ptr += src_strides[2]; + } + + src_batch_ptr += src_strides[3]; + dst_batch_ptr += dst_strides[3]; + } +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h index d807148e37..0c90abccb1 100644 --- a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h +++ b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h @@ -32,6 +32,124 
@@ namespace arm_compute { namespace cpu { +template +void batch_normalization_nchw(const Window &window, + ITensor *in, + ITensor *out, + const ITensor *in_mean, + const ITensor *in_var, + const ITensor *in_beta, + const ITensor *in_gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win_to_use = window; + win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win_to_use); + Iterator output(out, win_to_use); + + F activation_functor(act_info); + + // Hold information about the current feature map we are iterating. + // Only compute denominator and constants once per feature map. + int slice = -1; + + const auto input_mean = reinterpret_cast(in_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(in_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (in_gamma != nullptr) ? reinterpret_cast(in_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (in_beta != nullptr) ? reinterpret_cast(in_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + + T mean = static_cast(0); + T var = static_cast(0); + T gamma = static_cast(1); + T beta = static_cast(0); + T denominator = static_cast(0); + + auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + auto var_vec = wrapper::vdup_n(var, ExactTagType{}); + auto gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + auto beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{}); + const auto epsilon_vec = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); + execute_window_loop( + win_to_use, + [&](const Coordinates &id) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + if (slice != id.z()) + { + mean = input_mean[id.z()]; + var = input_var[id.z()]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + if (input_gamma != nullptr) + { + gamma = input_gamma[id.z()]; + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + } + if (input_beta != nullptr) + { + beta = input_beta[id.z()]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Calculate denominator + denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + denominator = wrapper::vgetlane(denominator_vec, 0); + slice = id.z(); + } + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator_vec); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const T numerator = input_ptr[x] - mean; + const T x_bar = numerator * denominator; + T res = beta + x_bar * gamma; + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + *(output_ptr + x) = res; + } + }, + input, output); +} 
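As a reading aid for the vectorized template above: for each NCHW feature map (one z slice), the loop applies the standard batch-normalization expression per element, optionally followed by the fused activation. A minimal scalar sketch of that arithmetic follows; the helper name is hypothetical and not part of the patch.

#include <cmath>
#include <cstddef>

// out[i] = beta + gamma * (in[i] - mean) / sqrt(var + epsilon), with mean/var/gamma/beta
// taken once per channel, exactly as the NEON loop caches them per z slice.
void batch_norm_channel_ref(
    const float *in, float *out, std::size_t n, float mean, float var, float gamma, float beta, float epsilon)
{
    const float denominator = 1.0f / std::sqrt(var + epsilon);
    for (std::size_t i = 0; i < n; ++i)
    {
        const float x_bar = (in[i] - mean) * denominator;
        out[i]            = beta + x_bar * gamma; // a fused activation, if any, would be applied here
    }
}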
+ template void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp new file mode 100644 index 0000000000..ae4c7e5736 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp16_batch_normalization_nchw_non_fused(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, + gamma, epsilon, act_info); +} + +void fp16_batch_normalization_nchw_non_fused_relu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp16_batch_normalization_nchw_non_fused_brelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, + gamma, epsilon, act_info); +} + +void fp16_batch_normalization_nchw_non_fused_lubrelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, + gamma, epsilon, act_info); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && 
defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp new file mode 100644 index 0000000000..ae2db1ac66 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp32_batch_normalization_nchw_non_fused(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp32_batch_normalization_nchw_non_fused_relu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp32_batch_normalization_nchw_non_fused_brelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp32_batch_normalization_nchw_non_fused_lubrelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp index 32d9ca4eac..296fe88791 100644 --- 
a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp @@ -239,12 +239,12 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); break; -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(ENABLE_FP16_KERNELS) case DataType::F16: create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name); break; -#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(ENABLE_FP16_KERNELS) case DataType::F32: create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name); break; diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp index a161c800fd..9ba2451482 100644 --- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -79,11 +79,11 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, create_arm_pooling(src, dst, info, cpu_info); } break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if defined(ENABLE_FP16_KERNELS) case DataType::F16: create_arm_pooling(src, dst, info, cpu_info); break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif // defined(ENABLE_FP16_KERNELS) case DataType::F32: create_arm_pooling(src, dst, info, cpu_info); break; diff --git a/src/cpu/kernels/lut/generic/sve/u16.cpp b/src/cpu/kernels/lut/generic/sve/u16.cpp new file mode 100644 index 0000000000..75b8dcaae2 --- /dev/null +++ b/src/cpu/kernels/lut/generic/sve/u16.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Error.h" + +#include "src/cpu/kernels/lut/list.h" + +#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include + +namespace arm_compute +{ +namespace cpu +{ +void lut_u16_sve(const uint16_t *table, size_t num_strings, size_t size, const uint16_t *input, uint16_t *output) +{ + int64_t cnth = svcnth(); + int64_t tail = size & (4 * cnth - 1); + int64_t count = size - tail; + int64_t pos = 0; + ARM_COMPUTE_UNUSED(num_strings); + __asm __volatile("cbz %[count], 2f\n" + "mov z31.s, #0\n" + "cnth x7, ALL, MUL #4\n" + "cntb x8, ALL, MUL #4\n" + "ptrue p0.b\n" + "1:" + "ld1h z0.h, p0/z, [%[input]]\n" + "ld1h z1.h, p0/z, [%[input], #1, MUL VL]\n" + "ld1h z2.h, p0/z, [%[input], #2, MUL VL]\n" + "ld1h z3.h, p0/z, [%[input], #3, MUL VL]\n" + "add %[input], %[input], x8\n" + + "zip1 z8.h, z0.h, z31.h\n" + "ld1h z8.s, p0/z, [%[table], z8.s, UXTW #1]\n" + "zip2 z0.h, z0.h, z31.h\n" + "ld1h z0.s, p0/z, [%[table], z0.s, UXTW #1]\n" + "uzp1 z0.h, z8.h, z0.h\n" + "st1h z0.h, p0, [%[output]]\n" + + "zip1 z10.h, z1.h, z31.h\n" + "ld1h z10.s, p0/z, [%[table], z10.s, UXTW #1]\n" + "zip2 z1.h, z1.h, z31.h\n" + "ld1h z1.s, p0/z, [%[table], z1.s, UXTW #1]\n" + "uzp1 z1.h, z10.h, z1.h\n" + "st1h z1.h, p0, [%[output], #1, MUL VL]\n" + + "zip1 z12.h, z2.h, z31.h\n" + "ld1h z12.s, p0/z, [%[table], z12.s, UXTW #1]\n" + "zip2 z2.h, z2.h, z31.h\n" + "ld1h z2.s, p0/z, [%[table], z2.s, UXTW #1]\n" + "uzp1 z2.h, z12.h, z2.h\n" + "st1h z2.h, p0, [%[output], #2, MUL VL]\n" + + "zip1 z14.h, z3.h, z31.h\n" + "ld1h z14.s, p0/z, [%[table], z14.s, UXTW #1]\n" + "zip2 z3.h, z3.h, z31.h\n" + "ld1h z3.s, p0/z, [%[table], z3.s, UXTW #1]\n" + "uzp1 z3.h, z14.h, z3.h\n" + "st1h z3.h, p0, [%[output], #3, MUL VL]\n" + + "add %[pos], %[pos], x7\n" + "add %[output], %[output], x8\n" + "cmp %[pos], %[count]\n" + "blt 1b\n" + "2:\n" + : [count] "+r"(count), [input] "+r"(input), [output] "+r"(output), [pos] "+r"(pos) + : [table] "r"(table) + : "memory", "cc", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", + "z14", "z31", "p0", "p1", "z2", "z3", "z4", "x7", "x8"); + for (int i = 0; i < tail; i++) + { + output[i] = table[input[i]]; + } +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SVE +#endif // __aarch64__ diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h index da90346267..9acfe97728 100644 --- a/src/cpu/kernels/lut/list.h +++ b/src/cpu/kernels/lut/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Arm Limited. + * Copyright (c) 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,8 +22,8 @@ * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_LUT_LIST_H -#define SRC_CORE_NEON_KERNELS_LUT_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_LUT_LIST_H +#define ACL_SRC_CPU_KERNELS_LUT_LIST_H #include #include @@ -34,17 +34,27 @@ namespace cpu { #ifdef __aarch64__ -#define DECLARE_LUT_KERNEL(func_name) \ +#define DECLARE_LUT_U8_KERNEL(func_name) \ void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \ uint8_t *const *output) -DECLARE_LUT_KERNEL(lut_u8_neon); -DECLARE_LUT_KERNEL(lut_u8_sve2); +DECLARE_LUT_U8_KERNEL(lut_u8_neon); +DECLARE_LUT_U8_KERNEL(lut_u8_sve2); + +#undef DECLARE_LUT_U8_KERNEL + +#define DECLARE_LUT_U16_KERNEL(func_name) \ + void func_name(const uint16_t *table, size_t num_strings, size_t string_length, const uint16_t *input, \ + uint16_t *output) + +DECLARE_LUT_U16_KERNEL(lut_u16_neon); +DECLARE_LUT_U16_KERNEL(lut_u16_sve); + +#undef DECLARE_LUT_U16_KERNEL -#undef DECLARE_LUT_KERNEL #endif // __aarch64__ } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_LUT_LIST_H +#endif // ACL_SRC_CPU_KERNELS_LUT_LIST_H diff --git a/src/cpu/kernels/mul/generic/neon/fp16.cpp b/src/cpu/kernels/mul/generic/neon/fp16.cpp new file mode 100644 index 0000000000..920f298527 --- /dev/null +++ b/src/cpu/kernels/mul/generic/neon/fp16.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" + +namespace arm_compute +{ +namespace cpu +{ +void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + constexpr int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float16x8x2_t broadcast_value_vec = {{ + vdupq_n_f16(broadcast_value), + vdupq_n_f16(broadcast_value), + }}; + const auto scale_vec = vdupq_n_f16(scale); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t non_broadcast_v = {{ + vld1q_f16(non_broadcast_input_ptr + x), + vld1q_f16(non_broadcast_input_ptr + x + 8), + }}; + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), + vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + // Compute window_step_x elements per iteration + int x = 
window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t ta1 = {{ + vld1q_f16(input1_ptr + x), + vld1q_f16(input1_ptr + x + 8), + }}; + const float16x8x2_t ta2 = {{ + vld1q_f16(input2_ptr + x), + vld1q_f16(input2_ptr + x + 8), + }}; + const float16x8_t scale_vec = vdupq_n_f16(scale); + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), + vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/mul/generic/neon/fp32.cpp b/src/cpu/kernels/mul/generic/neon/fp32.cpp new file mode 100644 index 0000000000..3001eb5110 --- /dev/null +++ b/src/cpu/kernels/mul/generic/neon/fp32.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" + +namespace arm_compute +{ +namespace cpu +{ +void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(float); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto ta1 = wrapper::vloadq(input1_ptr + x); + const auto 
ta2 = wrapper::vloadq(input2_ptr + x); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/mul/generic/neon/list.h similarity index 73% rename from src/cpu/kernels/softmax/generic/sve/qasymm8.cpp rename to src/cpu/kernels/mul/generic/neon/list.h index 85e5ccfea1..710cb68b72 100644 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp +++ b/src/cpu/kernels/mul/generic/neon/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,18 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - +#ifndef ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H namespace arm_compute { namespace cpu { -void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max(in, out, window); -} +#define DECLARE_MUL_KERNEL(func_name) \ + void func_name(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) + +DECLARE_MUL_KERNEL(mul_F32_F32_F32); +DECLARE_MUL_KERNEL(mul_F16_F16_F16); +#undef DECLARE_MUL_KERNEL } // namespace cpu } // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/norm_layer/generic/neon/fp16.cpp similarity index 50% rename from src/cpu/kernels/softmax/generic/sve/fp16.cpp rename to src/cpu/kernels/norm_layer/generic/neon/fp16.cpp index 5e94f72faf..f85fe7a31a 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp +++ b/src/cpu/kernels/norm_layer/generic/neon/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,29 +22,46 @@ * SOFTWARE. 
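The two pixel-wise multiplication kernels above implement the same element-wise operation and differ only in vector width and in how an input broadcast along x is handled: out = src1 * src2 * scale, where an input whose x-extent is 1 is reused for every element along x. A hypothetical scalar reference (not part of the patch):

#include <cstddef>

// Scalar reference for one row of mul_F32_F32_F32 / mul_F16_F16_F16.
// When broadcast_src2 is true, src2 holds a single value that is broadcast along x.
void mul_row_ref(const float *src1, const float *src2, float *out, std::size_t n, float scale, bool broadcast_src2)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        const float rhs = broadcast_src2 ? src2[0] : src2[i];
        out[i]          = src1[i] * rhs * scale;
    }
}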
*/ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "arm_compute/core/Helpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/CpuTypes.h" -#include "src/cpu/kernels/softmax/generic/sve/impl.h" +#include "src/cpu/kernels/norm_layer/generic/neon/impl.h" + namespace arm_compute { namespace cpu { -void sve_fp16_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) + +void neon_normalize_float16_8_0_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float16_8_0( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float16_8_1_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float16_8_1( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) { - return sve_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); + arm_compute::normalize_float(window, in, in_squared, out, ninfo); } -void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window) +void neon_normalize_float16_8_2( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) { - return sve_logits_1d_max(in, out, window); + arm_compute::normalize_float(window, in, in_squared, out, ninfo); } + } // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp b/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp new file mode 100644 index 0000000000..0b64f46956 --- /dev/null +++ b/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/norm_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_normalize_float32_4_0_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_0( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_1_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_1( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_2( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float(window, in, in_squared, out, ninfo); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/norm_layer/generic/neon/impl.h b/src/cpu/kernels/norm_layer/generic/neon/impl.h new file mode 100644 index 0000000000..6103165679 --- /dev/null +++ b/src/cpu/kernels/norm_layer/generic/neon/impl.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2017-2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/NormalizationHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +/** Function to perform normalization depending on the given template + * dimension. The second template parameter specifies whether the + * normalization has to be 1D or 2D. 
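 * (Illustrative summary, not part of the original comment: in scalar form the kernel computes,
 *  per element,
 *      out(x) = in(x) / std::pow(ninfo.kappa() + ninfo.scale_coeff() * sum_sq(x), ninfo.beta())
 *  where sum_sq(x) accumulates in_squared over the norm_size window around x, either along one
 *  dimension (1D) or over X and Y (2D), matching the sequential_normalization path below.)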
+ * + * @note Only supported normalizations are: + * - 1D over X or Z + * - 2D over X and Y + * + * @param[in] window Region on which to execute the kernel. + * @param[in] in Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. Data layouts supported: NCHW/NHWC. + * @param[in] in_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM], + * Data type and layout supported: same as @p input. + * @param[in] out Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input. + * @param[in] ninfo Normalization layer information like the normalization type, normalization size and other parameters. + */ +template +void normalize_float( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const int window_step_x = S; + + Iterator input(in, win); + Iterator input_squared(in_squared, win); + Iterator output(out, win); + + const int dim_y = in->info()->data_layout() == DataLayout::NCHW ? 1 : 2; + const int radius = ninfo.norm_size() / 2; + const int input_squared_stride_x = in_squared->info()->strides_in_bytes()[0]; + const int input_squared_stride_slice = in_squared->info()->strides_in_bytes()[dim]; + const int input_squared_stride_row = in_squared->info()->strides_in_bytes()[dim_y]; + + const int max_right = in->info()->dimension(dim) - 1; + const int max_bottom = in->info()->dimension(dim_y) - 1; + + const auto coeff_vec = wrapper::vdup_n(static_cast(ninfo.scale_coeff()), ExactTagType{}); + const auto beta_vec = wrapper::vdup_n(static_cast(ninfo.beta()), ExactTagType{}); + const auto kappa_vec = wrapper::vdup_n(static_cast(ninfo.kappa()), ExactTagType{}); + + auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row, + const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr, + T *output_ptr) + { + const int current_slice = dim == 0 ? 
x : id[dim]; + const int first_slice = std::max(current_slice - radius, 0); + const int last_slice = std::min(current_slice + radius, max_right); + + const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x; + // Accumulate 2D In-Map values + auto accu = static_cast(0.f); + for (int j = first_row; j <= last_row; ++j) + { + // Compute row displacement + const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; + for (int i = first_slice; i <= last_slice; ++i) + { + accu += + *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); + } + } + + // Normalize + const auto normalized = + std::pow(accu * static_cast(ninfo.scale_coeff()) + static_cast(ninfo.kappa()), ninfo.beta()); + const auto normalized_pixel = (*(input_ptr + x)) / normalized; + *(output_ptr + x) = normalized_pixel; + }; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; + const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; + + int x = window_start_x; + // Compute serially starting elements for the case x dimension is width + for (; x < radius && x < window_end_x && dim == 0; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } + + // Compute vectorized + for (; x <= window_end_x - window_step_x - radius; x += window_step_x) + { + const int current_slice = dim == 0 ? x : id[dim]; + const int first_slice = std::max(current_slice - radius, 0); + const int last_slice = std::min(current_slice + radius, max_right); + + const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; + // Accumulate 2D In-Map values + auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + for (int j = first_row; j <= last_row; ++j) + { + // Compute row displacement + const uint8_t *const input_squared_ptr = + input_squared_x_ptr + (j - current_row) * input_squared_stride_row; + for (int i = first_slice; i <= last_slice; ++i) + { + accu = wrapper::vadd( + accu, wrapper::vloadq(reinterpret_cast( + input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + } + } + + // Normalize + const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); + const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); + wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } + }, + input, input_squared, output); +} + +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/norm_layer/generic/neon/list.h similarity index 52% rename from src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp rename to src/cpu/kernels/norm_layer/generic/neon/list.h index 95623786b3..f2e83d7af1 100644 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/norm_layer/generic/neon/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm 
Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,24 +21,29 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve2/impl.h" - +#ifndef ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H namespace arm_compute { namespace cpu { -void sve2_qasymm8_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve2_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); -} + +#define DECLARE_NORMALIZATION_KERNEL(func_name) \ + void func_name(const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, \ + NormalizationLayerInfo ninfo) + +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_0_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_0); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_1_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_1); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_2); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_0_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_0); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_1_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_1); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_2); + +#undef DECLARE_NORMALIZATION_KERNEL } // namespace cpu } // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index 2e2adf33e0..db8f881712 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -31,21 +31,18 @@ namespace arm_compute { namespace cpu { -void neon_fp16_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return neon_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); -} -void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window) +template +void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_logits_1d_max(in, out, window); + return neon_softmax_float(in, tmp, out, beta, window); } + +template void +neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index 61df40c1b5..c281d1bf31 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
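// Note: the template parameter lists appear to have been stripped from this extract. Based on the
// explicit instantiations above, the refactored softmax entry points are presumably declared along
// these lines, with the former runtime is_log flag promoted to a compile-time IS_LOG parameter
// (reconstruction, not patch text):
//
//   template <bool IS_LOG>
//   void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out,
//                          const float beta, const Window &window);
//
//   template void neon_fp16_softmax<true>(const ITensor *, void *const, ITensor *,
//                                         const float, const Window &);
//   template void neon_fp16_softmax<false>(const ITensor *, void *const, ITensor *,
//                                          const float, const Window &);
//
// The separate *_logits (1D max) kernels disappear because the max is now computed inside the
// same softmax kernel.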
* * SPDX-License-Identifier: MIT * @@ -29,20 +29,17 @@ namespace arm_compute { namespace cpu { -void neon_fp32_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return neon_softmax_logits_1d_float(in, max, tmp, out, beta, is_log, window); -} -void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window) +template +void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_logits_1d_max(in, out, window); + return neon_softmax_float(in, tmp, out, beta, window); } + +template void +neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index 5d6e6a4f80..487f6ae051 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -29,43 +29,76 @@ namespace arm_compute { namespace cpu { -template void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); - -template -void neon_softmax_logits_1d_quantized( - const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) +template +void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window) { static_assert(std::is_same::value || std::is_same::value, "quantized type should be either qasymm8_t or qasymm8_signed_t."); - const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = vdupq_n_f32(scale_beta); + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator out_it(out, window); - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); constexpr int vec_size = 16; +#ifndef __aarch64__ + const int sum_stages = log2(vec_size >> 1); +#endif // __aarch64__ + + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + execute_window_loop( window, [&](const Coordinates &) { /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); + const T *in_ptr = reinterpret_cast(in_it.ptr()); + T *out_ptr = reinterpret_cast(out_it.ptr()); + float *tmp_ptr = reinterpret_cast(tmp); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); + int x = 0; - float sum{}; - float sum_inversed{}; + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 
0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ + + // Compute left-over elements + for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // Compute Max + + float sum_transformed{}; /* Compute exponentials and sum */ { /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); /* Init sum to zero */ @@ -80,11 +113,11 @@ void neon_softmax_logits_1d_quantized( int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float(vec_elements); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + float32x4x4_t vec_elements_flt = convert_int_to_float(vec_elements); - if (is_log) + if (IS_LOG) { vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); @@ -111,17 +144,24 @@ void neon_softmax_logits_1d_quantized( } /* Reduce sum */ - const auto sum_16_byte = + const float32x4_t sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + + float sum; + +#ifdef __aarch64__ + sum = wrapper::vaddv(sum_16_byte); +#else // __aarch64__ auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); sum_res = vpadd_f32(sum_res, sum_res); sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ /* Run remaining elements */ for (; x < input_width; ++x) { float element{}; - if (is_log) + if (IS_LOG) { element = (max_val - in_ptr[x]) * scale_beta; sum += std::exp(element); @@ -135,19 +175,22 @@ void neon_softmax_logits_1d_quantized( tmp_ptr[x] = element; } - if (!is_log) + if (!IS_LOG) { - sum_inversed = 256.f / sum; + sum_transformed = 256.f / sum; } else { - sum = std::log(sum); + sum_transformed = std::log(sum); } - } + } // Compute exponentials and sum /* Normalize exponentials */ { constexpr bool is_qasymm8_signed = std::is_same::value; + + const float32x4_t sum_vec = vdupq_n_f32(sum_transformed); + /* Loop over row and compute softmax */ int x = 0; for (; x <= (input_width - vec_size); x += vec_size) @@ -155,23 +198,23 @@ void neon_softmax_logits_1d_quantized( using int_vec_type = wrapper::traits::neon_vector_t; float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); int_vec_type normalized_value{}; - if (is_log) + if (IS_LOG) { const float32x4x4_t sub = { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[0], sum_vec), + vsubq_f32(vec_in.val[1], sum_vec), + vsubq_f32(vec_in.val[2], sum_vec), + vsubq_f32(vec_in.val[3], sum_vec), }; normalized_value = convert_float_to_int(sub); } else { float32x4x4_t mul = { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[0], sum_vec), + vmulq_f32(vec_in.val[1], sum_vec), + vmulq_f32(vec_in.val[2], sum_vec), + vmulq_f32(vec_in.val[3], sum_vec), }; if (is_qasymm8_signed) @@ -190,34 +233,31 @@ void neon_softmax_logits_1d_quantized( 
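// For reference: on AArch64 the horizontal max/sum reductions above use wrapper::vmaxv /
// wrapper::vaddv directly, while other targets fall back to log2(vec_size / 2) pairwise folding
// stages. A minimal scalar sketch of that folding idea (assumes a power-of-two lane count;
// illustrative only, not library code):
#include <algorithm>
#include <cstddef>

float pairwise_max(float *lanes, std::size_t n)
{
    for (std::size_t width = n / 2; width >= 1; width /= 2)
    {
        for (std::size_t i = 0; i < width; ++i)
        {
            lanes[i] = std::max(lanes[i], lanes[i + width]); // one vpmax-like stage
        }
    }
    return lanes[0];
}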
/* Run remaining elements */ for (; x < input_width; ++x) { - if (is_log) + if (IS_LOG) { - out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); + out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum_transformed); } else { - out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - + out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_transformed) - (is_qasymm8_signed ? 128.f : 0)); } } - } + } // Normalize exponentials }, - in_it, max_it, out_it); + in_it, out_it); } -template void neon_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); -template void neon_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); +template void neon_softmax_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h index 4d9b789297..60380cd233 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.h +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H #include "arm_compute/core/Helpers.h" @@ -33,105 +33,100 @@ namespace arm_compute { namespace cpu { -template -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{window}; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - const int sum_stages = log2(window_step_x / 2); - execute_window_loop( - win, - [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); - int x = window_start_x; - - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - - for (int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); - // Compute left-over elements - for (; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? 
*(in_ptr + x) : max_val; - } +#ifdef __aarch64__ +namespace +{ +// These helper functions are added because vaddv does not exist for fp16, +// and, therefore, is not part of the wrapper::vaddv interface. +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages) +{ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a)); + for (int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + return wrapper::vgetlane(sum_res, 0); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - *out_ptr = max_val; - }, - input, output); +inline float wrapper_vaddv(const float32x4_t &a, int sum_stages) +{ + ARM_COMPUTE_UNUSED(sum_stages); + return wrapper::vaddv(a); } +} // namespace +#endif // __aarch64__ -template -void neon_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); - -template -void neon_softmax_logits_1d_float(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +// The template implementation for float data types is stored in the header file because +// we need all fp16 instantiated code to live in fp16.cpp files. +template +void neon_softmax_float(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window) { - const int start_x = in->info()->valid_region().anchor.x(); + ARM_COMPUTE_UNUSED(tmp); + const int input_width = in->info()->valid_region().shape.x(); Iterator in_it(in, window); - Iterator max_it(max, window); Iterator out_it(out, window); /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - constexpr int vec_size = 16 / sizeof(T); - const int sum_stages = log2(vec_size / 2); + constexpr int vec_size = 16 / sizeof(T); + + const int sum_stages = log2(vec_size >> 1); + + const auto beta_vec = wrapper::vdup_n(static_cast(beta), ExactTagType{}); execute_window_loop( window, [&](const Coordinates &) { /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); + const T *in_ptr = reinterpret_cast(in_it.ptr()); + T *out_ptr = reinterpret_cast(out_it.ptr()); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); + int x = 0; + + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ - T sum{}; - T sum_inversed{}; + // Compute left-over elements + for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // compute max + + T sum_transformed{}; /* Compute exponentials and sum */ { /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); /* Init sum to zero */ @@ -143,35 +138,38 @@ void neon_softmax_logits_1d_float(const ITensor *in, { auto vec_elements = 
wrapper::vloadq(in_ptr + x); vec_elements = wrapper::vsub(vec_elements, vec_max); - if (is_log) + if (IS_LOG) { - vec_elements = - wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + vec_elements = wrapper::vmul(vec_elements, beta_vec); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); } else { - vec_elements = wrapper::vexpq( - wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); + vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec)); + vec_sum = wrapper::vadd(vec_sum, vec_elements); } - wrapper::vstore(tmp_ptr + x, vec_elements); + wrapper::vstore(out_ptr + x, vec_elements); } /* Reduce sum */ + T sum{}; +#ifdef __aarch64__ + sum = wrapper_vaddv(vec_sum, sum_stages); +#else // __aarch64__ auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); for (int i = 0; i < sum_stages; ++i) { sum_res = wrapper::vpadd(sum_res, sum_res); } sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ /* Run remaining elements */ for (; x < input_width; ++x) { T element{}; - if (is_log) + if (IS_LOG) { element = (in_ptr[x] - max_val) * beta; sum += std::exp(element); @@ -181,55 +179,59 @@ void neon_softmax_logits_1d_float(const ITensor *in, element = std::exp((in_ptr[x] - max_val) * beta); sum += element; } - tmp_ptr[x] = element; + + out_ptr[x] = element; } - if (!is_log) + if (!IS_LOG) { - sum_inversed = T(1) / sum; + sum_transformed = T(1) / sum; } else { - sum = static_cast(std::log(sum)); + sum_transformed = static_cast(std::log(sum)); } - } + } // Compute exponentials and sum /* Normalize exponentials */ { + const auto sum_vec = wrapper::vdup_n(static_cast(sum_transformed), ExactTagType{}); + /* Loop over row and compute softmax */ int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); - if (is_log) + const auto vec_in = wrapper::vloadq(out_ptr + x); + if (IS_LOG) { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); + wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec)); } else { - normalized_value = - wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); + wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec)); } - wrapper::vstore(out_ptr + x, normalized_value); } + /* Run remaining elements */ for (; x < input_width; ++x) { - if (is_log) + if (IS_LOG) { - out_ptr[x] = tmp_ptr[x] - sum; + out_ptr[x] = out_ptr[x] - sum_transformed; } else { - out_ptr[x] = tmp_ptr[x] * sum_inversed; + out_ptr[x] = out_ptr[x] * sum_transformed; } } - } + } // Normalize exponentials }, - in_it, max_it, out_it); + in_it, out_it); } + +template +void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H */ +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index 40713dc496..9589ebcd7c 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
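// For reference: a scalar sketch of what the single-pass neon_softmax_float above computes per
// row, with IS_LOG as a compile-time switch and the exponentials staged directly in the output
// buffer (tmp is unused for float types). The function name is illustrative only.
#include <algorithm>
#include <cmath>
#include <cstddef>

template <typename T, bool IS_LOG>
void softmax_row_reference(const T *in, T *out, std::size_t width, float beta)
{
    const T max_val = *std::max_element(in, in + width);

    T sum = T(0);
    for (std::size_t i = 0; i < width; ++i)
    {
        const T shifted = (in[i] - max_val) * static_cast<T>(beta);
        out[i]          = IS_LOG ? shifted : static_cast<T>(std::exp(shifted));
        sum += IS_LOG ? static_cast<T>(std::exp(shifted)) : out[i];
    }

    // sum_transformed is log(sum) for log-softmax and 1/sum otherwise
    const T sum_transformed = IS_LOG ? static_cast<T>(std::log(sum)) : T(1) / sum;
    for (std::size_t i = 0; i < width; ++i)
    {
        out[i] = IS_LOG ? out[i] - sum_transformed : out[i] * sum_transformed;
    }
}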
* * SPDX-License-Identifier: MIT * @@ -29,20 +29,16 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +template +void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); + return neon_softmax_quantized(in, tmp, out, beta, window); } -void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return neon_logits_1d_max(in, out, window); -} +template void +neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 2c5e284f54..0bf6b2859a 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,20 +29,17 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +template +void neon_qasymm8_signed_softmax( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); + return neon_softmax_quantized(in, tmp, out, beta, window); } -void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return neon_logits_1d_max(in, out, window); -} +template void neon_qasymm8_signed_softmax( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void neon_qasymm8_signed_softmax( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp index 24f1bb8143..0d4b7f4509 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,9 @@ namespace arm_compute { namespace cpu { +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to +/// a single kernel that performs softmax operation. Leaving the SVE code here for +/// future references. 
Implementation for Neon(TM) is introduced in COMPMID-6500 template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) { @@ -172,25 +175,5 @@ void sve_softmax_logits_1d_float(const ITensor *in, }, in_it, max_it, out_it); } - -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); - -template void sve_softmax_logits_1d_float(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window); -template void sve_softmax_logits_1d_float(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp deleted file mode 100644 index 4be2e2eed6..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp index 98b2f5117f..a8fb1d4adf 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,6 +32,9 @@ namespace arm_compute { namespace cpu { +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to +/// a single kernel that performs softmax operation. Leaving the SVE2 code here for +/// future references. 
Implementation for Neon(TM) is introduced in COMPMID-6500 template void sve2_softmax_logits_1d_quantized( const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) @@ -205,20 +208,5 @@ void sve2_softmax_logits_1d_quantized( }, in_it, max_it, out_it); } - -template void sve2_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); -template void sve2_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp deleted file mode 100644 index c20462fcef..0000000000 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve2/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve2_qasymm8_signed_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve2_softmax_logits_1d_quantized(in, max, tmp, out, beta, is_log, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h index 627ce0c264..c143f6659d 100644 --- a/src/cpu/kernels/softmax/list.h +++ b/src/cpu/kernels/softmax/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,41 +21,24 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H namespace arm_compute { namespace cpu { -#define DECLARE_SOFTMAX_KERNEL(func_name) \ - void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \ - bool is_log, const Window &window) +#define DECLARE_SOFTMAX_KERNEL(func_name) \ + template \ + void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax); DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax); DECLARE_SOFTMAX_KERNEL(neon_qasymm8_softmax); DECLARE_SOFTMAX_KERNEL(neon_qasymm8_signed_softmax); -DECLARE_SOFTMAX_KERNEL(sve_fp32_softmax); -DECLARE_SOFTMAX_KERNEL(sve_fp16_softmax); -DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_signed_softmax); -DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax); #undef DECLARE_SOFTMAX_KERNEL - -#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window) - -DECLARE_LOGITS_KERNEL(neon_fp32_logits); -DECLARE_LOGITS_KERNEL(neon_fp16_logits); -DECLARE_LOGITS_KERNEL(neon_qasymm8_logits); -DECLARE_LOGITS_KERNEL(neon_qasymm8_singed_logits); -DECLARE_LOGITS_KERNEL(sve_fp32_logits); -DECLARE_LOGITS_KERNEL(sve_fp16_logits); -DECLARE_LOGITS_KERNEL(sve_qasymm8_logits); -DECLARE_LOGITS_KERNEL(sve_qasymm8_signed_logits); - -#undef DECLARE_LOGITS_KERNEL } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp index e55d7f903e..ae14381ad9 100644 --- a/src/cpu/operators/CpuSoftmax.cpp +++ b/src/cpu/operators/CpuSoftmax.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -41,13 +41,10 @@ namespace arm_compute { namespace cpu { -template -CpuSoftmaxGeneric::CpuSoftmaxGeneric() +CpuSoftmaxGeneric::CpuSoftmaxGeneric() : _permute_input(), _permute_output(), - _max_kernel(), _softmax_kernel(), - _max(), _tmp(), _input_permuted(), _output_permuted(), @@ -56,8 +53,7 @@ CpuSoftmaxGeneric::CpuSoftmaxGeneric() { } -template -void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis) +void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -79,29 +75,23 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d // or it is the original input case (2D case) const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src); - // Create intermediate tensors shapes - TensorShape max_sum_shape = tmp_input->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = - is_data_type_quantized_asymmetric(tmp_input->data_type()) ? 
DataType::F32 : tmp_input->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); + TensorInfo tensor_info_tmp; + if (is_data_type_quantized_asymmetric(src->data_type())) + { + // Create intermediate tensors shapes + const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); + tensor_info_tmp = input_info.clone()->set_data_type(DataType::F32); + } // Init intermediate tensors - _max = TensorInfo(max_info); _tmp = TensorInfo(tensor_info_tmp); // Configure kernels - auto mk = std::make_unique(); - mk->configure(tmp_input, &_max); - _max_kernel = std::move(mk); - - auto sm = std::make_unique>(); + auto sm = std::make_unique(); if (_needs_permute) { // The normalization kernel stores the result in a permuted output tensor - sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); + sm->configure(tmp_input, &_output_permuted, beta, is_log, &_tmp); // Re-permute the permuted output into the requested (4D) output _permute_output.configure(&_output_permuted, dst, @@ -110,14 +100,15 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d else { // Softmax 2D case - sm->configure(tmp_input, &_max, dst, beta, &_tmp); + sm->configure(tmp_input, dst, beta, is_log, &_tmp); } _softmax_kernel = std::move(sm); - _aux_mem[InternalTensorIdx::MAX] = - MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); - _aux_mem[InternalTensorIdx::TMP] = - MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + if (_tmp.total_size() > 0) + { + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + } _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); @@ -125,8 +116,8 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d MemoryLifetime::Temporary, _output_permuted.total_size()); } -template -Status CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis) +Status +CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -136,17 +127,12 @@ Status CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensor static_cast(src->num_dimensions()) <= axis); // Create intermediate tensor info - DataType tmp_data_type = src->data_type(); - const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = src->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(src->clone() - ->set_tensor_shape(max_sum_shape) - .set_data_type(tmp_data_type) - .set_quantization_info(src->quantization_info()) - .set_is_resizable(true)); - const TensorInfo dont_care; + TensorInfo tensor_info_tmp; + + if (is_data_type_quantized_asymmetric(src->data_type())) + { + tensor_info_tmp = src->clone()->set_data_type(DataType::F32).set_is_resizable(true); + } const unsigned int actual_axis = static_cast(wrap_around(axis, static_cast(src->num_dimensions()))); @@ -165,15 +151,12 @@ Status CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, 
dst, permutation_vector)); } - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel::validate( - &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuSoftmaxKernel::validate(src, dst, beta, is_log, &tensor_info_tmp)); return Status{}; } -template -void CpuSoftmaxGeneric::run(ITensorPack &tensors) +void CpuSoftmaxGeneric::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); @@ -181,13 +164,11 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) auto dst = tensors.get_tensor(TensorType::ACL_DST); CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true); - CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true); CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true); CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true); - ITensorPack max_pack; ITensorPack softmax_pack; if (_needs_permute) @@ -195,24 +176,15 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}}; _permute_input.run(permute_in_pack); - max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}}; - softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()}, - {TensorType::ACL_SRC_1, max.get()}, {TensorType::ACL_DST_0, output_permuted.get()}, {TensorType::ACL_DST_1, tmp.get()}}; } else { - max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}}; - - softmax_pack = {{TensorType::ACL_SRC_0, src}, - {TensorType::ACL_SRC_1, max.get()}, - {TensorType::ACL_DST_0, dst}, - {TensorType::ACL_DST_1, tmp.get()}}; + softmax_pack = {{TensorType::ACL_SRC_0, src}, {TensorType::ACL_DST_0, dst}, {TensorType::ACL_DST_1, tmp.get()}}; } - NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); if (_needs_permute) @@ -224,13 +196,10 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) } } -template -experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const +experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const { return _aux_mem; } -template class CpuSoftmaxGeneric; -template class CpuSoftmaxGeneric; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h index 8cab70e14f..47020e9b7c 100644 --- a/src/cpu/operators/CpuSoftmax.h +++ b/src/cpu/operators/CpuSoftmax.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_H -#define ARM_COMPUTE_CPU_SOFTMAX_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H +#define ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/TensorInfo.h" @@ -37,9 +37,7 @@ namespace arm_compute { namespace cpu { -class CpuLogits1DMaxKernel; -template -class CpuLogits1DSoftmaxKernel; +class CpuSoftmaxKernel; /** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. 
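 * (Illustrative usage with assumed tensor-info names: with this patch the log variant is
 *  selected via the new is_log argument instead of a separate CpuLogSoftmax alias, e.g.
 *
 *      CpuSoftmaxGeneric softmax;
 *      softmax.configure(&src_info, &dst_info, 1.0f, 0, true); // beta, axis, is_log
 *
 *  The kernel-level max computation now lives inside CpuSoftmaxKernel, so the MAX auxiliary
 *  tensor is removed and TMP is only allocated for quantized inputs.)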
* @@ -52,31 +50,31 @@ class CpuLogits1DSoftmaxKernel; * This function runs the following function/kernels: * -# If axis is not 0: * -# @ref CpuPermute - * -# @ref kernels::CpuLogits1DMaxKernel - * -# @ref kernels::CpuLogits1DSoftmaxKernel + * -# @ref kernels::CpuSoftmaxKernel */ -template class CpuSoftmaxGeneric : public ICpuOperator { public: CpuSoftmaxGeneric(); /** Set the input and output tensors. * - * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * last value of each row to the nearest multiple. - * @param[out] dst Destination tensor ifo. Data types supported: same as @p input. - * @param[in] beta (Optional) A scaling factor for the exponent. - * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and + * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * last value of each row to the nearest multiple. + * @param[out] dst Destination tensor ifo. Data types supported: same as @p input. + * @param[in] beta (Optional) A scaling factor for the exponent. + * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 + * @param[in] is_log True if the operation is log-softmax */ - void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuSoftmaxGeneric::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -85,8 +83,7 @@ class CpuSoftmaxGeneric : public ICpuOperator private: enum InternalTensorIdx { - MAX = 0, - TMP, + TMP = 0, PERMUTED_SRC, PERMUTED_DST, COUNT @@ -94,10 +91,8 @@ class CpuSoftmaxGeneric : public ICpuOperator CpuPermute _permute_input; CpuPermute _permute_output; - std::unique_ptr _max_kernel; std::unique_ptr _softmax_kernel; - TensorInfo _max; TensorInfo _tmp; TensorInfo _input_permuted; TensorInfo _output_permuted; @@ -105,9 +100,7 @@ class CpuSoftmaxGeneric : public ICpuOperator bool _needs_permute; experimental::MemoryRequirements _aux_mem{}; }; -using CpuSoftmax = CpuSoftmaxGeneric; -using CpuLogSoftmax = CpuSoftmaxGeneric; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp index e4bcdc0b64..7d81aee0e9 100644 --- a/src/cpu/operators/CpuWinogradConv2d.cpp +++ b/src/cpu/operators/CpuWinogradConv2d.cpp @@ -122,13 +122,13 @@ bool get_winograd_kernel_implementation(const ITensorInfo success = arm_conv::winograd::get_implementation(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); } -#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) else if (data_type == DataType::F16) { success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), 
*conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) else { success = false; diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 82bd465c99..611bc76463 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -541,6 +541,7 @@ void Fallback::prepare(ITensorPack &tensors) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); + ARM_COMPUTE_ERROR_ON_NULLPTR(b); // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. if (c && c->info()->data_type() == DataType::S32) @@ -614,6 +615,7 @@ void Fallback::run(ITensorPack &tensors) auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto d = tensors.get_tensor(TensorType::ACL_DST); + ARM_COMPUTE_ERROR_ON_NULLPTR(a, d); int lda = a->info()->strides_in_bytes().y() / a->info()->element_size(); int ldb = 0; @@ -652,7 +654,7 @@ void Fallback::run(ITensorPack &tensors) } // Check if B is pre-tranposed and de-reference if not - if (!_gemm_kernel_asm->B_is_pretransposed()) + if (b_to_use && !_gemm_kernel_asm->B_is_pretransposed()) { ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size(); multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size(); @@ -670,7 +672,7 @@ void Fallback::run(ITensorPack &tensors) } // Pretranspose B if required - if (_B_pretranspose_required) + if (b_to_use && _B_pretranspose_required) { // Fixed format kernels need no pretranspose. ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index 9a2a4890f3..4ca4b83f9c 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -41,7 +41,6 @@ #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" - namespace arm_compute { namespace opencl @@ -101,7 +100,7 @@ Status validate_arguments(const ITensorInfo *src0, ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); // Validate the reinterpreted-as-3D-case - if (gemm_info.depth_output_gemm3d != 0) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -284,8 +283,12 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(gemm_info.depth_output_gemm3d != 0, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(src1->num_dimensions() > 2, "-DBATCHED_RHS"); + std::string kernel_name("gemm_mm_reshaped_only_rhs_nt_mmul"); - kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; + kernel_name += _export_to_cl_image ? 
"_texture" : ""; // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h index 955bb3c01a..22aa1e2034 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H -#define ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H +#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H +#define ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" #include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h" @@ -58,6 +58,7 @@ class ClGemmNativeKernelConfigurationFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -68,4 +69,4 @@ class ClGemmNativeKernelConfigurationFactory final } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /*ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h index 83928b3f4f..6327ee3027 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H -#define ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H +#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H +#define ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" #include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h" @@ -56,6 +56,7 @@ class ClGemmReshapedKernelConfigurationFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -66,4 +67,4 @@ class ClGemmReshapedKernelConfigurationFactory final } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h index e07ad993ed..1f0c5c2d87 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H -#define ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H +#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H +#define ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" #include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h" @@ -56,6 +56,7 @@ class ClGemmReshapedOnlyRhsKernelConfigurationFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -66,4 +67,4 @@ class ClGemmReshapedOnlyRhsKernelConfigurationFactory final } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index 9962ee550a..28a2aa2540 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -34,6 +34,7 @@ #include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h" #include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h" +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h" #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" using namespace arm_compute::cl_matmul; @@ -42,59 +43,6 @@ namespace arm_compute { namespace opencl { -namespace -{ -enum class MatMulKernelType -{ - /** Native matrix multiplication for FP types */ - NATIVE_FP, - - /** Native matrix multiplication for quantized types */ - NATIVE_QUANTIZED, - - /** 
Native matrix multiplication using MMUL extension for FP types */ - NATIVE_MMUL_FP, - - /** Native matrix multiplication using MMUL extension for Quantized types */ - NATIVE_MMUL_QUANTIZED -}; - -MatMulKernelType get_matmul_kernel(const ITensorInfo *lhs, - const ITensorInfo *rhs, - const MatMulInfo &matmul_info, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(lhs, rhs, matmul_info, act_info); - - const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); - const bool is_mmul_supported = arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()); - - const int k = matmul_info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); - - if (is_quantized) - { - // MMUL kernel works only when K is a multiple of 16 - if (is_mmul_supported && !act_info.enabled() && k % 16 == 0) - { - return MatMulKernelType::NATIVE_MMUL_QUANTIZED; - } - - return MatMulKernelType::NATIVE_QUANTIZED; - } - else - { - // MMUL kernel works only when K is a multiple of 4 - if (is_mmul_supported && !act_info.enabled() && k % 4 == 0) - { - return MatMulKernelType::NATIVE_MMUL_FP; - } - - return MatMulKernelType::NATIVE_FP; - } - - return is_quantized ? MatMulKernelType::NATIVE_QUANTIZED : MatMulKernelType::NATIVE_FP; -} -} // namespace using namespace arm_compute::opencl::kernels; ClMatMul::ClMatMul() @@ -119,7 +67,10 @@ Status ClMatMul::validate(const ITensorInfo *lhs, const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - switch (get_matmul_kernel(lhs, rhs, matmul_info, act_info)) + const auto kernel_selector = ClMatMulNativeKernelVariantFactory::create(gpu_target); + const MatMulKernelType kernel_type = kernel_selector->select_kernel(lhs, rhs, matmul_info, act_info); + + switch (kernel_type) { case MatMulKernelType::NATIVE_FP: return ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); @@ -151,7 +102,10 @@ void ClMatMul::configure(const CLCompileContext &compile_context, const auto kernel_config = ClMatMulNativeKernelConfigurationFactory::create(gpu_target); const MatMulKernelInfo kernel_info = kernel_config->configure(lhs, rhs, matmul_info); - switch (get_matmul_kernel(lhs, rhs, matmul_info, act_info)) + const auto kernel_selector = ClMatMulNativeKernelVariantFactory::create(gpu_target); + const MatMulKernelType kernel_type = kernel_selector->select_kernel(lhs, rhs, matmul_info, act_info); + + switch (kernel_type) { case MatMulKernelType::NATIVE_FP: { diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp index 835958b816..c9ddf9b85c 100644 --- a/src/runtime/CL/CLMemoryRegion.cpp +++ b/src/runtime/CL/CLMemoryRegion.cpp @@ -26,6 +26,8 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" + namespace arm_compute { ICLMemoryRegion::ICLMemoryRegion(size_t size) @@ -72,7 +74,14 @@ CLBufferMemoryRegion::~CLBufferMemoryRegion() // Flush the command queue to ensure all commands that may use this memory buffer are scheduled to be finished before // this buffer is freed // Do not call finish as it is a blocking call which affects the performance - CLScheduler::get().queue().flush(); + try + { + CLScheduler::get().queue().flush(); + } + catch (const std::exception &e) + { + ARM_COMPUTE_LOG_ERROR_ACL(e.what()); + } } void *CLBufferMemoryRegion::ptr() diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index 6c6daff5ba..bef8d887fd 100644 --- 
a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, 2023 Arm Limited. + * Copyright (c) 2018-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -66,7 +66,14 @@ validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, boo TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); +#pragma GCC diagnostic pop + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); @@ -202,7 +209,13 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); +#pragma GCC diagnostic pop for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h index c528dbcac4..98dd44b1bf 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CLGEMMKERNELSELECTION_H -#define SRC_CLGEMMKERNELSELECTION_H +#ifndef ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H +#define ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" @@ -53,6 +53,7 @@ class CLGEMMKernelSelectionFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -61,4 +62,4 @@ class CLGEMMKernelSelectionFactory final }; } // namespace cl_gemm } // namespace arm_compute -#endif /* SRC_CLGEMMKERNELSELECTION_H */ +#endif // ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index 47564059ec..5eea4dca65 100644 --- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. 
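The CLMemoryRegion.cpp hunk above wraps the command-queue flush in a try/catch because ~CLBufferMemoryRegion must not let an OpenCL exception escape a destructor. A minimal standalone sketch of that pattern, assuming a hypothetical QueueGuard whose flush() stands in for CLScheduler::get().queue().flush():

    #include <iostream>
    #include <stdexcept>

    struct QueueGuard
    {
        ~QueueGuard()
        {
            // Mirror the CLBufferMemoryRegion change: log and swallow any failure
            // from the flush call instead of letting it propagate out of a destructor.
            try
            {
                flush();
            }
            catch (const std::exception &e)
            {
                std::cerr << "flush failed: " << e.what() << '\n';
            }
        }

        // Stand-in for a command-queue flush that may throw an OpenCL error.
        void flush()
        {
            throw std::runtime_error("CL_OUT_OF_RESOURCES");
        }
    };

    int main()
    {
        QueueGuard guard; // destructor runs at scope exit without terminating the program
    }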
* * SPDX-License-Identifier: MIT * @@ -25,15 +25,20 @@ #include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" namespace arm_compute { +NEDepthToSpaceLayer::NEDepthToSpaceLayer() : _kernel{} +{ +} + +NEDepthToSpaceLayer::~NEDepthToSpaceLayer() = default; + void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); @@ -47,4 +52,10 @@ Status NEDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo { return NEDepthToSpaceLayerKernel::validate(input, output, block_shape); } + +void NEDepthToSpaceLayer::run() +{ + NEScheduler::get().schedule(_kernel.get(), _kernel->get_split_dimension()); +} + } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp index d37cf4a8d0..a23db87059 100644 --- a/src/runtime/NEON/functions/NEReduceMean.cpp +++ b/src/runtime/NEON/functions/NEReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023 Arm Limited. + * Copyright (c) 2018-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,7 +63,14 @@ validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, boo TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); +#pragma GCC diagnostic pop + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); @@ -168,7 +175,14 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); +#pragma GCC diagnostic pop + for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index e3c2012d05..be588c5b52 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
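The CLReduceMean.cpp and NEReduceMean.cpp hunks above wrap std::sort in the same GCC diagnostic guard. A self-contained sketch of the pattern, with a plain std::array standing in for the library's Coordinates container:

    #include <algorithm>
    #include <array>

    int main()
    {
        std::array<int, 4> axis_local{3, 1, 2, 0};
        const int reduction_ops = 3; // only the first three entries are real reduction axes

    // GCC can emit a spurious -Warray-bounds for a sort over a sub-range of a fixed-size
    // container (GCC bug 104165), so the warning is silenced around the call only.
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Warray-bounds"
        std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
    #pragma GCC diagnostic pop

        return 0;
    }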
* * SPDX-License-Identifier: MIT * @@ -29,7 +29,6 @@ #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" -#include "src/cpu/kernels/CpuSoftmaxKernel.h" #include "src/cpu/operators/CpuSoftmax.h" namespace arm_compute @@ -37,13 +36,12 @@ namespace arm_compute template struct NESoftmaxLayerGeneric::Impl { - const ITensor *src{nullptr}; - ITensor *dst{nullptr}; - Tensor max{nullptr}; - std::unique_ptr> op{nullptr}; - MemoryGroup memory_group{}; - ITensorPack run_pack{}; - WorkspaceData workspace_tensors{}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData workspace_tensors{}; }; template @@ -67,8 +65,8 @@ void NESoftmaxLayerGeneric::configure(ITensor *input, ITensor *output, f _impl->src = input; _impl->dst = output; - _impl->op = std::make_unique>(); - _impl->op->configure(input->info(), output->info(), beta, axis); + _impl->op = std::make_unique(); + _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG); _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); @@ -79,7 +77,7 @@ Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG)); return Status{}; } diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 8d77abcfc7..7334be8456 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -56,7 +56,7 @@ struct NEWinogradConvolutionLayer::Impl NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr &memory_manager) : _impl(std::make_unique()) { - _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->memory_group = MemoryGroup(memory_manager); } NEWinogradConvolutionLayer::~NEWinogradConvolutionLayer() = default; diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp index e52fb59940..3f1e96968a 100644 --- a/src/runtime/Scheduler.cpp +++ b/src/runtime/Scheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h index 2c2509f70b..215b17ef79 100644 --- a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h +++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
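Combined with the CpuSoftmax.h changes earlier in this patch, the NESoftmaxLayer.cpp hunk above shows that log-softmax is now selected by a runtime is_log flag instead of a template parameter. A hedged sketch of configuring the internal operator directly, based only on the signatures shown in those hunks (shapes and names are illustrative; applications would normally go through NESoftmaxLayer or NELogSoftmaxLayer):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/cpu/operators/CpuSoftmax.h"

    using namespace arm_compute;

    void configure_log_softmax_example()
    {
        // Shapes follow the Doxygen example in the CpuSoftmax.h hunk: for a 4x5x6 input
        // and axis=1, softmax is applied to 4x6 = 24 vectors of length 5.
        TensorInfo src(TensorShape(4U, 5U, 6U), 1, DataType::F32);
        TensorInfo dst(TensorShape(4U, 5U, 6U), 1, DataType::F32);

        cpu::CpuSoftmaxGeneric softmax;
        const Status st = cpu::CpuSoftmaxGeneric::validate(&src, &dst, /*beta=*/1.0f, /*axis=*/1, /*is_log=*/true);
        if (st.error_code() == ErrorCode::OK)
        {
            // Log-softmax is a runtime flag, not a separate template instantiation.
            softmax.configure(&src, &dst, 1.0f, 1, /*is_log=*/true);
        }
    }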
*/ -#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG -#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG +#ifndef ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H #include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h" #include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h" @@ -53,6 +53,7 @@ class ClDirectConvKernelConfigurationFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -61,4 +62,4 @@ class ClDirectConvKernelConfigurationFactory final }; } // namespace cl_direct_conv } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h index 49ce6ff479..031cf1859a 100644 --- a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG -#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG +#ifndef ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" @@ -54,6 +54,7 @@ class ClDWCNativeKernelConfigurationFactory final case GPUTarget::BIFROST: return std::make_unique(gpu); case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -62,4 +63,4 @@ class ClDWCNativeKernelConfigurationFactory final }; } // namespace cl_dwc } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h index dd614e1f68..5e7ba6f8e9 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG -#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG +#ifndef ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H #include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h" #include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" @@ -50,6 +50,7 @@ class ClIndirectConvKernelConfigurationFactory final case GPUTarget::MIDGARD: case GPUTarget::BIFROST: case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -58,4 +59,4 @@ class ClIndirectConvKernelConfigurationFactory final }; } // namespace cl_indirect_conv } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp index 4b923547c4..3a02a60650 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp @@ -62,6 +62,7 @@ ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITen switch (_target) { case GPUTarget::G715: + case GPUTarget::G615: func = configs_G715.get_function(lhs->data_type()); break; case GPUTarget::G710: diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp new file mode 100644 index 0000000000..3878f698fd --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +ClMatMulNativeDefaultVariantValhall::ClMatMulNativeDefaultVariantValhall(GPUTarget gpu) + : IClMatMulNativeKernelVariant(gpu) +{ +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(rhs); + + using VariantFunctionExecutorPtr = + MatMulKernelType (ClMatMulNativeDefaultVariantValhall::*)(int k, bool act_enabled); + + ClMatMulNativeVariantArray configs_G715( + &ClMatMulNativeDefaultVariantValhall::configure_G715_float, + &ClMatMulNativeDefaultVariantValhall::configure_G715_quantized); + + ClMatMulNativeVariantArray configs_default( + &ClMatMulNativeDefaultVariantValhall::configure_default_float, + &ClMatMulNativeDefaultVariantValhall::configure_default_quantized); + + VariantFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G715: + case GPUTarget::G615: + func = configs_G715.get_function(lhs->data_type()); + break; + default: + func = configs_default.get_function(lhs->data_type()); + break; + } + + const int k = info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); + const bool act_enabled = act_info.enabled(); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native"); + return (this->*func)(k, act_enabled); +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_float(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 4 + if (!act_enabled && k % 4 == 0) + { + return MatMulKernelType::NATIVE_MMUL_FP; + } + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_quantized(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 16 + if (!act_enabled && k % 16 == 0) + { + return MatMulKernelType::NATIVE_MMUL_QUANTIZED; + } + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_float(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_quantized(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h new file mode 100644 index 0000000000..a202676e98 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H + +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +/** Valhall based OpenCL matmul configuration */ +class ClMatMulNativeDefaultVariantValhall final : public IClMatMulNativeKernelVariant +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClMatMulNativeDefaultVariantValhall(GPUTarget gpu); + + // Inherited overridden method + MatMulKernelType select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) override; + +private: + MatMulKernelType configure_G715_float(int k, bool act_enabled); + MatMulKernelType configure_G715_quantized(int k, bool act_enabled); + MatMulKernelType configure_default_float(int k, bool act_enabled); + MatMulKernelType configure_default_quantized(int k, bool act_enabled); +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h index a114fffa68..699f5fe8c1 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS -#define SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H #include "arm_compute/core/Types.h" @@ -80,4 +80,4 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, unsigned int b); } // namespace cl_matmul } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h index b10018a6d2..e7485bca81 100644 --- a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG -#define SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H #include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" @@ -50,6 +50,7 @@ class ClMatMulNativeKernelConfigurationFactory final case GPUTarget::MIDGARD: case GPUTarget::BIFROST: case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: return std::make_unique(gpu); default: ARM_COMPUTE_ERROR("Not supported GPU target"); @@ -58,4 +59,4 @@ class ClMatMulNativeKernelConfigurationFactory final }; } // namespace cl_matmul } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h new file mode 100644 index 0000000000..c2895b8919 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H + +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h" +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h" + +#include + +namespace arm_compute +{ +namespace cl_matmul +{ + +/** ClMatMul variant factory class */ +class ClMatMulNativeKernelVariantFactory final +{ +public: + /** Static method to call the ClMatMul configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClMatMulNativeKernelVariant + */ + static std::unique_ptr create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + case GPUTarget::BIFROST: + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h index b9b091100c..00ba3641d5 100644 --- a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h +++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG -#define SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/KernelDescriptors.h" @@ -85,7 +85,9 @@ class ClMatMulNativeConfigArray std::array _configs; }; -/** Basic interface for the matmul native kernel configuration */ +/** Basic interface for the matmul native kernel configuration + * This is the base class that chooses architecture specific kernel configurations. +*/ class IClMatMulNativeKernelConfig { public: @@ -112,4 +114,4 @@ class IClMatMulNativeKernelConfig }; } // namespace cl_matmul } // namespace arm_compute -#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG */ +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h new file mode 100644 index 0000000000..eac41dd6a3 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
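The variant factory and the Valhall heuristic added above replace the file-local get_matmul_kernel() removed from ClMatMul.cpp earlier in this patch. A hedged sketch of driving the selector directly; pick_matmul_kernel is a hypothetical helper, not library code, and the types it uses come transitively from the new ClMatMulNativeKernelVariant.h header shown above:

    #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h"

    using namespace arm_compute;
    using namespace arm_compute::cl_matmul;

    MatMulKernelType pick_matmul_kernel(const ITensorInfo &lhs, const ITensorInfo &rhs, GPUTarget target)
    {
        const MatMulInfo          matmul_info{}; // no LHS/RHS transpose
        const ActivationLayerInfo act_info{};    // no fused activation, so the MMUL variants stay eligible

        // The factory falls back to the Valhall heuristic for the supported architectures.
        const auto selector = ClMatMulNativeKernelVariantFactory::create(target);
        return selector->select_kernel(&lhs, &rhs, matmul_info, act_info);
    }

    // With the G715/G615 heuristic above, an F16 LHS whose K dimension is 64 (a multiple of 4)
    // and no activation maps to MatMulKernelType::NATIVE_MMUL_FP; K = 30 falls back to NATIVE_FP.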
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H + +#include "arm_compute/core/CoreTypes.h" // DataType +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/function_info/MatMulInfo.h" + +#include "src/core/common/Macros.h" + +#include + +namespace arm_compute +{ +namespace cl_matmul +{ +enum class MatMulKernelType +{ + /** Native matrix multiplication for FP types */ + NATIVE_FP, + + /** Native matrix multiplication for quantized types */ + NATIVE_QUANTIZED, + + /** Native matrix multiplication using MMUL extension for FP types */ + NATIVE_MMUL_FP, + + /** Native matrix multiplication using MMUL extension for Quantized types */ + NATIVE_MMUL_QUANTIZED +}; + +/** Basic container for the OpenCL MatMul Native variant functions */ +template +class ClMatMulNativeVariantArray +{ +public: + /** Alias for Float index */ + static constexpr size_t DT_FLOAT = 0; + /** Alias for Quantized type index */ + static constexpr size_t DT_QUANTIZED = 1; + + /** Constructor + * + * @param[in] func_float Function to call for matmul native float (F32, F16) + * @param[in] func_quantized Function to call for matmul native quantized (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClMatMulNativeVariantArray(T func_float, T func_quantized) : _configs{func_float, func_quantized} + { + } + + /** Method to return the matmul native variant function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + case DataType::F16: + return _configs.at(DT_FLOAT); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_QUANTIZED); + default: + return nullptr; + } + } + +private: + std::array _configs; +}; + +/** Basic interface for the matmul native kernel variant + * This is the base class that chooses architecture specific kernel variants. 
+*/ +class IClMatMulNativeKernelVariant +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClMatMulNativeKernelVariant(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelVariant); + /** Virtual destructor */ + virtual ~IClMatMulNativeKernelVariant() = default; + /** This method returns the @ref MatMulKernelType for the given inputs + * + * @param[in] lhs LHS tensor + * @param[in] rhs RHS tensor + * @param[in] info MatMul info + * @param[in] act_info Activation layer info + */ + virtual MatMulKernelType select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h index 4d394889c3..accbb643c2 100644 --- a/support/ToolchainSupport.h +++ b/support/ToolchainSupport.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_SUPPORT_TOOLCHAINSUPPORT -#define ARM_COMPUTE_SUPPORT_TOOLCHAINSUPPORT +#ifndef ACL_SUPPORT_TOOLCHAINSUPPORT_H +#define ACL_SUPPORT_TOOLCHAINSUPPORT_H #include "support/Bfloat16.h" #include "support/Half.h" @@ -184,8 +184,7 @@ inline T nearbyint(T value) template ::value>::type> inline T round(T value) { - //Workaround Valgrind's mismatches: when running from Valgrind the call to std::round(-4.500000) == -4.000000 instead of 5.00000 - return (value < 0.f) ? static_cast(value - 0.5f) : static_cast(value + 0.5f); + return std::round(value); } /** Round floating-point value with half value rounding away from zero and cast to long @@ -331,4 +330,4 @@ inline bool signbit(bfloat16 value) } // namespace cpp11 } // namespace support } // namespace arm_compute -#endif /* ARM_COMPUTE_SUPPORT_TOOLCHAINSUPPORT */ +#endif // ACL_SUPPORT_TOOLCHAINSUPPORT_H diff --git a/tests/SConscript b/tests/SConscript index 21904899c0..305f1693d1 100644 --- a/tests/SConscript +++ b/tests/SConscript @@ -83,17 +83,12 @@ if 'macos' in test_env['os']: if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']: Import("arm_compute_a") - Import("arm_compute_core_a") Import("arm_compute_graph_a") - if env['os']=='windows': - test_env.Append(LIBS = [arm_compute_graph_a, arm_compute_a]) - else: - test_env.Append(LIBS = [arm_compute_graph_a, arm_compute_a, arm_compute_core_a]) + test_env.Append(LIBS = [arm_compute_graph_a, arm_compute_a]) arm_compute_lib = arm_compute_graph_a else: Import("arm_compute_graph_so") - Import("arm_compute_core_a") - test_env.Append(LIBS = ["arm_compute_graph", "arm_compute", "arm_compute_core"]) + test_env.Append(LIBS = ["arm_compute_graph", "arm_compute"]) arm_compute_lib = arm_compute_graph_so if env['os'] in ['bare_metal']: @@ -189,9 +184,8 @@ if env['fixed_format_kernels'] and test_env['validation_tests']: test_env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS']) if test_env['validation_tests']: - arm_compute_validation_framework = env.StaticLibrary('arm_compute_validation_framework', Glob('validation/reference/*.cpp') + Glob('validation/*.cpp'), LINKFLAGS=test_env['LINKFLAGS'], CXXFLAGS=test_env['CXXFLAGS'], LIBS= [ arm_compute_test_framework, arm_compute_core_a]) + arm_compute_validation_framework = env.StaticLibrary('arm_compute_validation_framework', Glob('validation/reference/*.cpp') + 
Glob('validation/*.cpp'), LINKFLAGS=test_env['LINKFLAGS'], CXXFLAGS=test_env['CXXFLAGS'], LIBS= [ arm_compute_test_framework ]) Depends(arm_compute_validation_framework , arm_compute_test_framework) - Depends(arm_compute_validation_framework , arm_compute_core_a) program_objects = files_validation + common_objects if test_env['os'] == 'bare_metal': diff --git a/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h b/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h index 8c90efcbdd..b0ad4879ba 100644 --- a/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h +++ b/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_GEMMLOWPOUTPUT_DATASET -#define ARM_COMPUTE_TEST_GEMMLOWPOUTPUT_DATASET +#ifndef ACL_TESTS_DATASETS_GEMMLOWPFUSEDOFFSETOUTPUTDATASET_H +#define ACL_TESTS_DATASETS_GEMMLOWPFUSEDOFFSETOUTPUTDATASET_H #include "utils/TypePrinter.h" @@ -40,21 +40,17 @@ namespace datasets class GEMMLowpFusedOffsetOutputDataset { public: - using type = std::tuple; + using type = std::tuple; struct iterator { iterator(std::vector::const_iterator a_it, std::vector::const_iterator b_it, std::vector::const_iterator c_it, - std::vector::const_iterator a_offset_it, - std::vector::const_iterator b_offset_it, - std::vector::const_iterator output_stage_it) + std::vector::const_iterator output_stage_it) : _a_it{ std::move(a_it) }, _b_it{ std::move(b_it) }, _c_it{ std::move(c_it) }, - _a_offset_it{ std::move(a_offset_it) }, - _b_offset_it{ std::move(b_offset_it) }, _output_stage_it{ std::move(output_stage_it) } { } @@ -65,33 +61,14 @@ class GEMMLowpFusedOffsetOutputDataset description << "A=" << *_a_it << ":"; description << "B=" << *_b_it << ":"; description << "C=" << *_c_it << ":"; - description << "a_offset=" << *_a_offset_it << ":"; - description << "b_offset=" << *_b_offset_it << ":"; - description << "output_type=" << string_from_gemmlowp_output_stage((*_output_stage_it).type) << ":"; - description << "output_offset=" << (*_output_stage_it).gemmlowp_offset << ":"; - description << "output_multiplier={"; - for(auto it = (*_output_stage_it).gemmlowp_multipliers.begin(); it != (*_output_stage_it).gemmlowp_multipliers.end(); ++it) - { - description << (*it) << ", "; - } - description << "}:"; - description << "output_shift={"; - - for(auto it = (*_output_stage_it).gemmlowp_shifts.begin(); it != (*_output_stage_it).gemmlowp_shifts.end(); ++it) - { - description << (*it) << ", "; - } - description << "}:"; - description << "output_min=" << (*_output_stage_it).gemmlowp_min_bound << ":"; - description << "output_max=" << (*_output_stage_it).gemmlowp_max_bound << ":"; - description << "is_quantized_per_channel=" << (*_output_stage_it).is_quantized_per_channel << ":"; + description << "output_type=" << string_from_gemmlowp_output_stage(*_output_stage_it) << ":"; return description.str(); } GEMMLowpFusedOffsetOutputDataset::type operator*() const { - return std::make_tuple(*_a_it, *_b_it, *_c_it, *_a_offset_it, *_b_offset_it, *_output_stage_it); + return std::make_tuple(*_a_it, *_b_it, *_c_it, *_output_stage_it); } iterator &operator++() @@ -99,8 +76,6 @@ class GEMMLowpFusedOffsetOutputDataset ++_a_it; ++_b_it; ++_c_it; - ++_a_offset_it; - ++_b_offset_it; ++_output_stage_it; return *this; @@ -110,45 +85,27 @@ class GEMMLowpFusedOffsetOutputDataset 
std::vector::const_iterator _a_it; std::vector::const_iterator _b_it; std::vector::const_iterator _c_it; - std::vector::const_iterator _a_offset_it; - std::vector::const_iterator _b_offset_it; - std::vector::const_iterator _output_stage_it; + std::vector::const_iterator _output_stage_it; }; iterator begin() const { - return iterator(_a_shapes.begin(), _b_shapes.begin(), _c_shapes.begin(), _a_offset.begin(), _b_offset.begin(), _output_stage.begin()); + return iterator(_a_shapes.begin(), _b_shapes.begin(), _c_shapes.begin(), _output_stage.begin()); } int size() const { - return std::min(_a_shapes.size(), std::min(_b_shapes.size(), std::min(_c_shapes.size(), std::min(_a_offset.size(), std::min(_b_offset.size(), _output_stage.size()))))); + return std::min(_a_shapes.size(), std::min(_b_shapes.size(), std::min(_c_shapes.size(), _output_stage.size()))); } - void add_config(TensorShape a, TensorShape b, TensorShape c, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) + void add_config(TensorShape a, TensorShape b, TensorShape c, GEMMLowpOutputStageType output_stage) { _a_shapes.emplace_back(std::move(a)); _b_shapes.emplace_back(std::move(b)); _c_shapes.emplace_back(std::move(c)); - _a_offset.emplace_back(std::move(a_offset)); - _b_offset.emplace_back(std::move(b_offset)); _output_stage.emplace_back(std::move(output_stage)); } - GEMMLowpOutputStageInfo OutputStageInfo(GEMMLowpOutputStageType type, int32_t offset, int32_t multiplier, int32_t shift, int32_t min, int32_t max) - { - GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(); - output_stage.type = type; - output_stage.gemmlowp_offset = offset; - output_stage.gemmlowp_multiplier = multiplier; - output_stage.gemmlowp_shift = shift; - output_stage.gemmlowp_min_bound = min; - output_stage.gemmlowp_max_bound = max; - output_stage.gemmlowp_multipliers.push_back(multiplier); - output_stage.gemmlowp_shifts.push_back(shift); - return output_stage; - } - protected: GEMMLowpFusedOffsetOutputDataset() = default; GEMMLowpFusedOffsetOutputDataset(GEMMLowpFusedOffsetOutputDataset &&) = default; @@ -157,9 +114,7 @@ class GEMMLowpFusedOffsetOutputDataset std::vector _a_shapes{}; std::vector _b_shapes{}; std::vector _c_shapes{}; - std::vector _a_offset{}; - std::vector _b_offset{}; - std::vector _output_stage{}; + std::vector _output_stage{}; }; class SmallGEMMLowpFusedOffsetOutputUint8Dataset final : public GEMMLowpFusedOffsetOutputDataset @@ -167,47 +122,28 @@ class SmallGEMMLowpFusedOffsetOutputUint8Dataset final : public GEMMLowpFusedOff public: SmallGEMMLowpFusedOffsetOutputUint8Dataset() { - add_config(TensorShape(21U, 13U), TensorShape(1U, 21U), TensorShape(1U, 13U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 2, 13, 10, 210)); - add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 18, 23, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 200, 2, 13, 10, 210)); - add_config(TensorShape(32U, 72U), TensorShape(16U, 32U), TensorShape(16U, 72U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - - add_config(TensorShape(21U, 1U), TensorShape(43U, 21U), TensorShape(43U, 1U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601600, 10, 10, 210)); - add_config(TensorShape(31U, 3U), 
TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 254601600, 10, 10, 210)); - add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 2, 254601602, 10, 10, 210)); - add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601602, 10, 10, 210)); + add_config(TensorShape(21U, 13U), TensorShape(1U, 21U), TensorShape(1U, 13U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(32U, 72U), TensorShape(16U, 32U), TensorShape(16U, 72U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(21U, 1U), TensorShape(43U, 21U), TensorShape(43U, 1U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U),GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); } }; -class SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned final : public GEMMLowpFusedOffsetOutputDataset +class SmallGEMMLowpFusedBatchedMatMulDataset final : public GEMMLowpFusedOffsetOutputDataset { public: - SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned() + SmallGEMMLowpFusedBatchedMatMulDataset() { - add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 5, 1 << 25, 5, 0, 254)); - add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 1 << 25, 3, 0, 254)); - add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U, 15U), -3, 15, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 1 << 19, 0, 20, 210)); - add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U, 17U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -30, 2, 1 << 25, 14, 210)); - add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U, 4U, 3U), -5, 12, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -20, 1 << 25, 4, 0, 127)); - add_config(TensorShape(15U, 7U, 3U), TensorShape(29U, 15U, 3U), TensorShape(29U, 7U, 3U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -10, 1 << 25, 6, 10, 210)); - add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U, 17U, 32U), -3, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 3, 10, 210)); - add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U, 256U, 32U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 6, 50, 225)); - } -}; - -class SmallGEMMLowpFusedBatchedMatMulDatasetSigned final : public GEMMLowpFusedOffsetOutputDataset -{ -public: - SmallGEMMLowpFusedBatchedMatMulDatasetSigned() - { - add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, 
OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 5, 1 << 25, 5, -128, 127)); - add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 1 << 25, 3, -128, 127)); - add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U, 15U), -3, 15, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 1 << 19, 0, -108, 127)); - add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U, 17U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -30, 2, 1 << 25, -98, 107)); - add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U, 4U, 3U), -5, 12, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -20, 1 << 25, 4, -127, 64)); - add_config(TensorShape(15U, 7U, 3U), TensorShape(29U, 15U, 3U), TensorShape(29U, 7U, 3U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -10, 1 << 25, 6, -64, 127)); - add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U, 17U, 32U), 3, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 6, -127, 110)); - add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U, 256U, 32U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 6, -77, 115)); + add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U, 15U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U, 17U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U, 4U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(15U, 7U, 3U), TensorShape(29U, 15U, 3U), TensorShape(29U, 7U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U, 17U, 32U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U, 256U, 32U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); } }; @@ -216,14 +152,12 @@ class SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset final : public GEMMLowp public: SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset() { - add_config(TensorShape(21U, 1421U, 33U), TensorShape(34U, 21U), TensorShape(34U, 7U, 203U, 33U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - add_config(TensorShape(31U, 102U, 55U), TensorShape(23U, 31U), TensorShape(23U, 1U, 102U, 55U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 2, 13, 10, 210)); - add_config(TensorShape(38U, 1200U, 77U), TensorShape(21U, 38U), TensorShape(21U, 4U, 300U, 77U), 18, 23, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 200, 2, 13, 10, 210)); - add_config(TensorShape(32U, 103U, 99U), TensorShape(17U, 32U), TensorShape(17U, 1U, 103U, 99U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - add_config(TensorShape(16U, 1600U, 111U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 111U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601600, 10, 10, - 
210)); - add_config(TensorShape(16U, 1600U, 113U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 113U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 254601600, 10, 10, - 210)); + add_config(TensorShape(21U, 1421U, 33U), TensorShape(34U, 21U), TensorShape(34U, 7U, 203U, 33U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(31U, 102U, 55U), TensorShape(23U, 31U), TensorShape(23U, 1U, 102U, 55U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(38U, 1200U, 77U), TensorShape(21U, 38U), TensorShape(21U, 4U, 300U, 77U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(32U, 103U, 99U), TensorShape(17U, 32U), TensorShape(17U, 1U, 103U, 99U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(16U, 1600U, 111U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 111U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(16U, 1600U, 113U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 113U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); } }; @@ -232,14 +166,12 @@ class SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset final : public GEM public: SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset() { - add_config(TensorShape(21U, 7U, 203U, 33U), TensorShape(34U, 21U), TensorShape(34U, 7U, 203U, 33U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - add_config(TensorShape(31U, 1U, 102U, 55U), TensorShape(23U, 31U), TensorShape(23U, 1U, 102U, 55U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 2, 13, 10, 210)); - add_config(TensorShape(38U, 4U, 300U, 77U), TensorShape(21U, 38U), TensorShape(21U, 4U, 300U, 77U), 18, 23, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 200, 2, 13, 10, 210)); - add_config(TensorShape(32U, 1U, 103U, 99U), TensorShape(17U, 32U), TensorShape(17U, 1U, 103U, 99U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 13, 10, 210)); - add_config(TensorShape(16U, 8U, 200U, 111U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 111U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601600, 10, 10, - 210)); - add_config(TensorShape(16U, 8U, 200U, 113U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 113U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 254601600, 10, 10, - 210)); + add_config(TensorShape(21U, 7U, 203U, 33U), TensorShape(34U, 21U), TensorShape(34U, 7U, 203U, 33U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(31U, 1U, 102U, 55U), TensorShape(23U, 31U), TensorShape(23U, 1U, 102U, 55U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(38U, 4U, 300U, 77U), TensorShape(21U, 38U), TensorShape(21U, 4U, 300U, 77U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(32U, 1U, 103U, 99U), TensorShape(17U, 32U), TensorShape(17U, 1U, 103U, 99U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(16U, 8U, 200U, 111U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 111U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(16U, 8U, 200U, 113U), TensorShape(8U, 16U), TensorShape(8U, 8U, 200U, 113U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); } }; @@ -248,28 +180,14 @@ class SmallGEMMLowpFusedOffsetOutputInt8Dataset final : public GEMMLowpFusedOffs public: 
SmallGEMMLowpFusedOffsetOutputInt8Dataset() { - add_config(TensorShape(21U, 1U), TensorShape(1U, 21U), TensorShape(1U, 1U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -50, 2, 13, -10, 110)); - add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, -10, 110)); - add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, -10, 110)); - add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -40, 2, 13, -10, 110)); - - add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601600, 10, -10, 110)); - add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 1, 254601600, 10, -10, 110)); - add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601602, 10, -10, 110)); - add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601602, 10, -10, 110)); - } -}; - -class SmallGEMMLowpFusedOffsetOutputPerChannelDataset final : public GEMMLowpFusedOffsetOutputDataset -{ -public: - SmallGEMMLowpFusedOffsetOutputPerChannelDataset() - { - add_config(TensorShape(21U, 1U, 6U), TensorShape(43U, 21U, 6U), TensorShape(43U, 1U, 6U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -200, 2, 13, 10, 210)); - add_config(TensorShape(21U, 13U, 3U), TensorShape(33U, 21U, 3U), TensorShape(33U, 13U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -100, 2, 13, 10, 210)); - add_config(TensorShape(31U, 3U, 2U), TensorShape(72U, 31U, 2U), TensorShape(72U, 3U, 2U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, 10, 210)); - add_config(TensorShape(52U, 13U, 7U), TensorShape(33U, 52U, 7U), TensorShape(33U, 13U, 7U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 100, 2, 13, 10, 210)); - add_config(TensorShape(52U, 26U, 8U), TensorShape(33U, 52U, 8U), TensorShape(33U, 26U, 8U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 13, 10, 210)); + add_config(TensorShape(21U, 1U), TensorShape(1U, 21U), TensorShape(1U, 1U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), 
GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
    }
};
@@ -278,15 +196,12 @@ class LargeGEMMLowpFusedOffsetOutputUint8Dataset final : public GEMMLowpFusedOff
 public:
    LargeGEMMLowpFusedOffsetOutputUint8Dataset()
    {
-        add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 18, 10, 210));
-        add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 2, 18, 10, 210));
-        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 200, 2, 18, 10, 210));
-        add_config(TensorShape(941U, 1011U), TensorShape(623U, 941U), TensorShape(623U, 1011U), -9, 1, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -100, 2, 18, 10, 210));
+        add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(941U, 1011U), TensorShape(623U, 941U), TensorShape(623U, 1011U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+        add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
-        add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601600, 15, 10, 210));
-        add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 1, 254601600, 15, 10, 210));
-        add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601602, 15, 10, 210));
-        add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -1, 254601602, 15, 10, 210));
    }
};
@@ -295,18 +210,17 @@ class LargeGEMMLowpFusedOffsetOutputInt8Dataset final : public GEMMLowpFusedOffs
 public:
    LargeGEMMLowpFusedOffsetOutputInt8Dataset()
    {
-        add_config(TensorShape(923U, 1U, 15U), TensorShape(871U, 923U, 15U), TensorShape(871U, 1U, 15U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -50, 2, 18, -10, 110));
-        add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), -1, 3, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 18, -10, 110));
-        add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, 0, 2, 18, -10, 110));
-        add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), -3, -2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN, -50, 2, 18, -10, 110));
-
-        add_config(TensorShape(923U, 1U), TensorShape(871U, 923U), TensorShape(871U, 1U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601600, 15, -10, 110));
-        add_config(TensorShape(873U, 7U), TensorShape(784U, 873U),
TensorShape(784U, 7U), -1, 3, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 254601600, 15, -10, 110)); - add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 2, 254601602, 15, -10, 110)); - add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -2, 254601602, 15, -10, 110)); + add_config(TensorShape(923U, 1U, 15U), TensorShape(871U, 923U, 15U), TensorShape(871U, 1U, 15U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(681U, 1023U), TensorShape(213U, 681U), TensorShape(213U, 1023U), GEMMLowpOutputStageType::QUANTIZE_DOWN); + add_config(TensorShape(923U, 1U), TensorShape(871U, 923U), TensorShape(871U, 1U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); } }; } // namespace datasets } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_GEMMLOWPOUTPUT_DATASET */ +#endif // ACL_TESTS_DATASETS_GEMMLOWPFUSEDOFFSETOUTPUTDATASET_H diff --git a/tests/validate_examples/RunExample.cpp b/tests/validate_examples/RunExample.cpp index 5066e9663d..36bf587551 100644 --- a/tests/validate_examples/RunExample.cpp +++ b/tests/validate_examples/RunExample.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ #include "ValidateExample.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/Scheduler.h" +#include "arm_compute/core/Version.h" #include "tests/AssetsLibrary.h" #include "tests/Globals.h" #include "tests/framework/Framework.h" diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp index 0b057b9dce..1ae9e96626 100644 --- a/tests/validation/CL/GEMMLowp.cpp +++ b/tests/validation/CL/GEMMLowp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -44,6 +44,9 @@ namespace test { namespace validation { + +using framework::dataset::make; + namespace { constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ @@ -72,9 +75,9 @@ using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned = TEST_SUITE(BatchedMatMul) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("bool", { false }))) + combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { false }))) { validate(CLAccessor(_target), _reference, tolerance_quant); } @@ -84,9 +87,9 @@ using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture; TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetSigned(), - framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })), - framework::dataset::make("bool", { false }))) + combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), + make("DataType", { DataType::QASYMM8_SIGNED }), + make("reshape_b_only_on_first_run", { false }))) { validate(CLAccessor(_target), _reference, tolerance_quant); } @@ -96,9 +99,10 @@ TEST_SUITE_END() // BatchedMatMul TEST_SUITE(FusedOffsetOutput) TEST_SUITE(QASYMM8) using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("reshape_b_only_on_first_run", { true, false }))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::ALL, + combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); @@ -108,9 +112,9 @@ TEST_SUITE(Output3D) using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputOutput3DUint8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture; FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputOutput3DUint8Fixture, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("reshape_b_only_on_first_run", { true, false }))) + combine(datasets::SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); @@ -121,18 +125,19 @@ TEST_SUITE(InputOutput3D) using 
CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInputOutput3DUint8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture; FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInputOutput3DUint8Fixture, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("reshape_b_only_on_first_run", { true, false }))) + combine(datasets::SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); } TEST_SUITE_END() // InputOutput3D -FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("reshape_b_only_on_first_run", { true, false }))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::NIGHTLY, + combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { true, false }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); @@ -141,8 +146,9 @@ TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputInt8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED }))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture, framework::DatasetMode::ALL, + combine(datasets::SmallGEMMLowpFusedOffsetOutputInt8Dataset(), + make("DataType", { DataType::QASYMM8_SIGNED }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); @@ -185,24 +191,24 @@ TEST_SUITE(QuantizeDownInt32Scale) TEST_SUITE(QASYMM8) -const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, - 3) - * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true }); +const auto quantize_down_int32_to_uint8_scale_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2) * make("result_shift", 2, 3) + * make("min", 0) * make("max", 255) * make("addBias", { false, true }); -const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, - 2) - * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 173) * framework::dataset::make("addBias", { false, true }); +const auto quantize_down_int32_to_uint8_scale_relu_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2) + * make("result_shift", 2, 3) * make("min", 0, 2) * make("max", 171, 173) * make("addBias", { false, true }); using 
CLGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases)) { // Validate output validate(CLAccessor(_target), _reference); } TEST_SUITE(BoundedReLu) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) { // Validate output validate(CLAccessor(_target), _reference); @@ -213,24 +219,24 @@ TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) -const auto quantize_down_int32_to_int8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, - 3) - * framework::dataset::make("min", -128) * framework::dataset::make("max", 127) * framework::dataset::make("addBias", { false, true }); +const auto quantize_down_int32_to_int8_scale_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2) * make("result_shift", 2, 3) + * make("min", -128) * make("max", 127) * make("addBias", { false, true }); -const auto quantize_down_int32_to_int8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, - 2) - * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", -100, -98) * framework::dataset::make("max", 71, 73) * framework::dataset::make("addBias", { false, true }); +const auto quantize_down_int32_to_int8_scale_relu_cases = make("result_offset", -2, 1) * make("result_mult_int", 1, 2) + * make("result_shift", 2, 3) * make("min", -100, -98) * make("max", 71, 73) * make("addBias", { false, true }); using CLGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToInt8ScaleValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_cases)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_cases)) { // Validate output validate(CLAccessor(_target), _reference); } TEST_SUITE(BoundedReLu) -FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_relu_cases)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_relu_cases)) { // Validate output validate(CLAccessor(_target), _reference); @@ -247,13 +253,14 @@ using CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture = GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture; FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(framework::dataset::make("DataType", DataType::QASYMM8), - 
datasets::TinyShapes()), - framework::dataset::make("result_real_multiplier", 0.33f)), - framework::dataset::make("result_offset", 2, 3)), - framework::dataset::make("min", 0)), - framework::dataset::make("max", 255)), - framework::dataset::make("addBias", { false, true }))) + combine( + make("DataType", DataType::QASYMM8), + datasets::TinyShapes(), + make("result_real_multiplier", 0.33f), + make("result_offset", 2, 3), + make("min", 0), + make("max", 255), + make("addBias", { false, true }))) { // Validate output validate(CLAccessor(_target), _reference); @@ -264,13 +271,14 @@ TEST_SUITE(QASYMM8_SIGNED) using CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture_Signed = GEMMLowpQuantizeDownInt32ScaleByFloatValidationFixture; FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMLowpQuantizeDownInt32ScaleByFloatFixture_Signed, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), - datasets::TinyShapes()), - framework::dataset::make("result_real_multiplier", 0.33f)), - framework::dataset::make("result_offset", 2, 3)), - framework::dataset::make("min", -128)), - framework::dataset::make("max", 127)), - framework::dataset::make("addBias", { false, true }))) + combine( + make("DataType", DataType::QASYMM8_SIGNED), + datasets::TinyShapes(), + make("result_real_multiplier", 0.33f), + make("result_offset", 2, 3), + make("min", -128), + make("max", 127), + make("addBias", { false, true }))) { // Validate output validate(CLAccessor(_target), _reference); diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index 46058bd148..9c4d1741eb 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,9 +50,12 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(GEMMLowp) TEST_SUITE(MatrixMultiplyCore) + using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture; using NEGEMMLowpBatchedMatMulFixture = GEMMLowpMatrixMultiplyCoreValidationFixture; +using framework::dataset::make; + DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()), shape_a, shape_b, shape_c, a_offset, b_offset) { @@ -80,26 +83,26 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::c // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( - framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4 +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4 TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Mismatching data type TensorInfo(TensorShape(20U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), }), - framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), + make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), - })), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), + }), + make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), TensorInfo(TensorShape(8U, 11U), 1, DataType::S32), TensorInfo(TensorShape(64U, 32U), 1, DataType::S32), - })), - framework::dataset::make("Expected", { true, false, false, false, true })), + }), + make("Expected", { true, false, false, false, true })), a_info, b_info, output_info, expected) { // Lock tensors @@ -231,9 +234,9 @@ using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned = TEST_SUITE(BatchedMatMul) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned(), - framework::dataset::make("DataType", { DataType::QASYMM8 })), - framework::dataset::make("bool", { false }))) + combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { false }))) { validate(Accessor(_target), _reference, tolerance_batched); } @@ -243,9 +246,9 @@ using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture; 
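A recurring pattern in these test changes is that nested dataset products such as combine(combine(combine(A, B), C), D) are collapsed into a single variadic combine(A, B, C, D), with framework::dataset::make shortened to make through a using-declaration. Both spellings enumerate the same cartesian product of test parameters. The standalone sketch below is only an illustration of that equivalence, not the library's dataset implementation: the helper names are hypothetical and plain std::vector values stand in for the framework's dataset types, with the variadic form defined as a left fold over a binary product.

    #include <iostream>
    #include <string>
    #include <vector>

    // Binary product of two parameter lists: every value of 'a' paired with every value of 'b'.
    std::vector<std::string> combine(const std::vector<std::string> &a, const std::vector<std::string> &b)
    {
        std::vector<std::string> out;
        for (const auto &x : a)
        {
            for (const auto &y : b)
            {
                out.push_back(x + ", " + y);
            }
        }
        return out;
    }

    // Variadic form: combine(a, b, c, ...) behaves like combine(combine(a, b), c, ...),
    // i.e. the flattened call is the nested call folded from the left.
    template <typename... Rest>
    std::vector<std::string> combine(const std::vector<std::string> &a, const std::vector<std::string> &b, const Rest &...rest)
    {
        return combine(combine(a, b), rest...);
    }

    int main()
    {
        const std::vector<std::string> shapes     = { "TensorShape(21U, 13U)", "TensorShape(31U, 3U)" };
        const std::vector<std::string> data_types = { "QASYMM8" };
        const std::vector<std::string> reshape_b  = { "true", "false" };

        // Prints the 2 x 1 x 2 = 4 parameter combinations a fixture test case would receive.
        for (const auto &config : combine(shapes, data_types, reshape_b))
        {
            std::cout << config << "\n";
        }
        return 0;
    }

This only models the combinatorial behaviour; the framework's combine additionally carries named values and dataset types rather than strings.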
TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL, - combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetSigned(), - framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })), - framework::dataset::make("bool", { false }))) + combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), + make("DataType", { DataType::QASYMM8_SIGNED }), + make("reshape_b_only_on_first_run", { false }))) { validate(Accessor(_target), _reference, tolerance_batched); } @@ -256,15 +259,17 @@ using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMulti constexpr AbsoluteTolerance tolerance_quant(1); TEST_SUITE(FusedOffsetOutput) -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 }))) +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, + combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), + make("DataType", { DataType::QASYMM8 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_quant); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(), - framework::dataset::make("DataType", { DataType::QASYMM8 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(), + make("DataType", { DataType::QASYMM8 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_quant); diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp index 49a38cd49c..8ca0bb53a7 100644 --- a/tests/validation/NEON/ReduceMean.cpp +++ b/tests/validation/NEON/ReduceMean.cpp @@ -46,8 +46,13 @@ constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value f #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC constexpr AbsoluteTolerance tolerance_f16(0.03f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */ #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef __aarch64__ constexpr AbsoluteTolerance tolerance_u8(1); /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */ +constexpr AbsoluteTolerance tolerance_s8(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */ +#else // __aarch64__ +constexpr AbsoluteTolerance tolerance_u8(2); /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */ constexpr AbsoluteTolerance tolerance_s8(2); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */ +#endif // __aarch64__ const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(2, 3), Coordinates(0, 1, 2, 3) }), framework::dataset::make("KeepDims", { true })); diff --git a/tests/validation/NEON/SoftmaxLayer.cpp 
b/tests/validation/NEON/SoftmaxLayer.cpp index b372bdf3fa..2397d81547 100644 --- a/tests/validation/NEON/SoftmaxLayer.cpp +++ b/tests/validation/NEON/SoftmaxLayer.cpp @@ -22,14 +22,12 @@ * SOFTWARE. */ #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "src/common/cpuinfo/CpuIsaInfo.h" #include "src/cpu/kernels/CpuSoftmaxKernel.h" #include "tests/NEON/Accessor.h" -#include "tests/PaddingCalculator.h" #include "tests/datasets/ShapeDatasets.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" @@ -42,6 +40,7 @@ namespace test { namespace validation { +using framework::dataset::make; namespace { /** Tolerance for float operations */ @@ -53,7 +52,7 @@ constexpr AbsoluteTolerance tolerance_qasymm8(1); constexpr AbsoluteTolerance tolerance_qasymm8_signed(1); /** CNN data types */ -const auto CNNDataTypes = framework::dataset::make("DataType", +const auto CNNDataTypes = make("DataType", { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC DataType::F16, @@ -66,53 +65,53 @@ TEST_SUITE(NEON) TEST_SUITE(SoftmaxLayer) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types - TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes - TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low - QuantizationInfo(1.f/256, 12)), - }), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), - TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), - TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - })), - framework::dataset::make("beta", { 1.0, - 2.0, - 1.0, - 2.0, - 1.0, - 1.0, - 2.0, - 1.0, - })), - framework::dataset::make("axis", { 0, - 0, - 0, - 1, - 0, - -1, - 2, - -3, - })), - framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })), - input_info, output_info, beta, axis, expected) +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types + TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + 
QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low + QuantizationInfo(1.f/256, 12)), + }), + make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), + TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + }), + make("beta", { 1.0, + 2.0, + 1.0, + 2.0, + 1.0, + 1.0, + 2.0, + 1.0, + }), + make("axis", { 0, + 0, + 0, + 1, + 0, + -1, + 2, + -3, + }), + make("Expected", { false, false, false, true, true, true, false, false })), + input_info, output_info, beta, axis, expected) { ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS); } @@ -122,54 +121,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( template using NESoftmaxLayerFixture = SoftmaxValidationFixture; -DATA_TEST_CASE(KernelSelection_max_logits, framework::DatasetMode::ALL, concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - })), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - }))), - cpu_ext, data_type) -{ - using namespace cpu::kernels; - - cpuinfo::CpuIsaInfo cpu_isa{}; - cpu_isa.neon = (cpu_ext == "NEON"); - cpu_isa.sve = (cpu_ext == "SVE"); - cpu_isa.fp16 = (data_type == DataType::F16); - - const auto *selected_impl = CpuLogits1DMaxKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); - - ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_logits_1d_max"; - std::string actual = selected_impl->name; - - ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); -} - -DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - })), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16 - }))), - combine(framework::dataset::make("CpuExt", std::string("SVE2")), - framework::dataset::make("DataType", { DataType::QASYMM8, - DataType::QASYMM8_SIGNED - }))), - cpu_ext, data_type) +DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, + concat(concat( + combine( + make("CpuExt", std::string("NEON")), + make("DataType", { DataType::F32, + DataType::F16, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED}) + ), + combine( + make("CpuExt", 
std::string("SVE")), + make("DataType", { DataType::F32, + DataType::F16})) + ), + combine( + make("CpuExt", std::string("SVE2")), + make("DataType", { DataType::QASYMM8, + DataType::QASYMM8_SIGNED})) + ), + cpu_ext, data_type) { using namespace cpu::kernels; @@ -179,11 +150,12 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca cpu_isa.sve2 = (cpu_ext == "SVE2"); cpu_isa.fp16 = (data_type == DataType::F16); - const auto *selected_impl = CpuLogits1DSoftmaxKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); + const auto *selected_impl = CpuSoftmaxKernel::get_implementation( + SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */ }, cpu::KernelSelectionType::Preferred); ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_softmax_logits_1d"; + std::string expected = "neon_" + cpu_impl_dt(data_type) + "_softmax"; std::string actual = selected_impl->name; ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); @@ -192,26 +164,32 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, 1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, 2, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 2, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); @@ -220,26 +198,30 @@ TEST_SUITE_END() //FP16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 
0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, -2, 3 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::Small4DShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -2, 3 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -252,29 +234,40 @@ using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, 1, -2 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0, 1, -2 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.0f }))), - 
framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }) + ), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -282,20 +275,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture, framew TEST_SUITE_END() //QASYMM8 TEST_SUITE(QASYMM8_SIGNED) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, 1, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, 1, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); diff --git a/tests/validation/UNIT/GPUTarget.cpp b/tests/validation/UNIT/GPUTarget.cpp index 5ec2592f00..2e64635b7a 100644 --- a/tests/validation/UNIT/GPUTarget.cpp +++ b/tests/validation/UNIT/GPUTarget.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -62,6 +62,8 @@ TEST_CASE(GetGPUTargetFromName, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G310") == GPUTarget::G310, framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G715") == GPUTarget::G715, framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G615") == GPUTarget::G615, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G720") == GPUTarget::G720, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G620") == GPUTarget::G620, framework::LogLevel::ERRORS); } TEST_CASE(GPUTargetIsIn, framework::DatasetMode::ALL) diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index 1492ac6945..a65a1e6bd8 100644 --- a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. 
+ * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE -#define ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_GEMMLOWPFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_GEMMLOWPFIXTURE_H #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/utils/quantization/AsymmHelpers.h" +#include "tests/validation/Helpers.h" #include "tests/framework/Fixture.h" #include "tests/validation/Validation.h" #include "tests/validation/reference/GEMMLowp.h" +#include +#include + namespace arm_compute { namespace test @@ -37,82 +42,46 @@ namespace validation { namespace { + template void fill(U &&tensor, int i) { - switch(tensor.data_type()) - { - case DataType::QSYMM8_PER_CHANNEL: - { - int min_bound = 128; - int max_bound = -127; - for(size_t j = 0; j < tensor.quantization_info().scale().size(); j++) - { - std::pair bounds = get_symm_quantized_per_channel_bounds(tensor.quantization_info(), -1.0f, 1.0f, i); - if(bounds.first < min_bound) - { - min_bound = bounds.first; - } - if(bounds.second > max_bound) - { - max_bound = bounds.second; - } - } - std::uniform_int_distribution distribution(min_bound, max_bound); - library->fill(tensor, distribution, i); - break; - } - case DataType::QASYMM8: - { - std::uniform_int_distribution distribution(1, 254); - library->fill(tensor, distribution, i); - break; - } - case DataType::S32: - { - std::uniform_int_distribution distribution(-20000, 20000); - library->fill(tensor, distribution, i); - break; - } - case DataType::F16: - { - arm_compute::utils::uniform_real_distribution_16bit distribution{ -1.0f, 1.0f }; - library->fill(tensor, distribution, i); - break; - } - case DataType::F32: - { - std::uniform_real_distribution distribution(-1.0f, 1.0f); - library->fill(tensor, distribution, i); - break; - } - default: - library->fill_tensor_uniform(tensor, i); - } + ARM_COMPUTE_ASSERT(is_data_type_quantized(tensor.data_type())); + library->fill_tensor_uniform(tensor, i); } +template +void fill_bias_s32(U &&tensor, int i, int32_t min, int32_t max) +{ + ARM_COMPUTE_ASSERT(tensor.data_type() == DataType::S32); + std::uniform_int_distribution distribution(min, max); + library->fill(tensor, distribution, i); +} + +/** Information about how to fill tensors */ +struct TensorFillInfo +{ + // Bias fill range. 
Default values are arbitrary + int32_t min_bias {-20000}; + int32_t max_bias {20000}; + // Optional extra hash to randomize tensor filling + int32_t hash {0}; +}; + template -TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset, - GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, - QuantizationInfo b_qinfo = QuantizationInfo(), bool reshape_b_only_on_first_run = false) +TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, + const QuantizationInfo& output_qinfo, DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, + GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo() ) { + ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a)); + ARM_COMPUTE_ASSERT(data_type_a == data_type_b); // Create tensors - DataType data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a; + const DataType data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a; - TensorType a = create_tensor(shape_a, data_type_a, 1); - TensorType b = create_tensor(shape_b, data_type_b, 1); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated - TensorType output = create_tensor(shape_output, data_type_output, 1); + TensorType a = create_tensor(shape_a, data_type_a, 1, a_qinfo); + TensorType b = create_tensor(shape_b, data_type_b, 1, b_qinfo); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated + TensorType output = create_tensor(shape_output, data_type_output, 1, output_qinfo /* output_qinfo will be ignored when output stage type is None */); - a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset)); - - if(data_type_b == DataType::QSYMM8_PER_CHANNEL) - { - b.info()->set_quantization_info(b_qinfo); - } - else - { - b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset)); - } TensorType bias; if(is_fused) { @@ -142,26 +111,26 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape ARM_COMPUTE_ASSERT(!output.info()->is_resizable()); // Fill tensors - fill(AccessorType(a), 0); - fill(AccessorType(b), 1); + fill(AccessorType(a), 0 + finfo.hash); + fill(AccessorType(b), 1 + finfo.hash); if(is_fused) { ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); bias.allocator()->allocate(); ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); - fill(AccessorType(bias), 2); + fill_bias_s32(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias); } // Run with variable inputs. 
    if(run_twice)
    {
        gemmlowp.run();
-        fill(AccessorType(a), 3); // Fill tensors with new seed after run
-        fill(AccessorType(b), 4);
+        fill(AccessorType(a), 3 + finfo.hash); // Fill tensors with new seed after run
+        fill(AccessorType(b), 4 + finfo.hash);
        if(is_fused)
        {
-            fill(AccessorType(bias), 5);
+            fill_bias_s32(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias);
        }
    }
@@ -171,9 +140,11 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape
 }
 template
-SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
-                                        DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, QuantizationInfo b_qinfo = QuantizationInfo())
+SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
+                                        DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, const TensorFillInfo& finfo = TensorFillInfo())
 {
+    ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a));
+    ARM_COMPUTE_ASSERT(data_type_a == data_type_b);
     TensorShape shape_a_to_use = shape_a;
     if(reinterpret_input_as_3d)
     {
@@ -182,8 +153,8 @@ SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, con
     }
     // Create reference
-    SimpleTensor a{ shape_a_to_use, data_type_a, 1 };
-    SimpleTensor b{ shape_b, data_type_b, 1, data_type_b == DataType::QSYMM8_PER_CHANNEL ? b_qinfo : QuantizationInfo(1.0f / 255, b_offset) };
+    SimpleTensor a{ shape_a_to_use, data_type_a, 1, a_qinfo };
+    SimpleTensor b{ shape_b, data_type_b, 1, b_qinfo };
     TensorShape shape_a_to_use_transposed{ shape_a_to_use };
     TensorShape shape_b_transposed{ shape_b };
@@ -193,12 +164,12 @@ SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, con
     shape_b_transposed.set(0, shape_b[1]);
     shape_b_transposed.set(1, shape_b[0]);
-    SimpleTensor a_transposed{ shape_a_to_use_transposed, data_type_a, 1 };
-    SimpleTensor b_transposed{ shape_b_transposed, data_type_b, 1, data_type_b == DataType::QSYMM8_PER_CHANNEL ? b_qinfo : QuantizationInfo(1.0f / 255, b_offset) };
+    SimpleTensor a_transposed{ shape_a_to_use_transposed, data_type_a, 1, a_qinfo };
+    SimpleTensor b_transposed{ shape_b_transposed, data_type_b, 1, b_qinfo };
     // Fill reference
-    fill(a, 0);
-    fill(b, 1);
+    fill(a, 0 + finfo.hash);
+    fill(b, 1 + finfo.hash);
     // Transpose reference if required
     /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M),
@@ -216,16 +187,18 @@ SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, con
     }
     // Run with variable inputs.
+    const int32_t a_offset = a_qinfo.uniform().offset;
+    const int32_t b_offset = b_qinfo.uniform().offset;
     if(run_twice)
     {
         reference::gemmlowp_matrix_multiply_core((pretranspose_A ? a_transposed : a), (pretranspose_B ? b_transposed : b), shape_output, a_offset, b_offset);
-        fill((pretranspose_A) ? a_transposed : a, 3);
-        fill((pretranspose_B) ? b_transposed : b, 4);
+        fill((pretranspose_A) ? a_transposed : a, 3 + finfo.hash);
+        fill((pretranspose_B) ? b_transposed : b, 4 + finfo.hash);
     }
     return reference::gemmlowp_matrix_multiply_core((pretranspose_A ? a_transposed : a), (pretranspose_B ? b_transposed : b), shape_output, a_offset, b_offset);
 }
-}
+} // namespace
 template
 class GEMMLowpMatrixMultiplyCoreValidationFixture : public framework::Fixture
@@ -233,20 +206,22 @@ class GEMMLowpMatrixMultiplyCoreValidationFixture : public framework::Fixture
 public:
     void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
     {
-        _target    = compute_target(shape_a, shape_b, shape_output, a_offset, b_offset);
-        _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, b_offset);
+        const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset);
+        const auto b_qinfo = QuantizationInfo(1.0f / 255, b_offset);
+        _target    = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo);
+        _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo);
     }
 protected:
-    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo)
     {
-        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_offset,
-                                       b_offset);
+        const auto output_qinfo = QuantizationInfo(); // No output stage
+        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo);
     }
-    SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
+    SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo)
     {
-        return compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_offset, b_offset);
+        return compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo);
     }
     TensorType _target{};
@@ -257,54 +232,138 @@ template
+    static void setup_quantization(DataType data_type, const TensorShape& shape_a, const TensorShape& shape_b, QuantizationInfo& a_qinfo, QuantizationInfo& b_qinfo, QuantizationInfo& output_qinfo, TensorFillInfo& finfo)
+    {
+        // This hash is used by the random generators. There may be hash collisions, but
+        // this is intentional as it is a very easy way to make the random generation
+        // process differ across test configurations which previously all used the
+        // same set of values.
+        finfo.hash = shape_a[0] + shape_a[1] + shape_b[0] + shape_b[1];
+
+        const int32_t t_max = static_cast(std::numeric_limits::max());
+        const int32_t t_min = static_cast(std::numeric_limits::min());
+
+        std::mt19937 generator(library->seed() + finfo.hash);
+        std::uniform_real_distribution distribution_float(-5.0f, 3.0f);
+        std::uniform_int_distribution distribution_t(t_min, t_max);
+
+        const float scale_lhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+        const float scale_rhs = pow(2, distribution_float(generator)); // [2^-5, 2^3]
+
+        const int32_t offset_lhs = distribution_t(generator);
+        const int32_t offset_rhs = distribution_t(generator);
+
+        a_qinfo = QuantizationInfo(scale_lhs, offset_lhs);
+        b_qinfo = QuantizationInfo(scale_rhs, offset_rhs);
+
+        // reinterpret_input_as_3d or reinterpret_output_as_3d can be ignored, as the underlying gemm / matmul computation
+        // is equivalent to a standard 2D one with m-n-k dimensions
+        const int m = shape_a.y();
+        const int n = shape_b.x();
+        const int k = shape_a.x();
+
+        const float bias_fraction = 0.5f; // is_fused is enabled in compute_gemmlowp_target below, so the bias is included
+
+        QuantizationHint q_hint = suggest_matmul_dst_q_info_and_bias(a_qinfo, b_qinfo, m, n, k, data_type, bias_fraction);
+        output_qinfo   = q_hint.q_info;
+        finfo.min_bias = q_hint.bias_min;
+        finfo.max_bias = q_hint.bias_max;
+
+        // Both the target and the reference implementation use negated offsets, i.e.
+        //     float_val = (int_val + offset) * scale
+        // instead of the usual
+        //     float_val = (int_val - offset) * scale
+        // Therefore, after calculating the output quantization above, we negate the
+        // inputs' offsets.
+        a_qinfo = QuantizationInfo(scale_lhs, -offset_lhs);
+        b_qinfo = QuantizationInfo(scale_rhs, -offset_rhs);
+    }
+
+    /** Initialize output stage info from quantization info */
+    static Status init_gemmlowp_output_stage_info(
+        DataType                data_type,
+        const QuantizationInfo& a_qinfo,
+        const QuantizationInfo& b_qinfo,
+        const QuantizationInfo& output_qinfo,
+        GEMMLowpOutputStageType type,
+        GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_quantized_asymmetric(data_type));
+
+        const UniformQuantizationInfo aq_unif = a_qinfo.uniform();
+        const UniformQuantizationInfo bq_unif = b_qinfo.uniform();
+        const UniformQuantizationInfo oq_unif = output_qinfo.uniform();
+
+        float   multiplier = (aq_unif.scale * bq_unif.scale) / oq_unif.scale;
+        int32_t int_multiplier;
+        int32_t shift;
+
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &int_multiplier, &shift));
+
+        int32_t type_min = 0;
+        int32_t type_max = 0;
+        std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(output_qinfo, ActivationLayerInfo(), data_type);
+
+        gemmlowp_output_stage_info.gemmlowp_real_multiplier = multiplier;
+        gemmlowp_output_stage_info.gemmlowp_multiplier      = int_multiplier;
+        gemmlowp_output_stage_info.gemmlowp_multipliers     = { int_multiplier };
+        gemmlowp_output_stage_info.gemmlowp_shift           = shift;
+        gemmlowp_output_stage_info.gemmlowp_shifts          = { shift };
+        gemmlowp_output_stage_info.gemmlowp_offset          = oq_unif.offset;
+        gemmlowp_output_stage_info.type                     = type;
+        gemmlowp_output_stage_info.gemmlowp_min_bound       = type_min;
+        gemmlowp_output_stage_info.gemmlowp_max_bound       = type_max;
+
+        return Status{};
+    }
+
+    /** Currently this fixture only tests the following data type configurations:
+     *
+     *  1. a and b are of the same data type
+     *  2. The data type is quantized asymmetric
+     *
+     */
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type, bool reshape_b_only_on_first_run)
     {
-        ARM_COMPUTE_ASSERT(output_stage.type != GEMMLowpOutputStageType::NONE);
-        DataType data_type_a = data_type_b == DataType::QASYMM8_SIGNED ? DataType::QASYMM8_SIGNED : DataType::QASYMM8;
+        ARM_COMPUTE_ASSERT(output_stage_type != GEMMLowpOutputStageType::NONE);
+        ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type));
-        if(data_type_b == DataType::QSYMM8_PER_CHANNEL)
-        {
-            output_stage.is_quantized_per_channel = true;
-            const size_t num_channels = shape_b[0];
-            std::vector scales(num_channels);
-            std::uniform_real_distribution distribution(0.f, 1.f);
-            library->fill(scales, distribution, 0);
-            output_stage.gemmlowp_multipliers.resize(num_channels);
-            output_stage.gemmlowp_shifts.resize(num_channels);
-            for(size_t i = 0; i < num_channels; ++i)
-            {
-                quantization::calculate_quantized_multiplier(scales[i], &output_stage.gemmlowp_multipliers[i], &output_stage.gemmlowp_shifts[i]);
-            }
+        // Randomized dynamic quantization: randomize the quantization info in a way that avoids
+        // result saturation most of the time
+        QuantizationInfo a_qinfo;
+        QuantizationInfo b_qinfo;
+        QuantizationInfo output_qinfo;
+        TensorFillInfo   finfo;
+        setup_quantization(data_type, shape_a, shape_b, a_qinfo, b_qinfo, output_qinfo, finfo);
-            _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, 0, output_stage, data_type_a, data_type_b, QuantizationInfo(scales));
-            _target    = compute_target(shape_a, shape_b, shape_output, a_offset, 0, output_stage, data_type_a, data_type_b, QuantizationInfo(scales), reshape_b_only_on_first_run);
-        }
-        else
-        {
-            _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage, data_type_a, data_type_b, QuantizationInfo());
-            _target    = compute_target(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage, data_type_a, data_type_b, QuantizationInfo(), reshape_b_only_on_first_run);
-        }
+        GEMMLowpOutputStageInfo output_stage;
+        init_gemmlowp_output_stage_info(data_type, a_qinfo, b_qinfo, output_qinfo, output_stage_type, output_stage);
+
+        _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type, data_type, output_stage, finfo);
+        _target    = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, data_type, data_type, output_stage, reshape_b_only_on_first_run, finfo);
     }
 protected:
-    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage,
-                              DataType data_type_a, DataType data_type_b, QuantizationInfo b_qinfo, bool reshape_b_only_on_first_run = false)
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const QuantizationInfo& output_qinfo,
+                              DataType data_type_a, DataType data_type_b, const GEMMLowpOutputStageInfo& output_stage, bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo())
     {
-        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_offset,
-                                       b_offset,
-                                       output_stage, data_type_a, data_type_b, b_qinfo, reshape_b_only_on_first_run);
+        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_qinfo,
+                                       b_qinfo, output_qinfo,
+                                       data_type_a, data_type_b, output_stage, reshape_b_only_on_first_run, finfo);
     }
-    SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
-                                   GEMMLowpOutputStageInfo output_stage, DataType data_type_a, DataType data_type_b, QuantizationInfo b_qinfo)
+    SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
+                                   DataType data_type_a, DataType data_type_b, const GEMMLowpOutputStageInfo& output_stage, const TensorFillInfo& finfo = TensorFillInfo())
     {
-        SimpleTensor output = compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_offset, b_offset, data_type_a, data_type_b,
-                                                         b_qinfo);
+        SimpleTensor output = compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type_a, data_type_b, finfo);
         TensorShape bias_shape(shape_b[0]);
         SimpleTensor bias{ bias_shape, DataType::S32, 1 };
-        (run_twice) ? fill(bias, 5) : fill(bias, 2); // Fill bias with same seed as last run of gemmlowp_target
+        (run_twice) ? fill_bias_s32(bias, 5 + finfo.hash, finfo.min_bias, finfo.max_bias) : fill_bias_s32(bias, 2 + finfo.hash, finfo.min_bias, finfo.max_bias); // Fill bias with same seed as last run of gemmlowp_target
         switch(output_stage.type)
         {
@@ -330,10 +389,10 @@ class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture
 {
 public:
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage, DataType data_type_b)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type)
     {
         GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture::setup(shape_a, shape_b,
-                                                                                   shape_output, a_offset, b_offset, output_stage, data_type_b, false);
+                                                                                   shape_output, output_stage_type, data_type, false /* reshape_b_only_on_first_run */);
     }
 };
@@ -2076,4 +2135,4 @@ class GEMMLowpMatrixMultiplyNative3DValidationFixture : public framework::Fixtur
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_GEMMLOWPFIXTURE_H
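Reviewer note (not part of the patch): init_gemmlowp_output_stage_info() above folds the real requantization multiplier (a_scale * b_scale) / out_scale into an integer multiplier plus shift via quantization::calculate_quantized_multiplier(). The sketch below only illustrates that decomposition under the assumption of a Q0.31 fixed-point representation; the helper name decompose_multiplier is made up and this is not the library's implementation.

// Illustrative sketch: split a real multiplier m into a Q0.31 fixed-point
// multiplier and a shift such that m ~= fixed_point * 2^-31 * 2^-shift.
#include <cmath>
#include <cstdint>
#include <utility>

std::pair<int32_t, int32_t> decompose_multiplier(float m)
{
    int   exponent = 0;
    float mantissa = std::frexp(m, &exponent); // m = mantissa * 2^exponent, mantissa in [0.5, 1)

    auto fixed_point = static_cast<int64_t>(std::round(mantissa * (1ll << 31)));
    if(fixed_point == (1ll << 31)) // rounding can push the mantissa up to exactly 1.0
    {
        fixed_point /= 2;
        ++exponent;
    }
    // A positive shift denotes a right shift after the fixed-point multiplication.
    return { static_cast<int32_t>(fixed_point), -exponent };
}

For example, decompose_multiplier(0.25f) yields {1 << 30, 1}: multiply by 0.5 in Q0.31, then shift right by one.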
diff --git a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
index 1e8820492a..20b678b36c 100644
--- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, 2023 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE
-#define ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_WINOGRADCONVOLUTIONLAYERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_WINOGRADCONVOLUTIONLAYERFIXTURE_H
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
@@ -229,7 +229,7 @@ class WinogradConvolutionLayerFastMathValidationFixture : public framework::Fixt
         SimpleTensor filter_transform_out = reference::winograd_filter_transform(weights_t1, filter_transform_shape, winograd_info);
         SimpleTensor batched_gemm         = reference::gemm(input_transform_out, filter_transform_out, dummy_c, 1.0f, 0.0f);
         SimpleTensor conv_out             = reference::winograd_output_transform(batched_gemm, bias_t1, output_transform_shape, winograd_info);
-        SimpleTensor conv_out_t(std::move(copy_tensor(conv_out)));
+        SimpleTensor conv_out_t(copy_tensor(conv_out));
         return (act_info.enabled()) ? reference::activation_layer(conv_out_t, act_info) : conv_out_t;
     }
@@ -584,4 +584,4 @@ class WinogradOutputTransformValidationFixture : public framework::Fixture
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE */
\ No newline at end of file
+#endif // ACL_TESTS_VALIDATION_FIXTURES_WINOGRADCONVOLUTIONLAYERFIXTURE_H
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 4f14d985af..e8831a354c 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -2230,6 +2230,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GPUTarget &gpu_targe
         case GPUTarget::VALHALL:
             os << "VALHALL";
             break;
+        case GPUTarget::FIFTHGEN:
+            os << "FIFTHGEN";
+            break;
         case GPUTarget::T600:
             os << "T600";
             break;
@@ -2299,6 +2302,12 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GPUTarget &gpu_targe
         case GPUTarget::G615:
             os << "G615";
             break;
+        case GPUTarget::G720:
+            os << "G720";
+            break;
+        case GPUTarget::G620:
+            os << "G620";
+            break;
         default:
             ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
     }
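Reviewer note (not part of the patch): the operator<< cases added above make the new fifth-generation targets printable wherever a GPUTarget is streamed. A minimal usage sketch, assuming the header is reachable as utils/TypePrinter.h on the include path (the helper name print_target is illustrative):

#include <sstream>
#include <string>

#include "utils/TypePrinter.h"

// Convert a GPUTarget to its printable name via the streaming operator above,
// e.g. GPUTarget::G720 -> "G720".
inline std::string print_target(arm_compute::GPUTarget target)
{
    std::stringstream ss;
    ss << target;
    return ss.str();
}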