Skip to content

Commit

Permalink
+add AMX-BF16 (AVX-512VBMI) optimizations of function DeinterleaveUv.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Sep 24, 2024
1 parent a2926f6 commit 1b27b85
Show file tree
Hide file tree
Showing 32 changed files with 124 additions and 48 deletions.
1 change: 1 addition & 0 deletions docs/2024.html
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ <h5>New features</h5>
<ul>
<li>Base implementation of class SynetDeconvolution16bGemm.</li>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW, AMX-BF16 optimizations of class SynetDeconvolution16bNhwcGemm.</li>
<li>AMX-BF16 (AVX-512VBMI) optimizations of function DeinterleaveUv.</li>
</ul>

<h4>Test framework</h4>
Expand Down
1 change: 1 addition & 0 deletions prj/vs2019/AmxBf16.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
<ClInclude Include="..\..\src\Simd\SimdUpdate.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\Simd\SimdAmxBf16Deinterleave.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16b.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16bNchwGemm.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16bNhwcDirect.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions prj/vs2019/AmxBf16.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -257,5 +257,8 @@
<ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetDeconvolution16b.cpp">
<Filter>AmxBf16</Filter>
</ClCompile>
<ClCompile Include="..\..\src\Simd\SimdAmxBf16Deinterleave.cpp">
<Filter>AmxBf16</Filter>
</ClCompile>
</ItemGroup>
</Project>
1 change: 0 additions & 1 deletion prj/vs2019/Avx2.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@
<ClInclude Include="..\..\src\Simd\SimdAllocator.hpp" />
<ClInclude Include="..\..\src\Simd\SimdAlphaBlending.h" />
<ClInclude Include="..\..\src\Simd\SimdArray.h" />
<ClInclude Include="..\..\src\Simd\SimdAvx1.h" />
<ClInclude Include="..\..\src\Simd\SimdAvx2.h" />
<ClInclude Include="..\..\src\Simd\SimdBase.h" />
<ClInclude Include="..\..\src\Simd\SimdBase64.h" />
Expand Down
3 changes: 0 additions & 3 deletions prj/vs2019/Avx2.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -419,9 +419,6 @@
<ClInclude Include="..\..\src\Simd\SimdArray.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdAvx1.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdBase.h">
<Filter>Inc</Filter>
</ClInclude>
Expand Down
1 change: 0 additions & 1 deletion prj/vs2019/Avx512bw.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@
<ClInclude Include="..\..\src\Simd\SimdAllocator.hpp" />
<ClInclude Include="..\..\src\Simd\SimdAlphaBlending.h" />
<ClInclude Include="..\..\src\Simd\SimdArray.h" />
<ClInclude Include="..\..\src\Simd\SimdAvx1.h" />
<ClInclude Include="..\..\src\Simd\SimdAvx2.h" />
<ClInclude Include="..\..\src\Simd\SimdAvx512bw.h" />
<ClInclude Include="..\..\src\Simd\SimdBase.h" />
Expand Down
3 changes: 0 additions & 3 deletions prj/vs2019/Avx512bw.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -620,9 +620,6 @@
<ClInclude Include="..\..\src\Simd\SimdSynetInnerProduct32f.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdAvx1.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdTranspose.h">
<Filter>Inc</Filter>
</ClInclude>
Expand Down
1 change: 0 additions & 1 deletion prj/vs2019/Simd.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@
<ClInclude Include="..\..\src\Simd\SimdMemory.h" />
<ClInclude Include="..\..\src\Simd\SimdMemoryStream.h" />
<ClInclude Include="..\..\src\Simd\SimdMotion.hpp" />
<ClInclude Include="..\..\src\Simd\SimdMsa.h" />
<ClInclude Include="..\..\src\Simd\SimdNeon.h" />
<ClInclude Include="..\..\src\Simd\SimdNeural.hpp" />
<ClInclude Include="..\..\src\Simd\SimdParallel.hpp" />
Expand Down
3 changes: 0 additions & 3 deletions prj/vs2019/Simd.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,6 @@
<ClInclude Include="..\..\src\Simd\SimdMemory.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdMsa.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdNeon.h">
<Filter>Inc</Filter>
</ClInclude>
Expand Down
1 change: 1 addition & 0 deletions prj/vs2022/AmxBf16.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
<ClInclude Include="..\..\src\Simd\SimdUpdate.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\Simd\SimdAmxBf16Deinterleave.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16b.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16bNchwGemm.cpp" />
<ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetConvolution16bNhwcDirect.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions prj/vs2022/AmxBf16.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -257,5 +257,8 @@
<ClCompile Include="..\..\src\Simd\SimdAmxBf16SynetDeconvolution16b.cpp">
<Filter>AmxBf16</Filter>
</ClCompile>
<ClCompile Include="..\..\src\Simd\SimdAmxBf16Deinterleave.cpp">
<Filter>AmxBf16</Filter>
</ClCompile>
</ItemGroup>
</Project>
1 change: 0 additions & 1 deletion prj/vs2022/Avx2.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@
<ClInclude Include="..\..\src\Simd\SimdAllocator.hpp" />
<ClInclude Include="..\..\src\Simd\SimdAlphaBlending.h" />
<ClInclude Include="..\..\src\Simd\SimdArray.h" />
<ClInclude Include="..\..\src\Simd\SimdAvx1.h" />
<ClInclude Include="..\..\src\Simd\SimdAvx2.h" />
<ClInclude Include="..\..\src\Simd\SimdBase.h" />
<ClInclude Include="..\..\src\Simd\SimdBase64.h" />
Expand Down
3 changes: 0 additions & 3 deletions prj/vs2022/Avx2.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -419,9 +419,6 @@
<ClInclude Include="..\..\src\Simd\SimdArray.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdAvx1.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdBase.h">
<Filter>Inc</Filter>
</ClInclude>
Expand Down
1 change: 0 additions & 1 deletion prj/vs2022/Avx512bw.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@
<ClInclude Include="..\..\src\Simd\SimdAllocator.hpp" />
<ClInclude Include="..\..\src\Simd\SimdAlphaBlending.h" />
<ClInclude Include="..\..\src\Simd\SimdArray.h" />
<ClInclude Include="..\..\src\Simd\SimdAvx1.h" />
<ClInclude Include="..\..\src\Simd\SimdAvx2.h" />
<ClInclude Include="..\..\src\Simd\SimdAvx512bw.h" />
<ClInclude Include="..\..\src\Simd\SimdBase.h" />
Expand Down
3 changes: 0 additions & 3 deletions prj/vs2022/Avx512bw.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -620,9 +620,6 @@
<ClInclude Include="..\..\src\Simd\SimdSynetInnerProduct32f.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdAvx1.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdTranspose.h">
<Filter>Inc</Filter>
</ClInclude>
Expand Down
1 change: 0 additions & 1 deletion prj/vs2022/Simd.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@
<ClInclude Include="..\..\src\Simd\SimdMemory.h" />
<ClInclude Include="..\..\src\Simd\SimdMemoryStream.h" />
<ClInclude Include="..\..\src\Simd\SimdMotion.hpp" />
<ClInclude Include="..\..\src\Simd\SimdMsa.h" />
<ClInclude Include="..\..\src\Simd\SimdNeon.h" />
<ClInclude Include="..\..\src\Simd\SimdNeural.hpp" />
<ClInclude Include="..\..\src\Simd\SimdParallel.hpp" />
Expand Down
3 changes: 0 additions & 3 deletions prj/vs2022/Simd.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,6 @@
<ClInclude Include="..\..\src\Simd\SimdMemory.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdMsa.h">
<Filter>Inc</Filter>
</ClInclude>
<ClInclude Include="..\..\src\Simd\SimdNeon.h">
<Filter>Inc</Filter>
</ClInclude>
Expand Down
2 changes: 2 additions & 0 deletions src/Simd/SimdAmxBf16.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ namespace Simd
#ifdef SIMD_AMXBF16_ENABLE
namespace AmxBf16
{
void DeinterleaveUv(const uint8_t* uv, size_t uvStride, size_t width, size_t height, uint8_t* u, size_t uStride, uint8_t* v, size_t vStride);

void Float32ToBFloat16(const float* src, size_t size, uint16_t* dst);

void ChangeColors(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* colors, uint8_t* dst, size_t dstStride);
Expand Down
79 changes: 79 additions & 0 deletions src/Simd/SimdAmxBf16Deinterleave.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2021 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "Simd/SimdMemory.h"
#include "Simd/SimdStore.h"
#include "Simd/SimdConversion.h"

namespace Simd
{
#ifdef SIMD_AMXBF16_ENABLE
namespace AmxBf16
{
const __m512i K8_PERM_UV_TO_U = SIMD_MM512_SETR_EPI8(
0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E,
0x20, 0x22, 0x24, 0x26, 0x28, 0x2A, 0x2C, 0x2E, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3A, 0x3C, 0x3E,
0x40, 0x42, 0x44, 0x46, 0x48, 0x4A, 0x4C, 0x4E, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5A, 0x5C, 0x5E,
0x60, 0x62, 0x64, 0x66, 0x68, 0x6A, 0x6C, 0x6E, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7A, 0x7C, 0x7E);

const __m512i K8_PERM_UV_TO_V = SIMD_MM512_SETR_EPI8(
0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F,
0x21, 0x23, 0x25, 0x27, 0x29, 0x2B, 0x2D, 0x2F, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3B, 0x3D, 0x3F,
0x41, 0x43, 0x45, 0x47, 0x49, 0x4B, 0x4D, 0x4F, 0x51, 0x53, 0x55, 0x57, 0x59, 0x5B, 0x5D, 0x5F,
0x61, 0x63, 0x65, 0x67, 0x69, 0x6B, 0x6D, 0x6F, 0x71, 0x73, 0x75, 0x77, 0x79, 0x7B, 0x7D, 0x7F);

SIMD_INLINE void DeinterleaveUv(const uint8_t * uv, uint8_t * u, uint8_t * v)
{
const __m512i uv0 = _mm512_loadu_si512(uv + 0);
const __m512i uv1 = _mm512_loadu_si512(uv + A);
_mm512_storeu_si512(u, _mm512_permutex2var_epi8(uv0, K8_PERM_UV_TO_U, uv1));
_mm512_storeu_si512(v, _mm512_permutex2var_epi8(uv0, K8_PERM_UV_TO_V, uv1));
}

SIMD_INLINE void DeinterleaveUv(const uint8_t* uv, uint8_t* u, uint8_t* v, __mmask64 tail)
{
const __m512i uv0 = _mm512_maskz_loadu_epi16(__mmask32(tail >> 00), (uint16_t*)(uv + 0));
const __m512i uv1 = _mm512_maskz_loadu_epi16(__mmask32(tail >> 32), (uint16_t*)(uv + A));
_mm512_mask_storeu_epi8(u, tail, _mm512_permutex2var_epi8(uv0, K8_PERM_UV_TO_U, uv1));
_mm512_mask_storeu_epi8(v, tail, _mm512_permutex2var_epi8(uv0, K8_PERM_UV_TO_V, uv1));
}

void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
{
size_t widthA = AlignLo(width, A);
__mmask64 tail = TailMask64(width - widthA);
for (size_t row = 0; row < height; ++row)
{
size_t col = 0;
for (; col < widthA; col += A)
DeinterleaveUv(uv + col * 2, u + col, v + col);
if (col < width)
DeinterleaveUv(uv + col * 2, u + col, v + col, tail);
uv += uvStride;
u += uStride;
v += vStride;
}
}
}
#endif
}
2 changes: 1 addition & 1 deletion src/Simd/SimdArray.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,4 @@ namespace Simd
#endif
}

#endif//__SimdArray_h__
#endif
6 changes: 3 additions & 3 deletions src/Simd/SimdAvx512bwDeinterleave.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ namespace Simd
DeinterleaveUv<false>(uv, uvStride, width, height, u, uStride, v, vStride);
}

//---------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------

const __m512i K8_SHUFFLE_DEINTERLEAVE_BGR = SIMD_MM512_SETR_EPI8(
0x0, 0x3, 0x6, 0x9, 0x1, 0x4, 0x7, 0xA, 0x2, 0x5, 0x8, 0xB, -1, -1, -1, -1,
Expand Down Expand Up @@ -162,7 +162,7 @@ namespace Simd
DeinterleaveBgr<false>(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
}

//---------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------

const __m512i K8_SHUFFLE_BGRA = SIMD_MM512_SETR_EPI8(
0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF,
Expand Down Expand Up @@ -249,5 +249,5 @@ namespace Simd
DeinterleaveBgra<false>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
}
}
#endif// SIMD_AVX512BW_ENABLE
#endif
}
2 changes: 1 addition & 1 deletion src/Simd/SimdBaseSynetDeconvolution16b.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2023 Yermalayeu Ihar.
* Copyright (c) 2011-2024 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdBaseSynetDeconvolution16bNhwcGemm.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2023 Yermalayeu Ihar.
* Copyright (c) 2011-2024 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdBaseSynetDeconvolution32f.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2023 Yermalayeu Ihar.
* Copyright (c) 2011-2024 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down
5 changes: 5 additions & 0 deletions src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1714,6 +1714,11 @@ SIMD_API void SimdDeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t wid
uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
{
SIMD_EMPTY();
#ifdef SIMD_AMXBF16_ENABLE
if (AmxBf16::Enable)
AmxBf16::DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride);
else
#endif
#ifdef SIMD_AVX512BW_ENABLE
if (Avx512bw::Enable)
Avx512bw::DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride);
Expand Down
6 changes: 3 additions & 3 deletions src/Simd/SimdLib.h
Original file line number Diff line number Diff line change
Expand Up @@ -741,7 +741,7 @@ typedef struct SimdConvolutionParameters
#ifdef __cplusplus
extern "C"
{
#endif//__cplusplus
#endif

/*! @ingroup info
Expand Down Expand Up @@ -8861,6 +8861,6 @@ extern "C"
size_t width, size_t height, uint8_t* uyvy, size_t uyvyStride);
#ifdef __cplusplus
}
#endif // __cplusplus
#endif

#endif//__SimdLib_h__
#endif
4 changes: 2 additions & 2 deletions src/Simd/SimdLoad.h
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,6 @@ namespace Simd
return vextq_f32(Load<false>(p - 2), vdupq_n_f32(0.0f), 2);
}
}
#endif//SIMD_NEON_ENABLE
#endif
}
#endif//__SimdLoad_h__
#endif
12 changes: 6 additions & 6 deletions src/Simd/SimdLog.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ namespace Simd
Simd::Log<T>(buffer, n, name);
}
}
#endif //SIMD_SSE41_ENABLE
#endif

#ifdef SIMD_AVX2_ENABLE
namespace Avx2
Expand All @@ -96,7 +96,7 @@ namespace Simd
Simd::Log<T>(buffer, n, name);
}
}
#endif //SIMD_AVX2_ENABLE
#endif

#ifdef SIMD_AVX512BW_ENABLE
namespace Avx512bw
Expand Down Expand Up @@ -218,7 +218,7 @@ namespace Simd
std::cout << "} " << std::endl;
}
}
#endif//SIMD_NEON_ENABLE
#endif
}

#define SIMD_LOG(value) Log(value, #value)
Expand All @@ -235,7 +235,7 @@ namespace Simd
std::cerr << ss.str() << std::flush; \
}

#else//SIMD_LOG_ENABLE
#else

#define SIMD_LOG(value)
#define SIMD_LOG1(value)
Expand All @@ -244,6 +244,6 @@ namespace Simd

#define SIMD_LOG_ERROR(message)

#endif//SIMD_LOG_ENABLE
#endif

#endif//__SimdLog_h__
#endif
2 changes: 1 addition & 1 deletion src/Simd/SimdNeonSynetDeconvolution32f.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2023 Yermalayeu Ihar.
* Copyright (c) 2011-2024 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down
Loading

0 comments on commit 1b27b85

Please sign in to comment.