From e1681043b3a3477d950f0ed4989767abecf99a83 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Tue, 8 Aug 2023 09:15:37 +0300 Subject: [PATCH] +add AVX-512BW optimizations of functions Yuv422pToBgrV2, Yuv444pToBgrV2. --- docs/2023.html | 4 +- src/Simd/SimdAvx512bw.h | 6 ++ src/Simd/SimdAvx512bwYuvToBgrV2.cpp | 136 ++++++++++++++++++++++++++++ src/Simd/SimdLib.cpp | 20 ++-- src/Test/TestYuvToAny.cpp | 16 ++-- 5 files changed, 162 insertions(+), 20 deletions(-) diff --git a/docs/2023.html b/docs/2023.html index e1792ccbad..b2481a67a4 100644 --- a/docs/2023.html +++ b/docs/2023.html @@ -38,8 +38,8 @@

Algorithms

New features

Test framework

diff --git a/src/Simd/SimdAvx512bw.h b/src/Simd/SimdAvx512bw.h index 466117874c..56ed5b0388 100644 --- a/src/Simd/SimdAvx512bw.h +++ b/src/Simd/SimdAvx512bw.h @@ -678,9 +678,15 @@ namespace Simd void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride); + void Yuv422pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType); + void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride); + void Yuv444pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType); + void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha); diff --git a/src/Simd/SimdAvx512bwYuvToBgrV2.cpp b/src/Simd/SimdAvx512bwYuvToBgrV2.cpp index ff216d76b0..2550df0337 100644 --- a/src/Simd/SimdAvx512bwYuvToBgrV2.cpp +++ b/src/Simd/SimdAvx512bwYuvToBgrV2.cpp @@ -112,6 +112,142 @@ namespace Simd else Yuv420pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); } + + //------------------------------------------------------------------------------------------------- + + template SIMD_YUV_TO_BGR_INLINE void Yuv422pToBgrV2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, uint8_t* bgr, const __mmask64* tails) + { + __m512i _u = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(u, tails[0]))); + __m512i u0 = UnpackU8<0>(_u, _u); + __m512i u1 = UnpackU8<1>(_u, _u); + __m512i _v = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load(v, tails[0]))); + __m512i v0 = UnpackU8<0>(_v, _v); + __m512i v1 = UnpackU8<1>(_v, _v); + YuvToBgr(Load(y + 0, tails[1]), u0, v0, bgr + 0 * A, tails + 3); + YuvToBgr(Load(y + A, tails[2]), u1, v1, bgr + 3 * A, tails + 6); + } + + template void Yuv422pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* bgr, size_t bgrStride) + { + assert(width % 2 == 0); + if (align) + { + assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); + assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); + } + + width /= 2; + size_t alignedWidth = AlignLo(width, A); + size_t tail = width - alignedWidth; + __mmask64 tailMasks[9]; + tailMasks[0] = TailMask64(tail); + for (size_t i = 0; i < 2; ++i) + tailMasks[1 + i] = TailMask64(tail * 2 - A * i); + for (size_t i = 0; i < 6; ++i) + tailMasks[3 + i] = TailMask64(tail * 6 - A * i); + for (size_t row = 0; row < height; row += 1) + { + size_t col = 0; + for (; col < alignedWidth; col += A) + Yuv422pToBgrV2(y + col * 2, u + col, v + col, bgr + col * 6, tailMasks); + if (col < width) + Yuv422pToBgrV2(y + col * 2, u + col, v + col, bgr + col * 6, tailMasks); + y += yStride; + u += uStride; + v += vStride; + bgr += bgrStride; + } + } + + template void Yuv422pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType) + { + switch (yuvType) + { + case SimdYuvBt601: Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); break; + case SimdYuvBt709: Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); break; + case SimdYuvBt2020: Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); break; + case SimdYuvTrect871: Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); break; + default: + assert(0); + } + } + + void Yuv422pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType) + { + if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) + && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) + Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); + else + Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); + } + + //------------------------------------------------------------------------------------------------- + + template SIMD_YUV_TO_BGR_INLINE void Yuv444pToBgrV2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, uint8_t* bgr, const __mmask64* tails) + { + __m512i _y = Load(y, tails[0]); + __m512i _u = Load(u, tails[0]); + __m512i _v = Load(v, tails[0]); + YuvToBgr(_y, _u, _v, bgr, tails + 1); + } + + template void Yuv444pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* bgr, size_t bgrStride) + { + if (align) + { + assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); + assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); + } + + size_t alignedWidth = AlignLo(width, A); + size_t tail = width - alignedWidth; + __mmask64 tailMasks[4]; + tailMasks[0] = TailMask64(tail); + for (size_t i = 0; i < 3; ++i) + tailMasks[1 + i] = TailMask64(tail * 3 - A * i); + for (size_t row = 0; row < height; row += 1) + { + size_t col = 0; + for (; col < alignedWidth; col += A) + Yuv444pToBgrV2(y + col, u + col, v + col, bgr + col * 3, tailMasks); + if (col < width) + Yuv444pToBgrV2(y + col, u + col, v + col, bgr + col * 3, tailMasks); + y += yStride; + u += uStride; + v += vStride; + bgr += bgrStride; + } + } + + template void Yuv444pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType) + { + switch (yuvType) + { + case SimdYuvBt601: Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); break; + case SimdYuvBt709: Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); break; + case SimdYuvBt2020: Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); break; + case SimdYuvTrect871: Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride); break; + default: + assert(0); + } + } + + void Yuv444pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType) + { + if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) + && Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)) + Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); + else + Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); + } } #endif } diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp index 487d4b59ab..68faa551b8 100644 --- a/src/Simd/SimdLib.cpp +++ b/src/Simd/SimdLib.cpp @@ -7580,11 +7580,11 @@ SIMD_API void SimdYuv422pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType) { SIMD_EMPTY(); -//#ifdef SIMD_AVX512BW_ENABLE -// if (Avx512bw::Enable) -// Avx512bw::Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); -// else -//#endif +#ifdef SIMD_AVX512BW_ENABLE + if (Avx512bw::Enable) + Avx512bw::Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); + else +#endif #ifdef SIMD_AVX2_ENABLE if (Avx2::Enable && width >= Avx2::DA) Avx2::Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); @@ -7639,11 +7639,11 @@ SIMD_API void SimdYuv444pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType) { SIMD_EMPTY(); -//#ifdef SIMD_AVX512BW_ENABLE -// if (Avx512bw::Enable) -// Avx512bw::Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); -// else -//#endif +#ifdef SIMD_AVX512BW_ENABLE + if (Avx512bw::Enable) + Avx512bw::Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); + else +#endif #ifdef SIMD_AVX2_ENABLE if (Avx2::Enable && width >= Avx2::A) Avx2::Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType); diff --git a/src/Test/TestYuvToAny.cpp b/src/Test/TestYuvToAny.cpp index c8ddcc3ecc..f567c626e2 100644 --- a/src/Test/TestYuvToAny.cpp +++ b/src/Test/TestYuvToAny.cpp @@ -508,10 +508,10 @@ namespace Test result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv422pToBgrV2), FUNC_YUV2(SimdYuv422pToBgrV2), 2, 1); #endif -//#ifdef SIMD_AVX512BW_ENABLE -// if (Simd::Avx512bw::Enable) -// result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv422pToBgrV2), FUNC_YUV2(SimdYuv422pToBgrV2), 2, 1); -//#endif +#ifdef SIMD_AVX512BW_ENABLE + if (Simd::Avx512bw::Enable) + result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv422pToBgrV2), FUNC_YUV2(SimdYuv422pToBgrV2), 2, 1); +#endif //#ifdef SIMD_NEON_ENABLE // if (Simd::Neon::Enable) @@ -537,10 +537,10 @@ namespace Test result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv444pToBgrV2), FUNC_YUV2(SimdYuv444pToBgrV2), 1, 1); #endif -//#ifdef SIMD_AVX512BW_ENABLE -// if (Simd::Avx512bw::Enable) -// result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv444pToBgrV2), FUNC_YUV2(SimdYuv444pToBgrV2), 1, 1); -//#endif +#ifdef SIMD_AVX512BW_ENABLE + if (Simd::Avx512bw::Enable) + result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv444pToBgrV2), FUNC_YUV2(SimdYuv444pToBgrV2), 1, 1); +#endif //#ifdef SIMD_NEON_ENABLE // if (Simd::Neon::Enable)