Skip to content

Commit

Permalink
+add AVX-512BW optimizations of functions Yuv422pToBgrV2, Yuv444pToBg…
Browse files Browse the repository at this point in the history
…rV2.
  • Loading branch information
ermig1979 committed Aug 8, 2023
1 parent 3db19f6 commit e168104
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 20 deletions.
4 changes: 2 additions & 2 deletions docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ <h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function Yuv420pToBgrV2.</li>
<li>Base implementation, SSE4.1, AVX2 optimizations of function Yuv422pToBgrV2.</li>
<li>Base implementation, SSE4.1, AVX2 optimizations of function Yuv444pToBgrV2.</li>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function Yuv422pToBgrV2.</li>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function Yuv444pToBgrV2.</li>
</ul>

<h4>Test framework</h4>
Expand Down
6 changes: 6 additions & 0 deletions src/Simd/SimdAvx512bw.h
Original file line number Diff line number Diff line change
Expand Up @@ -678,9 +678,15 @@ namespace Simd
void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
size_t width, size_t height, uint8_t * bgr, size_t bgrStride);

// AVX-512BW variant of Yuv422pToBgrV2 (declared in SimdAvx512bw.h).
// Takes three separate planes (y, u, v), each with its own stride, the image width and height,
// the destination bgr buffer with its stride, and a SimdYuvType value selecting the conversion.
void Yuv422pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType);

void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
size_t width, size_t height, uint8_t * bgr, size_t bgrStride);

// AVX-512BW variant of Yuv444pToBgrV2 (declared in SimdAvx512bw.h).
// Takes three separate planes (y, u, v), each with its own stride, the image width and height,
// the destination bgr buffer with its stride, and a SimdYuvType value selecting the conversion.
void Yuv444pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType);

void Yuv420pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);

Expand Down
136 changes: 136 additions & 0 deletions src/Simd/SimdAvx512bwYuvToBgrV2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,142 @@ namespace Simd
else
Yuv420pToBgrV2<false>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
}

//-------------------------------------------------------------------------------------------------

// Converts one horizontal step of a YUV422P row to BGR.
// Reads one vector of U and one of V (both masked by tails[0]) and two vectors of Y
// (y and y + A, masked by tails[1] and tails[2]); writes two BGR groups at bgr and bgr + 3 * A,
// handing tails + 3 and tails + 6 to YuvToBgr as their mask sets.
// NOTE(review): UnpackU8<0>/<1> of a vector with itself presumably duplicates every chroma byte so that
// each U/V sample is paired with two Y samples - confirm against the UnpackU8 definition.
template <bool align, bool mask, class T> SIMD_YUV_TO_BGR_INLINE void Yuv422pToBgrV2(const uint8_t* y,
    const uint8_t* u, const uint8_t* v, uint8_t* bgr, const __mmask64* tails)
{
    // U plane: load, permute 64-bit lanes ahead of the unpack, then split into low/high halves.
    __m512i uSrc = Load<align, mask>(u, tails[0]);
    __m512i uPerm = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, uSrc);
    __m512i uLo = UnpackU8<0>(uPerm, uPerm);
    __m512i uHi = UnpackU8<1>(uPerm, uPerm);

    // V plane: identical treatment.
    __m512i vSrc = Load<align, mask>(v, tails[0]);
    __m512i vPerm = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, vSrc);
    __m512i vLo = UnpackU8<0>(vPerm, vPerm);
    __m512i vHi = UnpackU8<1>(vPerm, vPerm);

    // First Y vector goes with the low chroma halves, second with the high ones.
    YuvToBgr<align, mask, T>(Load<align, mask>(y, tails[1]), uLo, vLo, bgr, tails + 3);
    YuvToBgr<align, mask, T>(Load<align, mask>(y + A, tails[2]), uHi, vHi, bgr + 3 * A, tails + 6);
}

// Row driver for YUV422P -> BGR with the conversion type T fixed at compile time.
// Width must be even. Columns are counted in chroma samples (width / 2): each kernel call
// advances u/v by A bytes, y by 2 * A bytes and bgr by 6 * A bytes. A trailing partial step
// is handled by one masked kernel call per row.
template <bool align, class T> void Yuv422pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* bgr, size_t bgrStride)
{
    assert(width % 2 == 0);
    if (align)
    {
        assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
        assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride));
    }

    const size_t uvWidth = width / 2;
    const size_t uvBody = AlignLo(uvWidth, A);
    const size_t uvTail = uvWidth - uvBody;

    // Mask layout consumed by the kernel: [0] chroma load, [1..2] luma loads, [3..8] BGR stores.
    __mmask64 masks[9];
    masks[0] = TailMask64(uvTail);
    masks[1] = TailMask64(uvTail * 2);
    masks[2] = TailMask64(uvTail * 2 - A);
    for (size_t k = 0; k < 6; ++k)
        masks[3 + k] = TailMask64(uvTail * 6 - A * k);

    for (size_t row = 0; row < height; ++row)
    {
        size_t uvCol = 0;
        while (uvCol < uvBody)
        {
            Yuv422pToBgrV2<align, false, T>(y + uvCol * 2, u + uvCol, v + uvCol, bgr + uvCol * 6, masks);
            uvCol += A;
        }
        if (uvCol < uvWidth)
            Yuv422pToBgrV2<align, true, T>(y + uvCol * 2, u + uvCol, v + uvCol, bgr + uvCol * 6, masks);

        // Step every plane to the next row.
        y += yStride;
        u += uStride;
        v += vStride;
        bgr += bgrStride;
    }
}

// Maps the runtime yuvType onto the matching compile-time conversion class
// (Base::Bt601, Base::Bt709, Base::Bt2020 or Base::Trect871) and forwards all other arguments unchanged.
// Any other yuvType value trips assert(0) and performs no conversion.
template <bool align> void Yuv422pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType)
{
    if (yuvType == SimdYuvBt601)
        Yuv422pToBgrV2<align, Base::Bt601>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride);
    else if (yuvType == SimdYuvBt709)
        Yuv422pToBgrV2<align, Base::Bt709>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride);
    else if (yuvType == SimdYuvBt2020)
        Yuv422pToBgrV2<align, Base::Bt2020>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride);
    else if (yuvType == SimdYuvTrect871)
        Yuv422pToBgrV2<align, Base::Trect871>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride);
    else
        assert(0);
}

// Public AVX-512BW entry point for YUV422P -> BGR.
// Selects the aligned instantiation only when every plane pointer and every stride passes Aligned();
// otherwise the unaligned instantiation is used.
void Yuv422pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType)
{
    const bool allAligned =
        Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) &&
        Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride);
    if (!allAligned)
    {
        Yuv422pToBgrV2<false>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
        return;
    }
    Yuv422pToBgrV2<true>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
}

//-------------------------------------------------------------------------------------------------

// Converts one horizontal step of a YUV444P row to BGR.
// Loads one vector from each of the Y, U and V planes using the same mask (tails[0])
// and passes them to YuvToBgr together with the output pointer and the masks starting at tails + 1.
template <bool align, bool mask, class T> SIMD_YUV_TO_BGR_INLINE void Yuv444pToBgrV2(const uint8_t* y,
    const uint8_t* u, const uint8_t* v, uint8_t* bgr, const __mmask64* tails)
{
    const __mmask64 srcMask = tails[0];
    __m512i yVec = Load<align, mask>(y, srcMask);
    __m512i uVec = Load<align, mask>(u, srcMask);
    __m512i vVec = Load<align, mask>(v, srcMask);
    YuvToBgr<align, mask, T>(yVec, uVec, vVec, bgr, tails + 1);
}

// Row driver for YUV444P -> BGR with the conversion type T fixed at compile time.
// Each kernel call advances y/u/v by A bytes and bgr by 3 * A bytes. A trailing partial step
// is handled by one masked kernel call per row.
template <bool align, class T> void Yuv444pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* bgr, size_t bgrStride)
{
    if (align)
    {
        assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
        assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride));
    }

    const size_t body = AlignLo(width, A);
    const size_t rest = width - body;

    // Mask layout consumed by the kernel: [0] plane loads, [1..3] BGR stores.
    __mmask64 masks[4];
    masks[0] = TailMask64(rest);
    for (size_t k = 0; k < 3; ++k)
        masks[1 + k] = TailMask64(rest * 3 - A * k);

    for (size_t row = 0; row < height; ++row)
    {
        size_t x = 0;
        while (x < body)
        {
            Yuv444pToBgrV2<align, false, T>(y + x, u + x, v + x, bgr + x * 3, masks);
            x += A;
        }
        if (x < width)
            Yuv444pToBgrV2<align, true, T>(y + x, u + x, v + x, bgr + x * 3, masks);

        // Step every plane to the next row.
        y += yStride;
        u += uStride;
        v += vStride;
        bgr += bgrStride;
    }
}

// Maps the runtime yuvType onto the matching compile-time conversion class
// (Base::Bt601, Base::Bt709, Base::Bt2020 or Base::Trect871) and forwards all other arguments unchanged.
// Any other yuvType value trips assert(0) and performs no conversion.
template <bool align> void Yuv444pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType)
{
    if (yuvType == SimdYuvBt601)
        Yuv444pToBgrV2<align, Base::Bt601>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride);
    else if (yuvType == SimdYuvBt709)
        Yuv444pToBgrV2<align, Base::Bt709>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride);
    else if (yuvType == SimdYuvBt2020)
        Yuv444pToBgrV2<align, Base::Bt2020>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride);
    else if (yuvType == SimdYuvTrect871)
        Yuv444pToBgrV2<align, Base::Trect871>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride);
    else
        assert(0);
}

// Public AVX-512BW entry point for YUV444P -> BGR.
// Selects the aligned instantiation only when every plane pointer and every stride passes Aligned();
// otherwise the unaligned instantiation is used.
void Yuv444pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType)
{
    const bool allAligned =
        Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) &&
        Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride);
    if (!allAligned)
    {
        Yuv444pToBgrV2<false>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
        return;
    }
    Yuv444pToBgrV2<true>(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
}
}
#endif
}
20 changes: 10 additions & 10 deletions src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7580,11 +7580,11 @@ SIMD_API void SimdYuv422pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t
size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType)
{
SIMD_EMPTY();
//#ifdef SIMD_AVX512BW_ENABLE
// if (Avx512bw::Enable)
// Avx512bw::Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
// else
//#endif
#ifdef SIMD_AVX512BW_ENABLE
if (Avx512bw::Enable)
Avx512bw::Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
else
#endif
#ifdef SIMD_AVX2_ENABLE
if (Avx2::Enable && width >= Avx2::DA)
Avx2::Yuv422pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
Expand Down Expand Up @@ -7639,11 +7639,11 @@ SIMD_API void SimdYuv444pToBgrV2(const uint8_t* y, size_t yStride, const uint8_t
size_t width, size_t height, uint8_t* bgr, size_t bgrStride, SimdYuvType yuvType)
{
SIMD_EMPTY();
//#ifdef SIMD_AVX512BW_ENABLE
// if (Avx512bw::Enable)
// Avx512bw::Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
// else
//#endif
#ifdef SIMD_AVX512BW_ENABLE
if (Avx512bw::Enable)
Avx512bw::Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
else
#endif
#ifdef SIMD_AVX2_ENABLE
if (Avx2::Enable && width >= Avx2::A)
Avx2::Yuv444pToBgrV2(y, yStride, u, uStride, v, vStride, width, height, bgr, bgrStride, yuvType);
Expand Down
16 changes: 8 additions & 8 deletions src/Test/TestYuvToAny.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -508,10 +508,10 @@ namespace Test
result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv422pToBgrV2), FUNC_YUV2(SimdYuv422pToBgrV2), 2, 1);
#endif

//#ifdef SIMD_AVX512BW_ENABLE
// if (Simd::Avx512bw::Enable)
// result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv422pToBgrV2), FUNC_YUV2(SimdYuv422pToBgrV2), 2, 1);
//#endif
#ifdef SIMD_AVX512BW_ENABLE
if (Simd::Avx512bw::Enable)
result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv422pToBgrV2), FUNC_YUV2(SimdYuv422pToBgrV2), 2, 1);
#endif

//#ifdef SIMD_NEON_ENABLE
// if (Simd::Neon::Enable)
Expand All @@ -537,10 +537,10 @@ namespace Test
result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv444pToBgrV2), FUNC_YUV2(SimdYuv444pToBgrV2), 1, 1);
#endif

//#ifdef SIMD_AVX512BW_ENABLE
// if (Simd::Avx512bw::Enable)
// result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv444pToBgrV2), FUNC_YUV2(SimdYuv444pToBgrV2), 1, 1);
//#endif
#ifdef SIMD_AVX512BW_ENABLE
if (Simd::Avx512bw::Enable)
result = result && YuvToBgr2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv444pToBgrV2), FUNC_YUV2(SimdYuv444pToBgrV2), 1, 1);
#endif

//#ifdef SIMD_NEON_ENABLE
// if (Simd::Neon::Enable)
Expand Down

0 comments on commit e168104

Please sign in to comment.