Skip to content

Commit

Permalink
+add AVX-512BW, NEON optimizations of function Yuv422pToBgraV2.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Aug 8, 2023
1 parent f2ac83f commit 3dd6aab
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 21 deletions.
2 changes: 1 addition & 1 deletion docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ <h5>New features</h5>
<li>The mark of function SimdYuv420pToBgr as deprecated.</li>
<li>The mark of function SimdYuv422pToBgr as deprecated.</li>
<li>The mark of function SimdYuv444pToBgr as deprecated.</li>
<li>Base implementation, SSE4.1, AVX2 optimizations of function Yuv422pToBgraV2.</li>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW, NEON optimizations of function Yuv422pToBgraV2.</li>
</ul>

<h4>Test framework</h4>
Expand Down
3 changes: 3 additions & 0 deletions src/Simd/SimdAvx512bw.h
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,9 @@ namespace Simd
void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);

// AVX-512BW entry point of the planar YUV 4:2:2 -> BGRA conversion (V2 API).
// 'alpha' is the constant written to the alpha channel of every output pixel and
// 'yuvType' selects the conversion standard (BT.601, BT.709, BT.2020 or T-REC-871).
// 'width' must be even; strides are expressed in bytes.
void Yuv422pToBgraV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* bgra, size_t bgraStride, uint8_t alpha, SimdYuvType yuvType);

void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);

Expand Down
72 changes: 72 additions & 0 deletions src/Simd/SimdAvx512bwYuvToBgraV2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,78 @@ namespace Simd
else
Yuv444pToBgraV2<false>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
}
//-------------------------------------------------------------------------------------------------

// Converts one horizontal block of a YUV 4:2:2 row to BGRA: one vector of U, one vector of V
// and two consecutive vectors of Y are consumed, and two YuvToBgra calls produce the output.
//
// align - forwarded to Load/YuvToBgra to select aligned or unaligned memory access.
// mask  - when true the loads/stores are restricted by the masks in 'tails' (row tail handling).
// T     - conversion standard traits forwarded to YuvToBgra.
// a     - vector holding the alpha value for the output pixels.
// tails - mask table: [0] chroma load, [1] and [2] the two luma loads,
//         [3...] masks handed to the first YuvToBgra call, [7...] masks handed to the second one.
template <bool align, bool mask, class T> SIMD_YUV_TO_BGR_INLINE void Yuv422pToBgraV2(const uint8_t* y,
    const uint8_t* u, const uint8_t* v, const __m512i& a, uint8_t* bgra, const __mmask64* tails)
{
    const __mmask64 chromaMask = tails[0];

    // The 64-bit lanes are permuted before unpacking; presumably this compensates for the
    // per-128-bit-lane behaviour of the unpack so that the samples stay in source order.
    const __m512i uPermuted = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load<align, mask>(u, chromaMask)));
    const __m512i vPermuted = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_UNPACK, (Load<align, mask>(v, chromaMask)));

    // Unpacking a vector with itself doubles every chroma byte, so that each chroma sample
    // is paired with two horizontally adjacent luma samples.
    const __m512i uLo = UnpackU8<0>(uPermuted, uPermuted);
    const __m512i uHi = UnpackU8<1>(uPermuted, uPermuted);
    const __m512i vLo = UnpackU8<0>(vPermuted, vPermuted);
    const __m512i vHi = UnpackU8<1>(vPermuted, vPermuted);

    // First luma vector -> first part of the BGRA output.
    const __m512i yLo = Load<align, mask>(y + 0, tails[1]);
    YuvToBgra<align, mask, T>(yLo, uLo, vLo, a, bgra + 0, tails + 3);

    // Second luma vector -> BGRA output QA bytes further.
    const __m512i yHi = Load<align, mask>(y + A, tails[2]);
    YuvToBgra<align, mask, T>(yHi, uHi, vHi, a, bgra + QA, tails + 7);
}

template <bool align, class T> void Yuv422pToBgraV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
{
assert(width % 2 == 0);
if (align)
{
assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride));
}

__m512i a = _mm512_set1_epi8(alpha);
width /= 2;
size_t alignedWidth = AlignLo(width, A);
size_t tail = width - alignedWidth;
__mmask64 tailMasks[11];
tailMasks[0] = TailMask64(tail);
for (size_t i = 0; i < 2; ++i)
tailMasks[1 + i] = TailMask64(tail * 2 - A * i);
for (size_t i = 0; i < 8; ++i)
tailMasks[3 + i] = TailMask64(tail * 8 - A * i);
for (size_t row = 0; row < height; row += 1)
{
size_t col = 0;
for (; col < alignedWidth; col += A)
Yuv422pToBgraV2<align, false, T>(y + col * 2, u + col, v + col, a, bgra + col * 8, tailMasks);
if (col < width)
Yuv422pToBgraV2<align, true, T>(y + col * 2, u + col, v + col, a, bgra + col * 8, tailMasks);
y += yStride;
u += uStride;
v += vStride;
bgra += bgraStride;
}
}

// Selects the conversion standard at run time and forwards to the matching template
// instantiation. An unknown 'yuvType' triggers assert(0) and performs no conversion.
template <bool align> void Yuv422pToBgraV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* bgra, size_t bgraStride, uint8_t alpha, SimdYuvType yuvType)
{
    if (yuvType == SimdYuvBt601)
        Yuv422pToBgraV2<align, Base::Bt601>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha);
    else if (yuvType == SimdYuvBt709)
        Yuv422pToBgraV2<align, Base::Bt709>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha);
    else if (yuvType == SimdYuvBt2020)
        Yuv422pToBgraV2<align, Base::Bt2020>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha);
    else if (yuvType == SimdYuvTrect871)
        Yuv422pToBgraV2<align, Base::Trect871>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha);
    else
        assert(0);
}

// Public AVX-512BW entry point: picks the aligned code path only when every plane pointer
// and every stride is aligned, otherwise falls back to the unaligned one.
void Yuv422pToBgraV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* bgra, size_t bgraStride, uint8_t alpha, SimdYuvType yuvType)
{
    const bool allAligned =
        Aligned(y) && Aligned(yStride) &&
        Aligned(u) && Aligned(uStride) &&
        Aligned(v) && Aligned(vStride) &&
        Aligned(bgra) && Aligned(bgraStride);

    if (allAligned)
    {
        Yuv422pToBgraV2<true>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
    }
    else
    {
        Yuv422pToBgraV2<false>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
    }
}

//-------------------------------------------------------------------------------------------------

Expand Down
20 changes: 10 additions & 10 deletions src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7757,11 +7757,11 @@ SIMD_API void SimdYuv422pToBgraV2(const uint8_t* y, size_t yStride, const uint8_
size_t width, size_t height, uint8_t* bgra, size_t bgraStride, uint8_t alpha, SimdYuvType yuvType)
{
SIMD_EMPTY();
//#ifdef SIMD_AVX512BW_ENABLE
// if (Avx512bw::Enable)
// Avx512bw::Yuv422pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
// else
//#endif
#ifdef SIMD_AVX512BW_ENABLE
if (Avx512bw::Enable)
Avx512bw::Yuv422pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
else
#endif
#ifdef SIMD_AVX2_ENABLE
if (Avx2::Enable && width >= Avx2::DA)
Avx2::Yuv422pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
Expand All @@ -7772,11 +7772,11 @@ SIMD_API void SimdYuv422pToBgraV2(const uint8_t* y, size_t yStride, const uint8_
Sse41::Yuv422pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
else
#endif
//#ifdef SIMD_NEON_ENABLE
// if (Neon::Enable && width >= Neon::DA)
// Neon::Yuv422pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
// else
//#endif
#ifdef SIMD_NEON_ENABLE
if (Neon::Enable && width >= Neon::DA)
Neon::Yuv422pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
else
#endif
Base::Yuv422pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
}

Expand Down
3 changes: 3 additions & 0 deletions src/Simd/SimdNeon.h
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,9 @@ namespace Simd
void Yuv422pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);

// NEON entry point of the planar YUV 4:2:2 -> BGRA conversion (V2 API).
// 'alpha' is the constant written to the alpha channel of every output pixel and
// 'yuvType' selects the conversion standard (BT.601, BT.709, BT.2020 or T-REC-871).
// 'width' must be even and not less than DA; strides are expressed in bytes.
void Yuv422pToBgraV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* bgra, size_t bgraStride, uint8_t alpha, SimdYuvType yuvType);

void Yuv444pToBgra(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride,
size_t width, size_t height, uint8_t * bgra, size_t bgraStride, uint8_t alpha);

Expand Down
67 changes: 66 additions & 1 deletion src/Simd/SimdNeonYuvToBgraV2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@ namespace Simd
else
Yuva444pToBgraV2<false>(y, yStride, u, uStride, v, vStride, a, aStride, width, height, bgra, bgraStride, yuvType);
}

//-------------------------------------------------------------------------------------------------

template <class T, bool align> SIMD_INLINE void Yuva422pToBgra(const uint8_t* y, const uint8x16x2_t& u, const uint8x16x2_t& v, const uint8x16_t& a, uint8_t* bgra)
Expand All @@ -109,6 +108,72 @@ namespace Simd
YuvToBgra<T, align>(Load<align>(y + A), u.val[1], v.val[1], a, bgra + QA);
}

template <bool align, class T> void Yuv422pToBgraV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
{
assert((width % 2 == 0) && (width >= DA));
if (align)
{
assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride));
assert(Aligned(bgra) && Aligned(bgraStride));
}

size_t bodyWidth = AlignLo(width, DA);
size_t tail = width - bodyWidth;
uint8x16_t _a = vdupq_n_u8(alpha);
uint8x16x2_t _u, _v;
for (size_t row = 0; row < height; row += 1)
{
for (size_t colUV = 0, colY = 0, colBgra = 0; colY < bodyWidth; colY += DA, colUV += A, colBgra += OA)
{
_u.val[1] = _u.val[0] = Load<align>(u + colUV);
_u = vzipq_u8(_u.val[0], _u.val[1]);
_v.val[1] = _v.val[0] = Load<align>(v + colUV);
_v = vzipq_u8(_v.val[0], _v.val[1]);
Yuva422pToBgra<T, align>(y + colY, _u, _v, _a, bgra + colBgra);
}
if (tail)
{
size_t offset = width - DA;
_u.val[1] = _u.val[0] = Load<false>(u + offset / 2);
_u = vzipq_u8(_u.val[0], _u.val[1]);
_v.val[1] = _v.val[0] = Load<false>(v + offset / 2);
_v = vzipq_u8(_v.val[0], _v.val[1]);
Yuva422pToBgra<T, false>(y + offset, _u, _v, _a, bgra + 4 * offset);
}
y += yStride;
u += uStride;
v += vStride;
bgra += bgraStride;
}
}

// Selects the conversion standard at run time and forwards to the matching template
// instantiation. An unknown 'yuvType' triggers assert(0) and performs no conversion.
template <bool align> void Yuv422pToBgraV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* bgra, size_t bgraStride, uint8_t alpha, SimdYuvType yuvType)
{
    if (yuvType == SimdYuvBt601)
        Yuv422pToBgraV2<align, Base::Bt601>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha);
    else if (yuvType == SimdYuvBt709)
        Yuv422pToBgraV2<align, Base::Bt709>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha);
    else if (yuvType == SimdYuvBt2020)
        Yuv422pToBgraV2<align, Base::Bt2020>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha);
    else if (yuvType == SimdYuvTrect871)
        Yuv422pToBgraV2<align, Base::Trect871>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha);
    else
        assert(0);
}

// Public NEON entry point: picks the aligned code path only when every plane pointer
// and every stride is aligned, otherwise falls back to the unaligned one.
void Yuv422pToBgraV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* bgra, size_t bgraStride, uint8_t alpha, SimdYuvType yuvType)
{
    const bool allAligned =
        Aligned(y) && Aligned(yStride) &&
        Aligned(u) && Aligned(uStride) &&
        Aligned(v) && Aligned(vStride) &&
        Aligned(bgra) && Aligned(bgraStride);

    if (allAligned)
    {
        Yuv422pToBgraV2<true>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
    }
    else
    {
        Yuv422pToBgraV2<false>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
    }
}

//-------------------------------------------------------------------------------------------------

template <bool align, class T> void Yuv420pToBgraV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
{
Expand Down
18 changes: 9 additions & 9 deletions src/Test/TestYuvToBgra.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -486,15 +486,15 @@ namespace Test
result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv422pToBgraV2), FUNC_YUV2(SimdYuv422pToBgraV2), 2, 1);
#endif

//#ifdef SIMD_AVX512BW_ENABLE
// if (Simd::Avx512bw::Enable)
// result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv422pToBgraV2), FUNC_YUV2(SimdYuv422pToBgraV2), 2, 1);
//#endif
//
//#ifdef SIMD_NEON_ENABLE
// if (Simd::Neon::Enable)
// result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Neon::Yuv422pToBgraV2), FUNC_YUV2(SimdYuv422pToBgraV2), 2, 1);
//#endif
#ifdef SIMD_AVX512BW_ENABLE
if (Simd::Avx512bw::Enable)
result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv422pToBgraV2), FUNC_YUV2(SimdYuv422pToBgraV2), 2, 1);
#endif

#ifdef SIMD_NEON_ENABLE
if (Simd::Neon::Enable)
result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Neon::Yuv422pToBgraV2), FUNC_YUV2(SimdYuv422pToBgraV2), 2, 1);
#endif

return result;
}
Expand Down

0 comments on commit 3dd6aab

Please sign in to comment.