diff --git a/src/Simd/SimdAvx2SynetConvolution16bNchwGemm.cpp b/src/Simd/SimdAvx2SynetConvolution16bNchwGemm.cpp index 6d18efe70c..afa6454f31 100644 --- a/src/Simd/SimdAvx2SynetConvolution16bNchwGemm.cpp +++ b/src/Simd/SimdAvx2SynetConvolution16bNchwGemm.cpp @@ -97,7 +97,7 @@ namespace Simd } for (; k < K; k += 2) { - const float* src0 = src + k * dS, * src1 = src0 + dS; + const float* src0 = src + k * dS; for (f = 0; f < tail; ++f) { *dst++ = Base::Float32ToBFloat16(src0[f]); @@ -120,6 +120,89 @@ namespace Simd } } + SIMD_INLINE void ReorderF(const uint16_t* src, size_t stride, uint16_t*& dst) + { + __m128i src0 = _mm_loadu_si128((__m128i*)src); + __m128i src1 = _mm_loadu_si128((__m128i*)(src + stride)); + _mm_storeu_si128((__m128i*)dst + 0, _mm_unpacklo_epi16(src0, src1)); + _mm_storeu_si128((__m128i*)dst + 1, _mm_unpackhi_epi16(src0, src1)); + dst += DF; + } + + static void Reorder16bNchwGemm1x1(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst) + { + const uint16_t* src = ((uint16_t*)src8) + (cBeg * p.srcH + yBeg) * p.srcW; + size_t N = (yEnd - yBeg) * p.srcW, NF = AlignLo(N, a.F), j = 0, dS = p.srcH * p.srcW; + size_t K = Min(cEnd, a.K) - cBeg, K2 = AlignLo(K, 2), KH = AlignHi(K, a.microK), k; + for (; j < NF; j += a.F) + { + for (k = 0; k < K2; k += 2) + { + const uint16_t* src0 = src + k * dS; + for (size_t f = 0; f < a.F; f += F) + ReorderF(src0 + f, dS, dst); + } + for (; k < K; k += 2) + { + const uint16_t* src0 = src + k * dS; + for (size_t f = 0; f < a.F; ++f) + { + *dst++ = src0[f]; + *dst++ = 0; + } + } + for (; k < KH; k += 2) + { + for (size_t f = 0; f < a.F; ++f) + { + *dst++ = 0; + *dst++ = 0; + } + } + src += a.F; + } + if (j < N) + { + size_t tail = N - j, f; + for (k = 0; k < K2; k += 2) + { + const uint16_t* src0 = src + k * dS, * src1 = src0 + dS; + for (f = 0; f < tail; ++f) + { + *dst++ = src0[f]; + *dst++ = src1[f]; + } + for (; f < a.F; ++f) + { + *dst++ = 0; + *dst++ = 0; + } + } + for (; k < K; k += 2) + { + const uint16_t* src0 = src + k * dS; + for (f = 0; f < tail; ++f) + { + *dst++ = src0[f]; + *dst++ = 0; + } + for (; f < a.F; ++f) + { + *dst++ = 0; + *dst++ = 0; + } + } + for (; k < KH; k += 2) + { + for (size_t f = 0; f < a.F; ++f) + { + *dst++ = 0; + *dst++ = 0; + } + } + } + } + //----------------------------------------------------------------------------------------- template void Convolution16bNchwGemm_2xM(const uint16_t* weight0, const ConvParam& p, const AlgParam& a, @@ -362,8 +445,8 @@ namespace Simd SetAlgParam(F, F * 2, 5, 2, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3()); if (_src16b) { - //if (_is1x1) - // _convert = Reorder16bNchwGemm1x1; + if (_is1x1) + _convert = Reorder16bNchwGemm1x1; //else // _convert = Reorder16bNhwcGemm; } diff --git a/src/Simd/SimdBaseSynetConvolution16bNchwGemm.cpp b/src/Simd/SimdBaseSynetConvolution16bNchwGemm.cpp index eed6efac2e..9e346f5997 100644 --- a/src/Simd/SimdBaseSynetConvolution16bNchwGemm.cpp +++ b/src/Simd/SimdBaseSynetConvolution16bNchwGemm.cpp @@ -93,7 +93,7 @@ namespace Simd } for (; k < K; k += 2) { - const float* src0 = src + k * dS, * src1 = src0 + dS; + const float* src0 = src + k * dS; for (f = 0; f < tail; ++f) { *dst++ = Float32ToBFloat16(src0[f]); @@ -134,7 +134,7 @@ namespace Simd } for (; k < K; k += 2) { - const uint16_t* src0 = src + k * dS, * src1 = src0 + dS; + const uint16_t* src0 = src + k * dS; for (size_t f = 0; f < a.F; ++f) { *dst++ = src0[f]; @@ -170,7 +170,7 @@ namespace Simd } for (; k < K; k += 2) { - const uint16_t* src0 = src + k * dS, * src1 = src0 + dS; + const uint16_t* src0 = src + k * dS; for (f = 0; f < tail; ++f) { *dst++ = src0[f]; diff --git a/src/Simd/SimdSse41SynetConvolution16bNchwGemm.cpp b/src/Simd/SimdSse41SynetConvolution16bNchwGemm.cpp index aa57545ffe..0003a57b64 100644 --- a/src/Simd/SimdSse41SynetConvolution16bNchwGemm.cpp +++ b/src/Simd/SimdSse41SynetConvolution16bNchwGemm.cpp @@ -61,7 +61,7 @@ namespace Simd } for (; k < K; k += 2) { - const float* src0 = src + k * dS, * src1 = src0 + dS; + const float* src0 = src + k * dS; for (size_t f = 0; f < a.F; ++f) { *dst++ = Base::Float32ToBFloat16(src0[f]); @@ -97,7 +97,7 @@ namespace Simd } for (; k < K; k += 2) { - const float* src0 = src + k * dS, * src1 = src0 + dS; + const float* src0 = src + k * dS; for (f = 0; f < tail; ++f) { *dst++ = Base::Float32ToBFloat16(src0[f]); @@ -120,6 +120,15 @@ namespace Simd } } + SIMD_INLINE void ReorderDF(const uint16_t* src, size_t stride, uint16_t*& dst0, uint16_t*& dst1) + { + __m128i src0 = _mm_loadu_si128((__m128i*)src); + __m128i src1 = _mm_loadu_si128((__m128i*)(src + stride)); + _mm_storeu_si128((__m128i*)dst0, _mm_unpacklo_epi16(src0, src1)); + _mm_storeu_si128((__m128i*)dst1, _mm_unpackhi_epi16(src0, src1)); + dst0 += DF, dst1 += DF; + } + SIMD_INLINE void ReorderF(const uint16_t* src, size_t stride, uint16_t*& dst) { __m128i src0 = _mm_loadl_epi64((__m128i*)src); @@ -131,9 +140,46 @@ namespace Simd static void Reorder16bNchwGemm1x1(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst) { const uint16_t* src = ((uint16_t*)src8) + (cBeg * p.srcH + yBeg) * p.srcW; - size_t N = (yEnd - yBeg) * p.srcW, NF = AlignLo(N, a.F), j, dS = p.srcH * p.srcW; + size_t N = (yEnd - yBeg) * p.srcW, NF = AlignLo(N, a.F), N2F = AlignLo(N, a.F * 2), j = 0, dS = p.srcH * p.srcW; size_t K = Min(cEnd, a.K) - cBeg, K2 = AlignLo(K, 2), KH = AlignHi(K, a.microK), k; - for (j = 0; j < NF; j += a.F) + if (a.F == F) + { + for (j = 0; j < N2F; j += a.F * 2) + { + uint16_t* dst0 = dst + 0 * KH * a.F; + uint16_t* dst1 = dst + 1 * KH * a.F; + for (k = 0; k < K2; k += 2) + { + const uint16_t* src0 = src + k * dS; + for (size_t f = 0; f < a.F; f += DF) + ReorderDF(src0 + f, dS, dst0, dst1); + } + for (; k < K; k += 2) + { + const uint16_t* src0 = src + k * dS; + for (size_t f = 0; f < a.F; ++f) + { + *dst0++ = src0[f]; + *dst0++ = 0; + *dst1++ = src0[f + a.F]; + *dst1++ = 0; + } + } + for (; k < KH; k += 2) + { + for (size_t f = 0; f < a.F; ++f) + { + *dst0++ = 0; + *dst0++ = 0; + *dst1++ = 0; + *dst1++ = 0; + } + } + src += a.F * 2; + dst += KH * a.F * 2; + } + } + for (; j < NF; j += a.F) { for (k = 0; k < K2; k += 2) { @@ -143,7 +189,7 @@ namespace Simd } for (; k < K; k += 2) { - const uint16_t* src0 = src + k * dS, * src1 = src0 + dS; + const uint16_t* src0 = src + k * dS; for (size_t f = 0; f < a.F; ++f) { *dst++ = src0[f]; @@ -179,7 +225,7 @@ namespace Simd } for (; k < K; k += 2) { - const uint16_t* src0 = src + k * dS, * src1 = src0 + dS; + const uint16_t* src0 = src + k * dS; for (f = 0; f < tail; ++f) { *dst++ = src0[f]; diff --git a/src/Test/TestSynetConvolution16b.cpp b/src/Test/TestSynetConvolution16b.cpp index 99a8925112..ffcd3eb1d8 100644 --- a/src/Test/TestSynetConvolution16b.cpp +++ b/src/Test/TestSynetConvolution16b.cpp @@ -270,7 +270,7 @@ namespace Test #endif #else - result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 2156, 4, 4, 4, _1, _1, _1, _0, _0, 1, aId, tF, f32, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 2156, 4, 4, 4, _1, _1, _1, _0, _0, 1, aId, tF, b16, b16), c, f1, f2); //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 88, 88, 128, _3, _1, _2, _1, _1, 1, aRe, tT, f32, b16), c, f1, f2); //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 88, 88, 128, _3, _1, _2, _1, _1, 1, aRe, tT, f32, b16), c, f1, f2); //result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 88, 88, 64, _3, _1, _1, _1, _1, 1, aSw, tT, b16, f32), c, f1, f2);