Skip to content

Commit

Permalink
*improve function Reorder16bNchwGemm1x1.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Jul 24, 2024
1 parent 58dc21d commit 044b8e1
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 13 deletions.
89 changes: 86 additions & 3 deletions src/Simd/SimdAvx2SynetConvolution16bNchwGemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ namespace Simd
}
for (; k < K; k += 2)
{
const float* src0 = src + k * dS, * src1 = src0 + dS;
const float* src0 = src + k * dS;
for (f = 0; f < tail; ++f)
{
*dst++ = Base::Float32ToBFloat16(src0[f]);
Expand All @@ -120,6 +120,89 @@ namespace Simd
}
}

// Interleaves eight 16-bit values taken from two source rows.
//   src    - first row; the second row starts at src + stride.
//   stride - distance between the two rows, in uint16_t elements.
//   dst    - output cursor; receives 16 values in the order
//            src[0], src[stride + 0], src[1], src[stride + 1], ...
//            and is then advanced by DF elements.
// NOTE(review): presumably DF == 16 here, so dst ends right after the data
// just written - confirm against the namespace constants.
SIMD_INLINE void ReorderF(const uint16_t* src, size_t stride, uint16_t*& dst)
{
    const __m128i row0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    const __m128i row1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + stride));
    __m128i* const out = reinterpret_cast<__m128i*>(dst);
    // Low half of the pairs goes first, high half right behind it.
    _mm_storeu_si128(out, _mm_unpacklo_epi16(row0, row1));
    _mm_storeu_si128(out + 1, _mm_unpackhi_epi16(row0, row1));
    dst += DF;
}

static void Reorder16bNchwGemm1x1(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst)
{
const uint16_t* src = ((uint16_t*)src8) + (cBeg * p.srcH + yBeg) * p.srcW;
size_t N = (yEnd - yBeg) * p.srcW, NF = AlignLo(N, a.F), j = 0, dS = p.srcH * p.srcW;
size_t K = Min(cEnd, a.K) - cBeg, K2 = AlignLo(K, 2), KH = AlignHi(K, a.microK), k;
for (; j < NF; j += a.F)
{
for (k = 0; k < K2; k += 2)
{
const uint16_t* src0 = src + k * dS;
for (size_t f = 0; f < a.F; f += F)
ReorderF(src0 + f, dS, dst);
}
for (; k < K; k += 2)
{
const uint16_t* src0 = src + k * dS;
for (size_t f = 0; f < a.F; ++f)
{
*dst++ = src0[f];
*dst++ = 0;
}
}
for (; k < KH; k += 2)
{
for (size_t f = 0; f < a.F; ++f)
{
*dst++ = 0;
*dst++ = 0;
}
}
src += a.F;
}
if (j < N)
{
size_t tail = N - j, f;
for (k = 0; k < K2; k += 2)
{
const uint16_t* src0 = src + k * dS, * src1 = src0 + dS;
for (f = 0; f < tail; ++f)
{
*dst++ = src0[f];
*dst++ = src1[f];
}
for (; f < a.F; ++f)
{
*dst++ = 0;
*dst++ = 0;
}
}
for (; k < K; k += 2)
{
const uint16_t* src0 = src + k * dS;
for (f = 0; f < tail; ++f)
{
*dst++ = src0[f];
*dst++ = 0;
}
for (; f < a.F; ++f)
{
*dst++ = 0;
*dst++ = 0;
}
}
for (; k < KH; k += 2)
{
for (size_t f = 0; f < a.F; ++f)
{
*dst++ = 0;
*dst++ = 0;
}
}
}
}

//-----------------------------------------------------------------------------------------

template<Term16bType term, SimdConvolutionActivationType type, int M> void Convolution16bNchwGemm_2xM(const uint16_t* weight0, const ConvParam& p, const AlgParam& a,
Expand Down Expand Up @@ -362,8 +445,8 @@ namespace Simd
SetAlgParam(F, F * 2, 5, 2, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3());
if (_src16b)
{
//if (_is1x1)
// _convert = Reorder16bNchwGemm1x1;
if (_is1x1)
_convert = Reorder16bNchwGemm1x1;
//else
// _convert = Reorder16bNhwcGemm;
}
Expand Down
6 changes: 3 additions & 3 deletions src/Simd/SimdBaseSynetConvolution16bNchwGemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ namespace Simd
}
for (; k < K; k += 2)
{
const float* src0 = src + k * dS, * src1 = src0 + dS;
const float* src0 = src + k * dS;
for (f = 0; f < tail; ++f)
{
*dst++ = Float32ToBFloat16(src0[f]);
Expand Down Expand Up @@ -134,7 +134,7 @@ namespace Simd
}
for (; k < K; k += 2)
{
const uint16_t* src0 = src + k * dS, * src1 = src0 + dS;
const uint16_t* src0 = src + k * dS;
for (size_t f = 0; f < a.F; ++f)
{
*dst++ = src0[f];
Expand Down Expand Up @@ -170,7 +170,7 @@ namespace Simd
}
for (; k < K; k += 2)
{
const uint16_t* src0 = src + k * dS, * src1 = src0 + dS;
const uint16_t* src0 = src + k * dS;
for (f = 0; f < tail; ++f)
{
*dst++ = src0[f];
Expand Down
58 changes: 52 additions & 6 deletions src/Simd/SimdSse41SynetConvolution16bNchwGemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ namespace Simd
}
for (; k < K; k += 2)
{
const float* src0 = src + k * dS, * src1 = src0 + dS;
const float* src0 = src + k * dS;
for (size_t f = 0; f < a.F; ++f)
{
*dst++ = Base::Float32ToBFloat16(src0[f]);
Expand Down Expand Up @@ -97,7 +97,7 @@ namespace Simd
}
for (; k < K; k += 2)
{
const float* src0 = src + k * dS, * src1 = src0 + dS;
const float* src0 = src + k * dS;
for (f = 0; f < tail; ++f)
{
*dst++ = Base::Float32ToBFloat16(src0[f]);
Expand All @@ -120,6 +120,15 @@ namespace Simd
}
}

// Interleaves eight 16-bit values from two source rows and splits the result
// between two output streams.
//   src    - first row; the second row starts at src + stride.
//   stride - distance between the two rows, in uint16_t elements.
//   dst0   - receives the pairs built from elements 0..3 of both rows
//            (src[0], src[stride + 0], ..., src[3], src[stride + 3]).
//   dst1   - receives the pairs built from elements 4..7 of both rows.
// Both output cursors are advanced by DF elements afterwards.
SIMD_INLINE void ReorderDF(const uint16_t* src, size_t stride, uint16_t*& dst0, uint16_t*& dst1)
{
    const __m128i row0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    const __m128i row1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + stride));
    const __m128i lowPairs = _mm_unpacklo_epi16(row0, row1);
    const __m128i highPairs = _mm_unpackhi_epi16(row0, row1);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst0), lowPairs);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst1), highPairs);
    dst0 += DF;
    dst1 += DF;
}

SIMD_INLINE void ReorderF(const uint16_t* src, size_t stride, uint16_t*& dst)
{
__m128i src0 = _mm_loadl_epi64((__m128i*)src);
Expand All @@ -131,9 +140,46 @@ namespace Simd
static void Reorder16bNchwGemm1x1(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst)
{
const uint16_t* src = ((uint16_t*)src8) + (cBeg * p.srcH + yBeg) * p.srcW;
size_t N = (yEnd - yBeg) * p.srcW, NF = AlignLo(N, a.F), j, dS = p.srcH * p.srcW;
size_t N = (yEnd - yBeg) * p.srcW, NF = AlignLo(N, a.F), N2F = AlignLo(N, a.F * 2), j = 0, dS = p.srcH * p.srcW;
size_t K = Min(cEnd, a.K) - cBeg, K2 = AlignLo(K, 2), KH = AlignHi(K, a.microK), k;
for (j = 0; j < NF; j += a.F)
if (a.F == F)
{
for (j = 0; j < N2F; j += a.F * 2)
{
uint16_t* dst0 = dst + 0 * KH * a.F;
uint16_t* dst1 = dst + 1 * KH * a.F;
for (k = 0; k < K2; k += 2)
{
const uint16_t* src0 = src + k * dS;
for (size_t f = 0; f < a.F; f += DF)
ReorderDF(src0 + f, dS, dst0, dst1);
}
for (; k < K; k += 2)
{
const uint16_t* src0 = src + k * dS;
for (size_t f = 0; f < a.F; ++f)
{
*dst0++ = src0[f];
*dst0++ = 0;
*dst1++ = src0[f + a.F];
*dst1++ = 0;
}
}
for (; k < KH; k += 2)
{
for (size_t f = 0; f < a.F; ++f)
{
*dst0++ = 0;
*dst0++ = 0;
*dst1++ = 0;
*dst1++ = 0;
}
}
src += a.F * 2;
dst += KH * a.F * 2;
}
}
for (; j < NF; j += a.F)
{
for (k = 0; k < K2; k += 2)
{
Expand All @@ -143,7 +189,7 @@ namespace Simd
}
for (; k < K; k += 2)
{
const uint16_t* src0 = src + k * dS, * src1 = src0 + dS;
const uint16_t* src0 = src + k * dS;
for (size_t f = 0; f < a.F; ++f)
{
*dst++ = src0[f];
Expand Down Expand Up @@ -179,7 +225,7 @@ namespace Simd
}
for (; k < K; k += 2)
{
const uint16_t* src0 = src + k * dS, * src1 = src0 + dS;
const uint16_t* src0 = src + k * dS;
for (f = 0; f < tail; ++f)
{
*dst++ = src0[f];
Expand Down
2 changes: 1 addition & 1 deletion src/Test/TestSynetConvolution16b.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ namespace Test
#endif

#else
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 2156, 4, 4, 4, _1, _1, _1, _0, _0, 1, aId, tF, f32, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 2156, 4, 4, 4, _1, _1, _1, _0, _0, 1, aId, tF, b16, b16), c, f1, f2);
//result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 88, 88, 128, _3, _1, _2, _1, _1, 1, aRe, tT, f32, b16), c, f1, f2);
//result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 88, 88, 128, _3, _1, _2, _1, _1, 1, aRe, tT, f32, b16), c, f1, f2);
//result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 64, 88, 88, 64, _3, _1, _1, _1, _1, 1, aSw, tT, b16, f32), c, f1, f2);
Expand Down

0 comments on commit 044b8e1

Please sign in to comment.