diff --git a/src/Simd/SimdBaseSynetConvolution16bNchwGemm.cpp b/src/Simd/SimdBaseSynetConvolution16bNchwGemm.cpp
index 347c4fdeee..64f7875a5e 100644
--- a/src/Simd/SimdBaseSynetConvolution16bNchwGemm.cpp
+++ b/src/Simd/SimdBaseSynetConvolution16bNchwGemm.cpp
@@ -98,27 +98,23 @@ namespace Simd
     {
         const ConvParam& p = _param;
         const AlgParam& a = _alg;
-        //size_t D = DivHi(p.dstC, _alg.F);
-        //_weight.Resize(a.bufK * a.bufD, true);
-        //uint16_t* dst = _weight.data;
-        //for (size_t d = 0; d < D; d++)
-        //{
-        //    for (size_t k = 0; k < a.bufK; k += 2)
-        //    {
-        //        const float* src = weight + k * p.dstC + d * _alg.F;
-        //        for (size_t f = 0; f < _alg.F; ++f)
-        //        {
-        //            for (size_t i = 0; i < 2; ++i)
-        //            {
-        //                if (d * _alg.F + f < p.dstC && k + i < a.K)
-        //                    *(dst++) = Float32ToBFloat16(src[i * p.dstC]);
-        //                else
-        //                    *(dst++) = 0;
-        //            }
-        //            src++;
-        //        }
-        //    }
-        //}
+        _weight.Resize(a.bufK * a.bufD, true);
+        uint16_t* dst = _weight.data;
+        for (size_t mak = 0; mak < a.bufK; mak += a.macroK)
+        {
+            size_t macroK = Simd::Min(a.bufK, mak + a.macroK) - mak;
+            for (size_t d = 0; d < a.bufD; d += 1)
+            {
+                const float* src = weight + d * p.dstC + mak;
+                for (size_t k = 0; k < macroK; k += 1)
+                {
+                    if (d < p.dstC && mak + k < a.K)
+                        *(dst++) = Float32ToBFloat16(src[k]);
+                    else
+                        *(dst++) = 0;
+                }
+            }
+        }
     }
 
     void SynetConvolution16bNchwGemm::Forward(const uint8_t* src, uint8_t* buf8, uint8_t* dst)
@@ -126,7 +122,7 @@ namespace Simd
         const ConvParam& p = _param;
         const AlgParam& a = _alg;
         buf8 = Buffer(buf8);
-        uint16_t* bufB = _convert ? Allocate<uint16_t>(buf8, a.bufN * a.bufK) : NULL;
+        uint16_t* bufB = Allocate<uint16_t>(buf8, a.bufN * a.bufK);
         float* bufS = a.sumBuf ? Allocate<float>(buf8, a.macroD * a.bufN) : NULL;
         for (size_t b = 0; b < p.batch; b += 1)
         {
@@ -143,48 +139,33 @@ namespace Simd
         const ConvParam& p = _param;
         const AlgParam& a = _alg;
         const float* bias = _bias.data, * params = _params.data;
-
-        //for (size_t dc = 0; dc < p.dstC; dc += a.macroD)
-        //{
-        //    size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc;
-        //    const uint16_t* weight = _weight.data + dc * a.bufK;
-        //    for (size_t mak = 0; mak < a.K; mak += a.macroK)
-        //    {
-        //        size_t macroK = Simd::Min(a.bufK, mak + a.macroK) - mak;
-        //        for (size_t yBeg = 0; yBeg < dstH;)
-        //        {
-        //            size_t yEnd = Simd::Min(yBeg + a.macroH, dstH);
-        //            size_t bufOffs = (a.macroK < a.bufK || _convert == NULL) ?
-        //                yBeg * (_convert ? AlignHi(p.dstW, a.F) : p.dstW) * a.bufK + (a.reorderType ? mak * a.F : mak) : 0;
-        //            size_t sumOffs = a.macroK < a.bufK ? yBeg * p.dstW * a.macroD : 0;
-        //            size_t dstOffs = yBeg * p.dstW * p.dstC * _elemD;
-        //            if (dc == 0 && mak == 0 && _convert)
-        //            {
-        //                if (a.batch > 1)
-        //                {
-        //                    size_t dS = p.srcH * p.srcW * p.srcC * _elemS;
-        //                    size_t dB = p.dstH * p.dstW * a.bufK;
-        //                    for (size_t b = 0; b < a.batch; ++b)
-        //                        _convert(src + b * dS, p, a, 0, p.dstH, buf + b * dB);
-        //                }
-        //                else
-        //                    _convert(src, p, a, yBeg, yEnd, buf + bufOffs);
-        //            }
-        //            if (mak + macroK == a.bufK)
-        //                _convolutions[1](buf + bufOffs, p, a, macroD, yEnd - yBeg, macroK, macroK == a.bufK ? 1 : 0,
-        //                    weight, bias, params, sum + sumOffs, dst + dstOffs);
-        //            else
-        //                _convolutions[0](buf + bufOffs, p, a, macroD, yEnd - yBeg, macroK, mak == 0 ? 1 : 0,
-        //                    weight, bias, params, sum + sumOffs, dst + dstOffs);
-        //            yBeg = yEnd;
-        //        }
-        //        weight += macroK * a.F;
-        //    }
-        //    bias += macroD;
-        //    if (p.activation == ::SimdConvolutionActivationPrelu)
-        //        params += macroD;
-        //    dst += macroD * _elemD;
-        //}
+        for (size_t yBeg = 0; yBeg < p.dstH;)
+        {
+            size_t yEnd = Simd::Min(yBeg + a.macroH, p.dstH);
+            if(!_is1x1)
+                _convert(src, p, a, yBeg, yEnd, 0, p.srcC, buf);
+            for (size_t mak = 0; mak < a.K; mak += a.macroK)
+            {
+                size_t macroK = Simd::Min(a.bufK, mak + a.macroK) - mak;
+                if (_is1x1)
+                    _convert(src, p, a, yBeg, yEnd, mak, mak + macroK, buf);
+                size_t bufOffs = _is1x1 ? mak * a.F : 0;
+                for (size_t dc = 0; dc < p.dstC; dc += a.macroD)
+                {
+                    size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc;
+                    size_t sumOffs = a.macroK < a.bufK ? yBeg * p.dstW * a.macroD : 0;
+                    size_t dstOffs = (dc * p.dstH + yBeg) * p.dstW * _elemD;
+                    const uint16_t* weight = _weight.data + a.bufD * mak + dc * macroK;
+                    if (mak + macroK == a.bufK)
+                        _convolutions[1](weight, p, a, macroD, yEnd - yBeg, macroK, macroK == a.bufK ? 1 : 0,
+                            buf + bufOffs, bias, params, sum + sumOffs, dst + dstOffs);
+                    else
+                        _convolutions[0](weight, p, a, macroD, yEnd - yBeg, macroK, mak == 0 ? 1 : 0,
+                            buf + bufOffs, bias, params, sum + sumOffs, dst + dstOffs);
+                }
+            }
+            yBeg = yEnd;
+        }
     }
 
     bool SynetConvolution16bNchwGemm::Preferable(const ConvParam& p)
diff --git a/src/Simd/SimdSynetConvolution16b.h b/src/Simd/SimdSynetConvolution16b.h
index 978c3fde4b..2bf17ccd2f 100644
--- a/src/Simd/SimdSynetConvolution16b.h
+++ b/src/Simd/SimdSynetConvolution16b.h
@@ -221,7 +221,7 @@ namespace Simd
             int reorderType, sumBuf;
         };
 
-        typedef void(*ConvertPtr)(const uint8_t* src, const ConvParam& p, const AlgParam& a, size_t yBeg, size_t yEnd, uint16_t* dst);
+        typedef void(*ConvertPtr)(const uint8_t* src, const ConvParam& p, const AlgParam& a, size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst);
 
         typedef void(*ConvolutionPtr)(const uint16_t* weight, const ConvParam& p, const AlgParam& a, size_t dstC, size_t dstH, size_t srcC, int zero,
             const uint16_t* src, const float* bias, const float* params, float* sum, uint8_t* dst);
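
Note (not part of the patch): the ConvertPtr change in SimdSynetConvolution16b.h widens the converter contract with a channel sub-range [cBeg, cEnd) in addition to the row range [yBeg, yEnd); this is what lets the 1x1 path in Forward convert only the K-slice consumed by the current macroK step instead of the whole buffer. Below is a minimal standalone C++ sketch of that contract; ConvParamStub, AlgParamStub and ConvertRangeStub are illustrative stand-ins, not Simd's real types or converters.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Illustrative stand-ins; the real ConvParam/AlgParam live in SimdSynetConvolution16b.h.
    struct ConvParamStub { size_t srcC; };
    struct AlgParamStub { size_t F; };

    // Same shape as the updated ConvertPtr: a channel range [cBeg, cEnd) was added so that
    // conversion can be restricted to the slice used by the current macroK step.
    typedef void(*ConvertStubPtr)(const uint8_t* src, const ConvParamStub& p, const AlgParamStub& a,
        size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst);

    // Toy converter: treats src as float rows of p.srcC channels and keeps only the high
    // 16 bits of each value (bfloat16 by truncation; illustration only).
    static void ConvertRangeStub(const uint8_t* src, const ConvParamStub& p, const AlgParamStub& a,
        size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst)
    {
        (void)a;
        const float* rows = (const float*)src;
        for (size_t y = yBeg; y < yEnd; ++y)
        {
            for (size_t c = cBeg; c < cEnd; ++c)
            {
                uint32_t bits;
                std::memcpy(&bits, rows + y * p.srcC + c, sizeof(bits));
                *(dst++) = uint16_t(bits >> 16);
            }
        }
    }

Assigning ConvertRangeStub to a ConvertStubPtr and calling it once per macroK block mirrors the _is1x1 branch added to Forward above.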