From bda5b582f1a5dee9cc909fa0449570e20008b65c Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Wed, 2 Oct 2024 19:14:32 +0300 Subject: [PATCH] *fix bug in AVX-512BW optimizations of Convolution32fNhwcDepthwiseDefault. --- ...Bf16SynetMergedConvolution16bDepthwise.cpp | 254 ++++++++---------- ...x512bwSynetConvolution32fNhwcDepthwise.cpp | 20 +- src/Test/TestSynetConvolution16b.cpp | 6 +- src/Test/TestSynetConvolution32f.cpp | 3 +- src/Test/TestSynetMergedConvolution16b.cpp | 8 +- 5 files changed, 142 insertions(+), 149 deletions(-) diff --git a/src/Simd/SimdAmxBf16SynetMergedConvolution16bDepthwise.cpp b/src/Simd/SimdAmxBf16SynetMergedConvolution16bDepthwise.cpp index 9b43af586e..a81393c514 100644 --- a/src/Simd/SimdAmxBf16SynetMergedConvolution16bDepthwise.cpp +++ b/src/Simd/SimdAmxBf16SynetMergedConvolution16bDepthwise.cpp @@ -38,24 +38,25 @@ namespace Simd //------------------------------------------------------------------------------------------------- - template SIMD_INLINE __m512 LoadSrc(const T* src); + template SIMD_INLINE __m512 LoadSrc(const T* src, __mmask16 mask = -1); - template <> SIMD_INLINE __m512 LoadSrc(const float* src) + template <> SIMD_INLINE __m512 LoadSrc(const float* src, __mmask16 mask) { - return _mm512_loadu_ps(src); + return _mm512_maskz_loadu_ps(mask, src); } - template <> SIMD_INLINE __m512 LoadSrc(const uint16_t* src) + template <> SIMD_INLINE __m512 LoadSrc(const uint16_t* src, __mmask16 mask) { - return BFloat16ToFloat32(_mm256_loadu_si256((__m256i*)src)); + return BFloat16ToFloat32(_mm256_maskz_loadu_epi16(mask, src)); } //------------------------------------------------------------------------------------------------- - template void DepthwiseConvolution(const uint8_t* src8, const ConvParam& p, const AlgParam& a, + template void DepthwiseConvolutionDefault(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t maC, size_t yBeg, size_t yEnd, const float* weight, const float* bias, const float* params, uint8_t* dst) { const T* src = (T*)src8; + size_t srcH = p.srcH, srcW = p.srcW, kernelX = p.kernelX, kernelY = p.kernelY; size_t strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; size_t sM = (a.bufH[1] - 1), sD = a.bufH[1] ? a.bufH[1] * p.srcW * F : F, sX = a.bufH[1] ? F : p.srcC, sY = sX * p.srcW, dstC = maC; size_t dX = (a.bufH[2] ? a.maC * 2 : p.dstC * a.elem[1]), dY = p.dstW * dX, dy0 = a.bufH[2] ? yBeg : 0, dD = a.bufH[2] ? F * 2 : F * a.elem[1]; @@ -87,17 +88,17 @@ namespace Simd for (size_t dx = 0; dx < p.dstW; ++dx, pd += dX) { __m512 sum = _mm512_setzero_ps(); - for (size_t ky = 0; ky < p.kernelY; ++ky) + for (size_t ky = 0; ky < kernelY; ++ky) { size_t sy = dy * strideY + ky - padY; - if (sy < p.srcH) + if (sy < srcH) { - for (size_t kx = 0; kx < p.kernelX; ++kx) + for (size_t kx = 0; kx < kernelX; ++kx) { size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) + if (sx < srcW) { - const float* pw = weight + (ky * p.kernelX + kx) * F; + const float* pw = weight + (ky * kernelX + kx) * F; const T* ps = src + (sy & sM) * sY + sx * sX; sum = Fmadd(LoadSrc(ps), _mm512_loadu_ps(pw), sum); } @@ -114,44 +115,47 @@ namespace Simd for (size_t dy = yBeg; dy < yEnd; ++dy) { uint8_t* pd = dst + (dy - dy0) * dY; - if (dy >= noseY && dy < bodyY) + size_t dx = 0; + for (; dx < noseX; dx += 1, pd += dX) { - size_t dx = 0; - for (; dx < noseX; dx += 1, pd += dX) + __m512 sum = _mm512_setzero_ps(); + for (size_t ky = 0; ky < kernelY; ++ky) { - __m512 sum = _mm512_setzero_ps(); - for (size_t ky = 0; ky < p.kernelY; ++ky) + size_t sy = dy * strideY + ky - padY; + if (sy < srcH) { - size_t sy = dy * p.strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) + for (size_t kx = 0; kx < kernelX; ++kx) { - size_t sx = dx * p.strideX + kx - padX; - if (sx < p.srcW) + size_t sx = dx * strideX + kx - padX; + if (sx < srcW) { - const float* pw = weight + (ky * p.kernelX + kx) * F; + const float* pw = weight + (ky * kernelX + kx) * F; const T* ps = src + (sy & sM) * sY + sx * sX; sum = Fmadd(LoadSrc(ps), _mm512_loadu_ps(pw), sum); } } } - Save1(pd, NULL, sum, _bias, _params); } - for (; dx < bodyX8; dx += 8, pd += 8 * dX) + Save1(pd, NULL, sum, _bias, _params); + } + for (; dx < bodyX8; dx += 8, pd += 8 * dX) + { + __m512 sum0 = _mm512_setzero_ps(); + __m512 sum1 = _mm512_setzero_ps(); + __m512 sum2 = _mm512_setzero_ps(); + __m512 sum3 = _mm512_setzero_ps(); + __m512 sum4 = _mm512_setzero_ps(); + __m512 sum5 = _mm512_setzero_ps(); + __m512 sum6 = _mm512_setzero_ps(); + __m512 sum7 = _mm512_setzero_ps(); + const float* pw = weight; + for (size_t ky = 0; ky < kernelY; ++ky) { - __m512 sum0 = _mm512_setzero_ps(); - __m512 sum1 = _mm512_setzero_ps(); - __m512 sum2 = _mm512_setzero_ps(); - __m512 sum3 = _mm512_setzero_ps(); - __m512 sum4 = _mm512_setzero_ps(); - __m512 sum5 = _mm512_setzero_ps(); - __m512 sum6 = _mm512_setzero_ps(); - __m512 sum7 = _mm512_setzero_ps(); - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) + size_t sy = dy * strideY + ky - padY; + if (sy < srcH) { - size_t sy = dy * strideY + ky - padY; const T* ps = src + (sy & sM) * sY + (dx * strideX - padX) * sX; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += sX, pw += F) + for (size_t kx = 0; kx < kernelX; ++kx, ps += sX, pw += F) { __m512 w0 = _mm512_loadu_ps(pw); sum0 = Fmadd(LoadSrc(ps + 0 * ssX), w0, sum0); @@ -164,27 +168,32 @@ namespace Simd sum7 = Fmadd(LoadSrc(ps + 7 * ssX), w0, sum7); } } - Save1(pd + 0 * dX, NULL, sum0, _bias, _params); - Save1(pd + 1 * dX, NULL, sum1, _bias, _params); - Save1(pd + 2 * dX, NULL, sum2, _bias, _params); - Save1(pd + 3 * dX, NULL, sum3, _bias, _params); - Save1(pd + 4 * dX, NULL, sum4, _bias, _params); - Save1(pd + 5 * dX, NULL, sum5, _bias, _params); - Save1(pd + 6 * dX, NULL, sum6, _bias, _params); - Save1(pd + 7 * dX, NULL, sum7, _bias, _params); + else + pw += kernelX * F; } - for (; dx < bodyX4; dx += 4, pd += 4 * dX) + Save1(pd + 0 * dX, NULL, sum0, _bias, _params); + Save1(pd + 1 * dX, NULL, sum1, _bias, _params); + Save1(pd + 2 * dX, NULL, sum2, _bias, _params); + Save1(pd + 3 * dX, NULL, sum3, _bias, _params); + Save1(pd + 4 * dX, NULL, sum4, _bias, _params); + Save1(pd + 5 * dX, NULL, sum5, _bias, _params); + Save1(pd + 6 * dX, NULL, sum6, _bias, _params); + Save1(pd + 7 * dX, NULL, sum7, _bias, _params); + } + for (; dx < bodyX4; dx += 4, pd += 4 * dX) + { + __m512 sum0 = _mm512_setzero_ps(); + __m512 sum1 = _mm512_setzero_ps(); + __m512 sum2 = _mm512_setzero_ps(); + __m512 sum3 = _mm512_setzero_ps(); + const float* pw = weight; + for (size_t ky = 0; ky < p.kernelY; ++ky) { - __m512 sum0 = _mm512_setzero_ps(); - __m512 sum1 = _mm512_setzero_ps(); - __m512 sum2 = _mm512_setzero_ps(); - __m512 sum3 = _mm512_setzero_ps(); - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) + size_t sy = dy * strideY + ky - padY; + if (sy < srcH) { - size_t sy = dy * strideY + ky - padY; const T* ps = src + (sy & sM) * sY + (dx * strideX - padX) * sX; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += sX, pw += F) + for (size_t kx = 0; kx < kernelX; ++kx, ps += sX, pw += F) { __m512 w0 = _mm512_loadu_ps(pw); sum0 = Fmadd(LoadSrc(ps + 0 * ssX), w0, sum0); @@ -193,90 +202,80 @@ namespace Simd sum3 = Fmadd(LoadSrc(ps + 3 * ssX), w0, sum3); } } - Save1(pd + 0 * dX, NULL, sum0, _bias, _params); - Save1(pd + 1 * dX, NULL, sum1, _bias, _params); - Save1(pd + 2 * dX, NULL, sum2, _bias, _params); - Save1(pd + 3 * dX, NULL, sum3, _bias, _params); + else + pw += kernelX * F; } - for (; dx < bodyX2; dx += 2, pd += 2 * dX) + Save1(pd + 0 * dX, NULL, sum0, _bias, _params); + Save1(pd + 1 * dX, NULL, sum1, _bias, _params); + Save1(pd + 2 * dX, NULL, sum2, _bias, _params); + Save1(pd + 3 * dX, NULL, sum3, _bias, _params); + } + for (; dx < bodyX2; dx += 2, pd += 2 * dX) + { + __m512 sum0 = _mm512_setzero_ps(); + __m512 sum1 = _mm512_setzero_ps(); + const float* pw = weight; + for (size_t ky = 0; ky < p.kernelY; ++ky) { - __m512 sum0 = _mm512_setzero_ps(); - __m512 sum1 = _mm512_setzero_ps(); - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) + size_t sy = dy * strideY + ky - padY; + if (sy < srcH) { - size_t sy = dy * strideY + ky - padY; const T* ps = src + (sy & sM) * sY + (dx * strideX - padX) * sX; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += sX, pw += F) + for (size_t kx = 0; kx < kernelX; ++kx, ps += sX, pw += F) { __m512 w0 = _mm512_loadu_ps(pw); sum0 = Fmadd(LoadSrc(ps + 0 * ssX), w0, sum0); sum1 = Fmadd(LoadSrc(ps + 1 * ssX), w0, sum1); } } - Save1(pd + 0 * dX, NULL, sum0, _bias, _params); - Save1(pd + 1 * dX, NULL, sum1, _bias, _params); + else + pw += kernelX * F; } - for (; dx < bodyX; dx += 1, pd += dX) + Save1(pd + 0 * dX, NULL, sum0, _bias, _params); + Save1(pd + 1 * dX, NULL, sum1, _bias, _params); + } + for (; dx < bodyX; dx += 1, pd += dX) + { + __m512 sum = _mm512_setzero_ps(); + const float* pw = weight; + for (size_t ky = 0; ky < p.kernelY; ++ky) { - __m512 sum = _mm512_setzero_ps(); - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) + size_t sy = dy * strideY + ky - padY; + if (sy < srcH) { - size_t sy = dy * strideY + ky - padY; const T* ps = src + (sy & sM) * sY + (dx * strideX - padX) * sX; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += sX, pw += F) + for (size_t kx = 0; kx < kernelX; ++kx, ps += sX, pw += F) { __m512 w0 = _mm512_loadu_ps(pw); sum = Fmadd(LoadSrc(ps), w0, sum); } } - Save1(pd, NULL, sum, _bias, _params); + else + pw += kernelX * F; } - for (; dx < p.dstW; dx += 1, pd += dX) + Save1(pd, NULL, sum, _bias, _params); + } + for (; dx < p.dstW; dx += 1, pd += dX) + { + __m512 sum = _mm512_setzero_ps(); + for (size_t ky = 0; ky < p.kernelY; ++ky) { - __m512 sum = _mm512_setzero_ps(); - for (size_t ky = 0; ky < p.kernelY; ++ky) + size_t sy = dy * strideY + ky - padY; + if (sy < srcH) { - size_t sy = dy * strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) + for (size_t kx = 0; kx < kernelX; ++kx) { size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) + if (sx < srcW) { - const float* pw = weight + (ky * p.kernelX + kx) * F; + const float* pw = weight + (ky * kernelX + kx) * F; const T* ps = src + (sy & sM) * sY + sx * sX; sum = Fmadd(LoadSrc(ps), _mm512_loadu_ps(pw), sum); } } } - Save1(pd, NULL, sum, _bias, _params); - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx, pd += dX) - { - __m512 sum = _mm512_setzero_ps(); - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const T* ps = src + (sy & sM) * sY + sx * sX; - sum = Fmadd(LoadSrc(ps), _mm512_loadu_ps(pw), sum); - } - } - } - } - Save1(pd, NULL, sum, _bias, _params); } + Save1(pd, NULL, sum, _bias, _params); } } src += sD; @@ -604,44 +603,27 @@ namespace Simd //------------------------------------------------------------------------------------------------- - template static void SetDepthwise(const ConvParam& p, DepthwisePtr& depthwise) + template static void SetDepthwise(const ConvParam& p, DepthwisePtr& depthwise) { if (IsKernel(p, 3) && IsDilation(p, 1) && Aligned(p.dstC, F)) - { - if (Base::FmaAvoid(p.compatibility)) - depthwise = p.srcT == SimdTensorData16b ? - DepthwiseConvolution3x3 : - DepthwiseConvolution3x3; - else - depthwise = p.srcT == SimdTensorData16b ? - DepthwiseConvolution3x3 : - DepthwiseConvolution3x3; - } + depthwise = DepthwiseConvolution3x3; else - { - if (Base::FmaAvoid(p.compatibility)) - { - if (p.srcT == SimdTensorData16b) - depthwise = DepthwiseConvolution; - else - depthwise = DepthwiseConvolution; - } - else - { - if (p.srcT == SimdTensorData16b) - depthwise = DepthwiseConvolution; - else - depthwise = DepthwiseConvolution; - } - } + depthwise = DepthwiseConvolutionDefault; + } + + template static void SetDepthwise(const ConvParam& p, DepthwisePtr& depthwise) + { + return Base::FmaAvoid(p.compatibility) ? SetDepthwise(p, depthwise) : SetDepthwise(p, depthwise); + } + + template static void SetDepthwise(const ConvParam& p, DepthwisePtr& depthwise) + { + return p.dstT == SimdTensorData32f ? SetDepthwise(p, depthwise) : SetDepthwise(p, depthwise); } template static void SetDepthwise(const ConvParam& p, DepthwisePtr& depthwise) { - if (p.dstT == SimdTensorData32f) - SetDepthwise(p, depthwise); - else - SetDepthwise(p, depthwise); + return p.srcT == SimdTensorData16b ? SetDepthwise(p, depthwise) : SetDepthwise(p, depthwise); } void SetDepthwise(const ConvParam& p, DepthwisePtr& depthwise) diff --git a/src/Simd/SimdAvx512bwSynetConvolution32fNhwcDepthwise.cpp b/src/Simd/SimdAvx512bwSynetConvolution32fNhwcDepthwise.cpp index e68d235720..376ee63c23 100644 --- a/src/Simd/SimdAvx512bwSynetConvolution32fNhwcDepthwise.cpp +++ b/src/Simd/SimdAvx512bwSynetConvolution32fNhwcDepthwise.cpp @@ -35,7 +35,7 @@ namespace Simd { template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwiseDefault(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst) { - size_t srcW = p.srcW, strideX = p.strideX, dilationX = p.dilationX, kernelX = p.kernelY; + size_t srcW = p.srcW, strideX = p.strideX, dilationX = p.dilationX, kernelX = p.kernelY, sX = strideX * p.dstC; size_t dstC = p.dstC, dstCF = AlignLo(p.dstC, F), dstC2F = AlignLo(p.dstC, 2 * F), dstC4F = AlignLo(p.dstC, 4 * F); size_t dstW2 = AlignLo(p.dstW, 2), dstW4 = AlignLo(p.dstW, 4); __m512 d00, d01, d02, d03, d10, d11, d12, d13, d20, d21, d22, d23, d30, d31, d32, d33, w0; @@ -81,7 +81,7 @@ namespace Simd __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000; __mmask16 mask2 = sx + 2 * strideX < srcW ? 0xFFFF : 0x0000; __mmask16 mask3 = sx + 3 * strideX < srcW ? 0xFFFF : 0x0000; - const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * dstC, * ps2 = ps0 + 2 * dstC, * ps3 = ps0 + 3 * dstC; + const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX, * ps2 = ps0 + 2 * sX, * ps3 = ps0 + 3 * sX; w0 = _mm512_loadu_ps(pw + 0 * F); d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); @@ -154,7 +154,7 @@ namespace Simd __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000; __mmask16 mask2 = sx + 2 * strideX < srcW ? 0xFFFF : 0x0000; __mmask16 mask3 = sx + 3 * strideX < srcW ? 0xFFFF : 0x0000; - const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * dstC, * ps2 = ps0 + 2 * dstC, * ps3 = ps0 + 3 * dstC; + const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX, * ps2 = ps0 + 2 * sX, * ps3 = ps0 + 3 * sX; w0 = _mm512_loadu_ps(pw + 0 * F); d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); @@ -180,7 +180,7 @@ namespace Simd } for (; dc < dstC; dc += F) { - __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstCF - dc); + __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstC - dc); d00 = bias ? _mm512_maskz_loadu_ps(tailC, bias + dc) : _mm512_setzero_ps(); d10 = d00; d20 = d00; d30 = d00; for (size_t ky = 0; ky < p.kernelY; ++ky) @@ -198,7 +198,7 @@ namespace Simd __mmask16 mask1 = sx + 1 * strideX < srcW ? tailC : 0x0000; __mmask16 mask2 = sx + 2 * strideX < srcW ? tailC : 0x0000; __mmask16 mask3 = sx + 3 * strideX < srcW ? tailC : 0x0000; - const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * dstC, * ps2 = ps0 + 2 * dstC, * ps3 = ps0 + 3 * dstC; + const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX, * ps2 = ps0 + 2 * sX, * ps3 = ps0 + 3 * sX; w0 = _mm512_loadu_ps(pw + 0 * F); d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); @@ -250,7 +250,7 @@ namespace Simd const float* pw = pwy + kx * dstC; __mmask16 mask0 = sx + 0 * strideX < srcW ? 0xFFFF : 0x0000; __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000; - const float* ps0 = psy + sx * dstC, *ps1 = ps0 + dstC; + const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX; w0 = _mm512_loadu_ps(pw + 0 * F); d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); @@ -302,7 +302,7 @@ namespace Simd const float* pw = pwy + kx * dstC; __mmask16 mask0 = sx + 0 * strideX < srcW ? 0xFFFF : 0x0000; __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000; - const float* ps0 = psy + sx * dstC, * ps1 = ps0 + dstC; + const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX; w0 = _mm512_loadu_ps(pw + 0 * F); d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); @@ -320,7 +320,7 @@ namespace Simd } for (; dc < dstC; dc += F) { - __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstCF - dc); + __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstC - dc); d00 = bias ? _mm512_maskz_loadu_ps(tailC, bias + dc) : _mm512_setzero_ps(); d10 = d00; for (size_t ky = 0; ky < p.kernelY; ++ky) @@ -336,7 +336,7 @@ namespace Simd const float* pw = pwy + kx * dstC; __mmask16 mask0 = sx + 0 * strideX < srcW ? tailC : 0x0000; __mmask16 mask1 = sx + 1 * strideX < srcW ? tailC : 0x0000; - const float* ps0 = psy + sx * dstC, * ps1 = ps0 + dstC; + const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX; w0 = _mm512_maskz_loadu_ps(tailC, pw); d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); @@ -428,7 +428,7 @@ namespace Simd } for (; dc < dstC; dc += F) { - __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstCF - dc); + __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstC - dc); d00 = bias ? _mm512_maskz_loadu_ps(tailC, bias + dc) : _mm512_setzero_ps(); for (size_t ky = 0; ky < p.kernelY; ++ky) { diff --git a/src/Test/TestSynetConvolution16b.cpp b/src/Test/TestSynetConvolution16b.cpp index 2db673530a..52a5cd567a 100644 --- a/src/Test/TestSynetConvolution16b.cpp +++ b/src/Test/TestSynetConvolution16b.cpp @@ -255,7 +255,7 @@ namespace Test result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 321, 321, 16, _2, _1, _1, _0, _0, 1, aRe, tT, f32, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 16, 320, 320, 32, _2, _1, _1, _0, _1, 1, aRe, tT, b16, f32), c, f1, f2); #endif -#if 1 +#if 0 result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 48, 48, 256, _1, _1, _1, _0, _0, 1, aPr, tF, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 48, 48, 256, _1, _1, _1, _0, _0, 1, aPr, tF, f32, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 256, 48, 48, 256, _1, _1, _1, _0, _0, 1, aPr, tF, b16, f32), c, f1, f2); @@ -271,6 +271,10 @@ namespace Test result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 55, 15, 15, 56, _1, _1, _1, _0, _0, 1, aPr, tF, b16, b16), c, f1, f2); result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 55, 15, 15, 55, _1, _1, _1, _0, _0, 1, aPr, tF, b16, b16), c, f1, f2); #endif +#if 1 + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 608, 8, 8, 608, _1, _1, _1, _0, _0, 1, aPr, tT, b16, b16), c, f1, f2); + result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 608, 8, 8, 608, _1, _1, _1, _0, _0, 1, aPr, tT, f32, b16), c, f1, f2); +#endif #else result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 2156, 4, 4, 4, _1, _1, _1, _0, _0, 1, aId, tF, b16, b16), c, f1, f2); diff --git a/src/Test/TestSynetConvolution32f.cpp b/src/Test/TestSynetConvolution32f.cpp index e92df2271e..b65aa5b2ce 100644 --- a/src/Test/TestSynetConvolution32f.cpp +++ b/src/Test/TestSynetConvolution32f.cpp @@ -197,7 +197,7 @@ namespace Test result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 56, 128, _7, _1, _2, _3, _3, 64, a, t), f1, f2); result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 49, 29, 29, 98, _7, _1, _2, _3, _3, 49, a, t), f1, f2); #endif -#if 1 +#if 0 result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 24, 24, 96, _1, _1, _2, _0, _0, 1, aGe, tT), f1, f2); result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 14, 14, 128, _5, _1, _1, _2, _2, 1, aEl, tT), f1, f2); result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 160, 28, 28, 160, _3, _1, _2, _1, _1, 10, aRe, tT), f1, f2); @@ -246,6 +246,7 @@ namespace Test #endif #if 1 result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 48, 48, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2); + result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 20, 75, 75, 20, _5, _1, _2, _2, _2, 20, a, t), f1, f2); result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 304, 16, 16, 304, _3, _1, _1, _1, _1, 304, aId, t), f1, f2); result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 304, 16, 16, 304, _7, _1, _1, _3, _3, 304, aId, t), f1, f2); #endif diff --git a/src/Test/TestSynetMergedConvolution16b.cpp b/src/Test/TestSynetMergedConvolution16b.cpp index 85be6898fc..2cac69cda1 100644 --- a/src/Test/TestSynetMergedConvolution16b.cpp +++ b/src/Test/TestSynetMergedConvolution16b.cpp @@ -269,7 +269,7 @@ namespace Test aHi = SimdConvolutionActivationHardSigmoid, aSw = SimdConvolutionActivationSwish, aGe = SimdConvolutionActivationGelu; const SimdConvolutionActivationType a0 = aSw, a1 = aSw, a2 = aSw; #if defined(NDEBUG) -#if 1 +#if 0 result = result && SynetMergedConvolution16bForwardAutoTest(eps, Param(Shp(1, 555, 40, 23), Cnv(a1, 1, 1, 256), Cnv(a0, 3, 1), f32, b16, c), f1, f2); result = result && SynetMergedConvolution16bForwardAutoTest(eps, Param(Shp(1, 555, 40, 23), Cnv(a1, 1, 1, 256), Cnv(a0, 3, 1), b16, b16, c), f1, f2); result = result && SynetMergedConvolution16bForwardAutoTest(eps, Param(Shp(1, 555, 40, 23), Cnv(a0, 3, 2), Cnv(a1, 1, 1, 1555), f32, f32, c), f1, f2); @@ -284,6 +284,12 @@ namespace Test result = result && SynetMergedConvolution16bForwardAutoTest(eps, Param(Shp(1, 224, 2, 3), Cnv(a0, 1, 1, 64), Cnv(a1, 3, 2), Cnv(a2, 1, 1, 128), f32, f32, c), f1, f2); result = result && SynetMergedConvolution16bForwardAutoTest(eps, Param(Shp(1, 116, 15, 5), Cnv(a1, 3, 2), Cnv(a2, 1, 1, 116), f32, f32, c), f1, f2); #endif +#if 1 + result = result && SynetMergedConvolution16bForwardAutoTest(eps, Param(Shp(1, 76, 64, 64), Cnv(a1, 7, 1), Cnv(a2, 1, 1, 304), f32, b16, c), f1, f2); + result = result && SynetMergedConvolution16bForwardAutoTest(eps, Param(Shp(1, 152, 32, 32), Cnv(a1, 7, 1), Cnv(a2, 1, 1, 608), f32, b16, c), f1, f2); + result = result && SynetMergedConvolution16bForwardAutoTest(eps, Param(Shp(1, 304, 16, 16), Cnv(a1, 7, 1), Cnv(a2, 1, 1, 1216), f32, b16, c), f1, f2); + result = result && SynetMergedConvolution16bForwardAutoTest(eps, Param(Shp(1, 608, 8, 8), Cnv(a1, 7, 1), Cnv(a2, 1, 1, 2432), f32, b16, c), f1, f2); +#endif #if 0 { Param p(Shp(1, 64, 20, 60), Cnv(a0, 1, 1, 128), Cnv(a1, 3, 1), b16, b16, c);