Describes convolution (deconvolution) parameters. It is used in SimdSynetConvolution32fInit, SimdSynetConvolution8iInit, SimdSynetDeconvolution32fInit, SimdSynetMergedConvolution32fInit and SimdSynetMergedConvolution8iInit.
+
◆ srcC
diff --git a/prj/vs2019/AmxBf16.vcxproj b/prj/vs2019/AmxBf16.vcxproj
index 70cc4f0c94..b5c0028f1e 100644
--- a/prj/vs2019/AmxBf16.vcxproj
+++ b/prj/vs2019/AmxBf16.vcxproj
@@ -46,8 +46,6 @@
-
-
@@ -75,8 +73,6 @@
-
-
diff --git a/prj/vs2019/AmxBf16.vcxproj.filters b/prj/vs2019/AmxBf16.vcxproj.filters
index 53e2cea50f..14c9226f6b 100644
--- a/prj/vs2019/AmxBf16.vcxproj.filters
+++ b/prj/vs2019/AmxBf16.vcxproj.filters
@@ -82,9 +82,6 @@
Inc
-
- Inc
-
Inc
@@ -109,9 +106,6 @@
AmxBf16
-
- Inc
-
Inc
@@ -197,12 +191,6 @@
AmxBf16
-
- AmxBf16
-
-
- AmxBf16
-
AmxBf16
diff --git a/prj/vs2019/Avx2.vcxproj b/prj/vs2019/Avx2.vcxproj
index 00fb9813bf..e6a409cec8 100644
--- a/prj/vs2019/Avx2.vcxproj
+++ b/prj/vs2019/Avx2.vcxproj
@@ -90,7 +90,6 @@
-
@@ -213,7 +212,6 @@
-
diff --git a/prj/vs2019/Avx2.vcxproj.filters b/prj/vs2019/Avx2.vcxproj.filters
index 074d39a745..15a0c591cd 100644
--- a/prj/vs2019/Avx2.vcxproj.filters
+++ b/prj/vs2019/Avx2.vcxproj.filters
@@ -358,9 +358,6 @@
Avx2
-
- Avx2
-
Avx2
@@ -653,9 +650,6 @@
Inc
-
- Inc
-
Inc
diff --git a/prj/vs2019/Avx512bw.vcxproj b/prj/vs2019/Avx512bw.vcxproj
index 2115f2aebf..22739535a1 100644
--- a/prj/vs2019/Avx512bw.vcxproj
+++ b/prj/vs2019/Avx512bw.vcxproj
@@ -93,7 +93,6 @@
-
@@ -215,7 +214,6 @@
-
diff --git a/prj/vs2019/Avx512bw.vcxproj.filters b/prj/vs2019/Avx512bw.vcxproj.filters
index e1d6c8680d..d9a468ba39 100644
--- a/prj/vs2019/Avx512bw.vcxproj.filters
+++ b/prj/vs2019/Avx512bw.vcxproj.filters
@@ -367,9 +367,6 @@
Avx512bw
-
- Avx512bw
-
Avx512bw
@@ -662,9 +659,6 @@
Inc
-
- Inc
-
Inc
diff --git a/prj/vs2019/Base.vcxproj b/prj/vs2019/Base.vcxproj
index 7de2a88ba1..8159ee926d 100644
--- a/prj/vs2019/Base.vcxproj
+++ b/prj/vs2019/Base.vcxproj
@@ -84,7 +84,6 @@
-
@@ -197,7 +196,6 @@
-
diff --git a/prj/vs2019/Base.vcxproj.filters b/prj/vs2019/Base.vcxproj.filters
index fbf47b21c5..142727b100 100644
--- a/prj/vs2019/Base.vcxproj.filters
+++ b/prj/vs2019/Base.vcxproj.filters
@@ -289,9 +289,6 @@
Base
-
- Base
-
Base
@@ -597,9 +594,6 @@
Inc
-
- Inc
-
Inc
diff --git a/prj/vs2019/Neon.vcxproj b/prj/vs2019/Neon.vcxproj
index eae09277a8..096d0c8ebb 100644
--- a/prj/vs2019/Neon.vcxproj
+++ b/prj/vs2019/Neon.vcxproj
@@ -178,7 +178,6 @@
-
diff --git a/prj/vs2019/Neon.vcxproj.filters b/prj/vs2019/Neon.vcxproj.filters
index c75406cce7..712dd641d1 100644
--- a/prj/vs2019/Neon.vcxproj.filters
+++ b/prj/vs2019/Neon.vcxproj.filters
@@ -518,9 +518,6 @@
Inc
-
- Inc
-
Inc
diff --git a/prj/vs2019/Sse41.vcxproj b/prj/vs2019/Sse41.vcxproj
index 8f13fe2c0e..12f817df39 100644
--- a/prj/vs2019/Sse41.vcxproj
+++ b/prj/vs2019/Sse41.vcxproj
@@ -97,7 +97,6 @@
-
@@ -220,7 +219,6 @@
-
diff --git a/prj/vs2019/Sse41.vcxproj.filters b/prj/vs2019/Sse41.vcxproj.filters
index 50ec19bc4d..a02005fcec 100644
--- a/prj/vs2019/Sse41.vcxproj.filters
+++ b/prj/vs2019/Sse41.vcxproj.filters
@@ -382,9 +382,6 @@
Sse41
-
- Sse41
-
Sse41
@@ -677,9 +674,6 @@
Inc
-
- Inc
-
Inc
diff --git a/prj/vs2022/AmxBf16.vcxproj b/prj/vs2022/AmxBf16.vcxproj
index 70cc4f0c94..b5c0028f1e 100644
--- a/prj/vs2022/AmxBf16.vcxproj
+++ b/prj/vs2022/AmxBf16.vcxproj
@@ -46,8 +46,6 @@
-
-
@@ -75,8 +73,6 @@
-
-
diff --git a/prj/vs2022/AmxBf16.vcxproj.filters b/prj/vs2022/AmxBf16.vcxproj.filters
index 53e2cea50f..14c9226f6b 100644
--- a/prj/vs2022/AmxBf16.vcxproj.filters
+++ b/prj/vs2022/AmxBf16.vcxproj.filters
@@ -82,9 +82,6 @@
Inc
-
- Inc
-
Inc
@@ -109,9 +106,6 @@
AmxBf16
-
- Inc
-
Inc
@@ -197,12 +191,6 @@
AmxBf16
-
- AmxBf16
-
-
- AmxBf16
-
AmxBf16
diff --git a/prj/vs2022/Avx2.vcxproj b/prj/vs2022/Avx2.vcxproj
index 00fb9813bf..e6a409cec8 100644
--- a/prj/vs2022/Avx2.vcxproj
+++ b/prj/vs2022/Avx2.vcxproj
@@ -90,7 +90,6 @@
-
@@ -213,7 +212,6 @@
-
diff --git a/prj/vs2022/Avx2.vcxproj.filters b/prj/vs2022/Avx2.vcxproj.filters
index 074d39a745..15a0c591cd 100644
--- a/prj/vs2022/Avx2.vcxproj.filters
+++ b/prj/vs2022/Avx2.vcxproj.filters
@@ -358,9 +358,6 @@
Avx2
-
- Avx2
-
Avx2
@@ -653,9 +650,6 @@
Inc
-
- Inc
-
Inc
diff --git a/prj/vs2022/Avx512bw.vcxproj b/prj/vs2022/Avx512bw.vcxproj
index 2115f2aebf..22739535a1 100644
--- a/prj/vs2022/Avx512bw.vcxproj
+++ b/prj/vs2022/Avx512bw.vcxproj
@@ -93,7 +93,6 @@
-
@@ -215,7 +214,6 @@
-
diff --git a/prj/vs2022/Avx512bw.vcxproj.filters b/prj/vs2022/Avx512bw.vcxproj.filters
index e1d6c8680d..d9a468ba39 100644
--- a/prj/vs2022/Avx512bw.vcxproj.filters
+++ b/prj/vs2022/Avx512bw.vcxproj.filters
@@ -367,9 +367,6 @@
Avx512bw
-
- Avx512bw
-
Avx512bw
@@ -662,9 +659,6 @@
Inc
-
- Inc
-
Inc
diff --git a/prj/vs2022/Base.vcxproj b/prj/vs2022/Base.vcxproj
index 7de2a88ba1..8159ee926d 100644
--- a/prj/vs2022/Base.vcxproj
+++ b/prj/vs2022/Base.vcxproj
@@ -84,7 +84,6 @@
-
@@ -197,7 +196,6 @@
-
diff --git a/prj/vs2022/Base.vcxproj.filters b/prj/vs2022/Base.vcxproj.filters
index fbf47b21c5..142727b100 100644
--- a/prj/vs2022/Base.vcxproj.filters
+++ b/prj/vs2022/Base.vcxproj.filters
@@ -289,9 +289,6 @@
Base
-
- Base
-
Base
@@ -597,9 +594,6 @@
Inc
-
- Inc
-
Inc
diff --git a/prj/vs2022/Neon.vcxproj b/prj/vs2022/Neon.vcxproj
index eae09277a8..096d0c8ebb 100644
--- a/prj/vs2022/Neon.vcxproj
+++ b/prj/vs2022/Neon.vcxproj
@@ -178,7 +178,6 @@
-
diff --git a/prj/vs2022/Neon.vcxproj.filters b/prj/vs2022/Neon.vcxproj.filters
index c75406cce7..712dd641d1 100644
--- a/prj/vs2022/Neon.vcxproj.filters
+++ b/prj/vs2022/Neon.vcxproj.filters
@@ -518,9 +518,6 @@
Inc
-
- Inc
-
Inc
diff --git a/prj/vs2022/Sse41.vcxproj b/prj/vs2022/Sse41.vcxproj
index 8f13fe2c0e..12f817df39 100644
--- a/prj/vs2022/Sse41.vcxproj
+++ b/prj/vs2022/Sse41.vcxproj
@@ -97,7 +97,6 @@
-
@@ -220,7 +219,6 @@
-
diff --git a/prj/vs2022/Sse41.vcxproj.filters b/prj/vs2022/Sse41.vcxproj.filters
index 50ec19bc4d..a02005fcec 100644
--- a/prj/vs2022/Sse41.vcxproj.filters
+++ b/prj/vs2022/Sse41.vcxproj.filters
@@ -382,9 +382,6 @@
Sse41
-
- Sse41
-
Sse41
@@ -677,9 +674,6 @@
Inc
-
- Inc
-
Inc
diff --git a/src/Simd/SimdAmxBf16SynetConvolution32f.cpp b/src/Simd/SimdAmxBf16SynetConvolution32f.cpp
deleted file mode 100644
index cd8a55c765..0000000000
--- a/src/Simd/SimdAmxBf16SynetConvolution32f.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2024 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#include "Simd/SimdSynetConvolution32fBf16.h"
-#include "Simd/SimdSynet.h"
-
-namespace Simd
-{
-#if (defined(SIMD_AMXBF16_ENABLE) || (defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_AMX_EMULATE))) && defined(SIMD_SYNET_ENABLE)
- namespace AmxBf16
- {
- void* SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters* conv, SimdSynetCompatibilityType compatibility)
- {
- ConvParam param(batch, conv, compatibility);
- if (!param.Valid(SimdTensorData32f))
- return NULL;
- else if (Base::Bf16Soft(compatibility) || Base::Bf16Hard(compatibility))
- {
- if (Base::SynetConvolution32fBf16NhwcGemm::Preferable(param))
- return new SynetConvolution32fBf16NhwcGemm(param);
- else
- return new Base::SynetConvolution32fBf16Gemm(param);
- }
- return Avx512bw::SynetConvolution32fInit(batch, conv, compatibility);
- }
- }
-#endif
-}
diff --git a/src/Simd/SimdAmxBf16SynetConvolution32fBf16NhwcGemm.cpp b/src/Simd/SimdAmxBf16SynetConvolution32fBf16NhwcGemm.cpp
deleted file mode 100644
index 3f00ccc25d..0000000000
--- a/src/Simd/SimdAmxBf16SynetConvolution32fBf16NhwcGemm.cpp
+++ /dev/null
@@ -1,489 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2024 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdSynetConvolution32fBf16.h"
-#include "Simd/SimdSynetConvolution32fCommon.h"
-#include "Simd/SimdSynetConvolution16bCommon.h"
-#include "Simd/SimdBFloat16.h"
-#include "Simd/SimdSynet.h"
-#include "Simd/SimdAvx512bw.h"
-#include "Simd/SimdAmxBf16.h"
-#include "Simd/SimdCpu.h"
-#include "Simd/SimdTile.h"
-
-namespace Simd
-{
-#if (defined(SIMD_AMXBF16_ENABLE) || (defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_AMX_EMULATE))) && defined(SIMD_SYNET_ENABLE)
- namespace AmxBf16
- {
- typedef Base::SynetConvolution32fBf16NhwcGemm::AlgParam AlgParam;
- typedef Base::SynetConvolution32fBf16NhwcGemm::ConvolutionPtr Convolution;
-
- //-------------------------------------------------------------------------------------------------
-
- static void ConvertBf16NhwcGemm1x1(const float* src, const ConvParam& p, const SynetConvolution32fBf16NhwcGemm::AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
- {
- size_t srcC32 = AlignLo(p.srcC, 32);
- __mmask16 srcMask[2];
- __mmask32 dstMask[1];
- if (srcC32 < p.srcC)
- {
- srcMask[0] = TailMask16(p.srcC - srcC32 - F * 0);
- srcMask[1] = TailMask16(p.srcC - srcC32 - F * 1);
- dstMask[0] = __mmask32(-1);
- }
- src += yBeg * p.srcW * p.srcC;
- if (a.macroK < a.bufK)
- {
- //SIMD_PERF_BEG("reorder");
- size_t bodyK = AlignLoAny(a.bufK, a.macroK), tailK = a.bufK - bodyK;
- for (size_t dy = yBeg, dr = (b * p.dstH + dy) * p.dstW; dy < yEnd; ++dy)
- {
- for (size_t dx = 0; dx < p.dstW; ++dx, ++dr)
- {
- size_t sc = 0, mak = 0;
- for (; mak < bodyK; mak += a.macroK)
- {
- uint16_t* buf = dst + mak * a.bufM + dr * a.macroK;
- for (size_t scE = mak + a.macroK; sc < scE; sc += 32)
- Float32ToBFloat16
(src + sc, buf + sc - mak, srcMask, dstMask);
- }
- if(tailK)
- {
- uint16_t* buf = dst + mak * a.bufM + dr * tailK;
- for (; sc < srcC32; sc += 32)
- Float32ToBFloat16(src + sc, buf + sc - mak, srcMask, dstMask);
- if (srcC32 < p.srcC)
- Float32ToBFloat16(src + sc, buf + sc - mak, srcMask, dstMask);
- }
- src += p.srcC;
- }
- }
- }
- else if (srcC32 < p.srcC)
- {
- //SIMD_PERF_BEG("direct");
- dst += b * p.dstH * p.dstW * a.bufK;
- for (size_t dy = yBeg; dy < yEnd; ++dy)
- {
- for (size_t dx = 0; dx < p.dstW; ++dx)
- {
- size_t sc = 0;
- for (; sc < srcC32; sc += 32)
- Float32ToBFloat16(src + sc, dst + sc, srcMask, dstMask);
- if (srcC32 < p.srcC)
- Float32ToBFloat16(src + sc, dst + sc, srcMask, dstMask);
- src += p.srcC;
- dst += a.bufK;
- }
- }
- }
- else
- {
- //SIMD_PERF_BEG("solid");
- dst += b * p.dstH * p.dstW * a.bufK;
- for (size_t n = (yEnd - yBeg) * p.srcW * p.srcC, i = 0; i < n; i += 32)
- Float32ToBFloat16(src + i, dst + i, srcMask, dstMask);
- }
- }
-
- static void ConvertBf16NhwcGemm(const float* src, const ConvParam& p, const SynetConvolution32fBf16NhwcGemm::AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
- {
- //SIMD_PERF_FUNC();
-
- size_t srcC32 = AlignLo(p.srcC, 32);
- __mmask16 srcMask[2];
- __mmask32 dstMask[1];
- if (srcC32 < p.srcC)
- {
- srcMask[0] = TailMask16(p.srcC - srcC32 - F * 0);
- srcMask[1] = TailMask16(p.srcC - srcC32 - F * 1);
- dstMask[0] = TailMask32(p.srcC - srcC32);
- }
- uint16_t* buf = dst + a.bufM * a.bufK;
- size_t gap = a.bufK - a.K;
- __mmask32 gapMask = TailMask32(gap);
- for (size_t dy = yBeg, dr = (a.macroK < a.bufK ? dy * p.dstW : 0) + b * p.dstH * p.dstW; dy < yEnd; ++dy)
- {
- for (size_t dx = 0; dx < p.dstW; ++dx, ++dr)
- {
- uint16_t* row = a.macroK < a.bufK ? buf : dst + dr * a.bufK;
- for (size_t ky = 0, k = 0; ky < p.kernelY; ky++)
- {
- size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
- if (sy < p.srcH)
- {
- for (size_t kx = 0; kx < p.kernelX; kx++)
- {
- size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
- if (sx < p.srcW)
- {
- const float* ps = src + (sy * p.srcW + sx) * p.srcC;
- size_t sc = 0;
- for (; sc < srcC32; sc += 32)
- Float32ToBFloat16(ps + sc, row + sc, srcMask, dstMask);
- if (srcC32 < p.srcC)
- Float32ToBFloat16(ps + sc, row + sc, srcMask, dstMask);
- row += p.srcC;
- }
- else
- {
- memset(row, 0, p.srcC * 2);
- row += p.srcC;
- }
- }
- }
- else
- {
- memset(row, 0, p.kernelX * p.srcC * 2);
- row += p.kernelX * p.srcC;
- }
- }
- if (gap)
- {
- _mm512_mask_storeu_epi16(row, gapMask, _mm512_setzero_si512());
- row += gap;
- }
- if (a.macroK < a.bufK)
- {
- for (size_t mak = 0; mak < a.bufK; mak += a.macroK)
- {
- size_t macroK = Simd::Min(a.bufK, mak + a.macroK) - mak;
- memcpy(dst + mak * a.bufM + dr * macroK, buf + mak, macroK * 2);
- }
- }
- }
- }
- }
-
- //-------------------------------------------------------------------------------------------------
-
- template void ConvolutionBf16NhwcGemm_2x2(const uint16_t* src0, const ConvParam& p,
- size_t srcC, size_t dstS, size_t dstC, int zero, const uint16_t* weight0, const __m512* bias, const __m512* params, float* dst)
- {
- int dD = (int)p.dstC, strideS = (int)srcC * 2, strideW = 128, strideD = dD * 4;
- const uint16_t* src1 = src0 + srcC * 16, *weight1 = weight0 + 32;
-
- TileConf conf;
- conf.rows[0] = 16;
- conf.rows[1] = 16;
- conf.rows[2] = uint8_t(dstS - 16);
- conf.rows[3] = uint8_t(dstS - 16);
- conf.rows[4] = 16;
- conf.rows[5] = uint8_t(dstS - 16);
- conf.rows[6] = 16;
- conf.rows[7] = 16;
- conf.colsb[0] = 64;
- conf.colsb[1] = uint16_t((dstC - 16) * 4);
- conf.colsb[2] = 64;
- conf.colsb[3] = uint16_t((dstC - 16) * 4);
- conf.colsb[4] = 64;
- conf.colsb[5] = 64;
- conf.colsb[6] = 64;
- conf.colsb[7] = uint16_t((dstC - 16) * 4);
- _tile_loadconfig(&conf);
-
- if (zero)
- {
- _tile_zero(0);
- _tile_zero(1);
- _tile_zero(2);
- _tile_zero(3);
- }
- else
- {
- _tile_stream_loadd(0, dst + 0, strideD);
- _tile_stream_loadd(1, dst + F, strideD);
- _tile_stream_loadd(2, dst + 16 * dD + 0, strideD);
- _tile_stream_loadd(3, dst + 16 * dD + F, strideD);
- }
- for (size_t sc = 0; sc < srcC; sc += 32)
- {
- _tile_stream_loadd(4, src0 + sc, strideS);
- _tile_loadd(6, weight0 + sc * 32, strideW);
- _tile_dpbf16ps(0, 4, 6);
- _tile_loadd(7, weight1 + sc * 32, strideW);
- _tile_dpbf16ps(1, 4, 7);
- _tile_stream_loadd(5, src1 + sc, strideS);
- _tile_dpbf16ps(2, 5, 6);
- _tile_dpbf16ps(3, 5, 7);
- }
- _tile_stored(0, dst + 0, strideD);
- _tile_stored(1, dst + F, strideD);
- _tile_stored(2, dst + 16 * dD + 0, strideD);
- _tile_stored(3, dst + 16 * dD + F, strideD);
-
- if (type)
- {
- __mmask16 tailD = TailMask16(dstC - F);
- size_t dstS8 = AlignLo(dstS, 8), s = 0;
- for (; s < dstS8; s += 8, dst += 8 * dD)
- Apply2x8(dst, dD, bias, params, tailD);
- for(; s < dstS; ++s, dst += dD)
- Apply2(dst, bias, params, tailD);
- }
- }
-
- template void ConvolutionBf16NhwcGemm_2x1(const uint16_t* src0, const ConvParam& p,
- size_t srcC, size_t dstS, size_t dstC, int zero, const uint16_t* weight0, const __m512* bias, const __m512* params, float* dst)
- {
- size_t dD = p.dstC;
- int strideS = (int)srcC * 2, strideW = 128, strideD = (int)dD * 4;
- const uint16_t* src1 = src0 + srcC * 16;
-
- TileConf conf;
- conf.rows[0] = 16;
- conf.rows[2] = uint8_t(dstS - 16);
- conf.rows[4] = 16;
- conf.rows[5] = uint8_t(dstS - 16);
- conf.rows[6] = 16;
- conf.colsb[0] = uint16_t(dstC * 4);
- conf.colsb[2] = uint16_t(dstC * 4);
- conf.colsb[4] = 64;
- conf.colsb[5] = 64;
- conf.colsb[6] = uint16_t(dstC * 4);
- _tile_loadconfig(&conf);
-
- if (zero)
- {
- _tile_zero(0);
- _tile_zero(2);
- }
- else
- {
- _tile_stream_loadd(0, dst + 0, strideD);
- _tile_stream_loadd(2, dst + 16 * dD + 0, strideD);
- }
- for (size_t sc = 0; sc < srcC; sc += 32)
- {
- _tile_stream_loadd(4, src0 + sc, strideS);
- _tile_loadd(6, weight0 + sc * 32, strideW);
- _tile_dpbf16ps(0, 4, 6);
- _tile_stream_loadd(5, src1 + sc, strideS);
- _tile_dpbf16ps(2, 5, 6);
- }
- _tile_stored(0, dst + 0, strideD);
- _tile_stored(2, dst + 16 * dD + 0, strideD);
-
- if (type)
- {
- __mmask16 tailD = TailMask16(dstC);
- size_t dstS8 = AlignLo(dstS, 8), s = 0;
- for (; s < dstS8; s += 8, dst += 8 * dD)
- Apply1x8(dst, dD, bias, params, tailD);
- for (; s < dstS; ++s, dst += dD)
- Apply1(dst, bias, params, tailD);
- }
- }
-
- template void ConvolutionBf16NhwcGemm_1x2(const uint16_t* src0, const ConvParam& p,
- size_t srcC, size_t dstS, size_t dstC, int zero, const uint16_t* weight0, const __m512* bias, const __m512* params, float* dst)
- {
- size_t dD = p.dstC;
- int strideS = (int)srcC * 2, strideW = 128, strideD = (int)dD * 4;
- const uint16_t* weight1 = weight0 + 32;
-
- TileConf conf;
- conf.rows[0] = uint8_t(dstS);
- conf.rows[1] = uint8_t(dstS);
- conf.rows[4] = uint8_t(dstS);
- conf.rows[6] = 16;
- conf.rows[7] = 16;
- conf.colsb[0] = 64;
- conf.colsb[1] = uint16_t(dstC - 16) * 4;
- conf.colsb[4] = 64;
- conf.colsb[6] = 64;
- conf.colsb[7] = uint16_t(dstC - 16) * 4;
- _tile_loadconfig(&conf);
-
- if (zero)
- {
- _tile_zero(0);
- _tile_zero(1);
- }
- else
- {
- _tile_stream_loadd(0, dst + 0, strideD);
- _tile_stream_loadd(1, dst + F, strideD);
- }
- for (size_t sc = 0; sc < srcC; sc += 32)
- {
- _tile_stream_loadd(4, src0 + sc, strideS);
- _tile_loadd(6, weight0 + sc * 32, strideW);
- _tile_dpbf16ps(0, 4, 6);
- _tile_loadd(7, weight1 + sc * 32, strideW);
- _tile_dpbf16ps(1, 4, 7);
- }
- _tile_stored(0, dst + 0, strideD);
- _tile_stored(1, dst + F, strideD);
-
- if (type)
- {
- __mmask16 tailD = TailMask16(dstC - F);
- size_t dstS8 = AlignLo(dstS, 8), s = 0;
- for (; s < dstS8; s += 8, dst += 8 * dD)
- Apply2x8(dst, dD, bias, params, tailD);
- for (; s < dstS; ++s, dst += dD)
- Apply2(dst, bias, params, tailD);
- }
- }
-
- template void ConvolutionBf16NhwcGemm_1x1(const uint16_t* src0, const ConvParam& p,
- size_t srcC, size_t dstS, size_t dstC, int zero, const uint16_t* weight0, const __m512* bias, const __m512* params, float* dst)
- {
- size_t dD = p.dstC;
- int strideS = (int)srcC * 2, strideW = 128, strideD = (int)dD * 4;
-
- TileConf conf;
- conf.rows[0] = uint8_t(dstS);
- conf.rows[4] = uint8_t(dstS);
- conf.rows[6] = 16;
- conf.colsb[0] = uint16_t(dstC * 4);
- conf.colsb[4] = 64;
- conf.colsb[6] = uint16_t(dstC * 4);
- _tile_loadconfig(&conf);
-
- if (zero)
- {
- _tile_zero(0);
- }
- else
- {
- _tile_stream_loadd(0, dst + 0, strideD);
- }
- for (size_t sc = 0; sc < srcC; sc += 32)
- {
- _tile_stream_loadd(4, src0 + sc, strideS);
- _tile_loadd(6, weight0 + sc * 32, strideW);
- _tile_dpbf16ps(0, 4, 6);
- }
- _tile_stored(0, dst + 0, strideD);
-
- if (type)
- {
- __mmask16 tailD = TailMask16(dstC);
- size_t dstS8 = AlignLo(dstS, 8), s = 0;
- for (; s < dstS8; s += 8, dst += 8 * dD)
- Apply1x8(dst, dD, bias, params, tailD);
- for (; s < dstS; ++s, dst += dD)
- Apply1(dst, bias, params, tailD);
- }
- }
-
- typedef void (*ConvolutionBf16NhwcGemmPtr)(const uint16_t* src0, const ConvParam& p,
- size_t srcC, size_t dstS, size_t dstC, int zero, const uint16_t* weight0, const __m512* bias, const __m512* params, float* dst);
-
- template void ConvolutionBf16NhwcGemm_2(const uint16_t* src, const ConvParam& p,
- size_t dstC, size_t dstH, size_t srcC, int zero, const uint16_t* weight, const float* bias, const float* params, float* dst)
- {
- //SIMD_PERF_FUNC();
-
- size_t n = 32, n1 = dstH * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn, dW = AlignHi(srcC, 2) * DF;
- ConvolutionBf16NhwcGemmPtr body_2 = ConvolutionBf16NhwcGemm_2x2;
- ConvolutionBf16NhwcGemmPtr tail_2 = m > 16 ? ConvolutionBf16NhwcGemm_2x2 : ConvolutionBf16NhwcGemm_1x2;
- ConvolutionBf16NhwcGemmPtr body_1 = ConvolutionBf16NhwcGemm_2x1;
- ConvolutionBf16NhwcGemmPtr tail_1 = m > 16 ? ConvolutionBf16NhwcGemm_2x1 : ConvolutionBf16NhwcGemm_1x1;
-
- __m512 _params[2], _bias[2];
- _params[0] = _mm512_set1_ps(params[0]);
- if (type == SimdConvolutionActivationRestrictRange ||
- type == SimdConvolutionActivationHswish ||
- type == SimdConvolutionActivationHardSigmoid)
- _params[1] = _mm512_set1_ps(params[1]);
-
- for (size_t dc = 0; dc < dstC; dc += DF)
- {
- size_t dC = Simd::Min(DF, dstC - dc);
- _bias[0] = _mm512_loadu_ps(bias + dc + 0);
- _bias[1] = _mm512_loadu_ps(bias + dc + F);
- if (type == ::SimdConvolutionActivationPrelu)
- {
- _params[0] = _mm512_loadu_ps(params + dc + 0);
- _params[1] = _mm512_loadu_ps(params + dc + F);
- }
- float* d = dst;
- const uint16_t* s = src;
- size_t i = 0;
- if (dC > F)
- {
- for (; i < nn; i += n, s += n * srcC, d += n * p.dstC)
- body_2(s, p, srcC, n, dC, zero, weight, _bias, _params, d);
- if (m)
- tail_2(s, p, srcC, m, dC, zero, weight, _bias, _params, d);
- }
- else
- {
- for (; i < nn; i += n, s += n * srcC, d += n * p.dstC)
- body_1(s, p, srcC, n, dC, zero, weight, _bias, _params, d);
- if (m)
- tail_1(s, p, srcC, m, dC, zero, weight, _bias, _params, d);
- }
- weight += dW;
- dst += DF;
- }
- }
-
- //-------------------------------------------------------------------------------------------------
-
- template SIMD_INLINE void Set(const ConvParam& p, const AlgParam& a, Convolution* convolutions)
- {
- convolutions[TermLast] = ConvolutionBf16NhwcGemm_2;
- convolutions[TermInterim] = ConvolutionBf16NhwcGemm_2;
- }
-
- SynetConvolution32fBf16NhwcGemm::SynetConvolution32fBf16NhwcGemm(const ConvParam & p)
- : Avx512bw::SynetConvolution32fBf16NhwcGemm(p)
- {
- size_t microD = 16 * 2;
- size_t microM = 16 * 2;
- size_t microC = 16 * 2;
-#if !defined(SIMD_AMX_EMULATE)
- if (p.srcC* p.kernelX * p.kernelY < 1 * microC)
- return;
-#endif
- SetAlgParam(microD, microM, microC, Base::AlgCacheL1(), Base::AlgCacheL2() / 2, Base::AlgCacheL3());
-#if !defined(SIMD_AMX_EMULATE)
- if(p.Is1x1())
- _convert = ConvertBf16NhwcGemm1x1;
- else
- _convert = ConvertBf16NhwcGemm;
-#endif
- switch (p.activation)
- {
- case SimdConvolutionActivationIdentity: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationRelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationLeakyRelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationRestrictRange: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationPrelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationElu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationHswish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationMish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationHardSigmoid: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationSwish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationGelu: Set(p, _alg, _convolutions); break;
- default: assert(0);
- }
- }
- }
-#endif
-}
diff --git a/src/Simd/SimdAmxBf16SynetDeconvolution16bNhwcGemm.cpp b/src/Simd/SimdAmxBf16SynetDeconvolution16bNhwcGemm.cpp
index cec8824258..942ac24a78 100644
--- a/src/Simd/SimdAmxBf16SynetDeconvolution16bNhwcGemm.cpp
+++ b/src/Simd/SimdAmxBf16SynetDeconvolution16bNhwcGemm.cpp
@@ -271,6 +271,7 @@ namespace Simd
void Deconvolution16bNhwcGemm_2(const uint16_t* src, const DeconvParam& p, const AlgParam& a, size_t M, size_t N, size_t K, int zero, const uint16_t* wgt, float* dst)
{
+ //SIMD_PERF_FUNC();
size_t m = 32, mm = AlignLoAny(M, m), t = M - mm;
size_t dS = a.bufK, dW = a.bufK * DF, dD = a.bufN;
Deconvolution16bNhwcGemmPtr body_2 = Deconvolution16bNhwcGemm_32x32;
diff --git a/src/Simd/SimdAvx2SynetConvolution32f.cpp b/src/Simd/SimdAvx2SynetConvolution32f.cpp
index 97e09da753..9a02d2480a 100644
--- a/src/Simd/SimdAvx2SynetConvolution32f.cpp
+++ b/src/Simd/SimdAvx2SynetConvolution32f.cpp
@@ -23,7 +23,6 @@
*/
#include "Simd/SimdSynetConvolution32f.h"
#include "Simd/SimdSynetConvolution32fCommon.h"
-#include "Simd/SimdSynetConvolution32fBf16.h"
#include "Simd/SimdSet.h"
#include "Simd/SimdLoad.h"
#include "Simd/SimdAvx2.h"
@@ -808,19 +807,12 @@ namespace Simd
//-------------------------------------------------------------------------------------------------
- void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility)
+ void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv)
{
- ConvParam param(batch, conv, compatibility);
+ ConvParam param(batch, conv, SimdSynetCompatibilityDefault);
if (!param.Valid(SimdTensorData32f))
return NULL;
- else if (Base::Bf16Soft(compatibility))
- {
- if (Base::SynetConvolution32fBf16NhwcGemm::Preferable(param))
- return new Avx2::SynetConvolution32fBf16NhwcGemm(param);
- else
- return new Base::SynetConvolution32fBf16Gemm(param);
- }
- else if (SynetConvolution32fDepthwiseDotProduct::Preferable(param))
+ if (SynetConvolution32fDepthwiseDotProduct::Preferable(param))
return new SynetConvolution32fDepthwiseDotProduct(param);
else if (SynetConvolution32fWinograd::Preferable(param))
return new SynetConvolution32fWinograd(param);
@@ -838,5 +830,5 @@ namespace Simd
return new SynetConvolution32fGemmNN(param);
}
}
-#endif//SIMD_AVX2_ENABLE
+#endif
}
diff --git a/src/Simd/SimdAvx2SynetConvolution32fBf16NhwcGemm.cpp b/src/Simd/SimdAvx2SynetConvolution32fBf16NhwcGemm.cpp
deleted file mode 100644
index 282a0f872a..0000000000
--- a/src/Simd/SimdAvx2SynetConvolution32fBf16NhwcGemm.cpp
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2024 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdSynetConvolution32fBf16.h"
-#include "Simd/SimdSynetConvolution32fCommon.h"
-#include "Simd/SimdBFloat16.h"
-#include "Simd/SimdSynet.h"
-#include "Simd/SimdAvx2.h"
-#include "Simd/SimdCpu.h"
-
-namespace Simd
-{
-#if defined(SIMD_AVX2_ENABLE) && defined(SIMD_SYNET_ENABLE)
- namespace Avx2
- {
- typedef Base::SynetConvolution32fBf16NhwcGemm::AlgParam AlgParam;
- typedef Base::SynetConvolution32fBf16NhwcGemm::ConvolutionPtr Convolution;
-
- //-----------------------------------------------------------------------------------------
-
- static void ConvertBf16NhwcGemm(const float* src, const ConvParam& p, const SynetConvolution32fBf16NhwcGemm::AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
- {
- size_t srcC16 = Simd::AlignLo(p.srcC, 16);
- size_t srcC8 = Simd::AlignLo(p.srcC, 8);
- size_t srcC4 = Simd::AlignLo(p.srcC, 4);
- uint16_t* buf = dst + a.bufM * a.bufK;
- size_t gap = a.bufK - a.K;
- for (size_t dy = yBeg, dr = (a.macroK < a.bufK ? dy * p.dstW : 0) + b * p.dstH * p.dstW; dy < yEnd; ++dy)
- {
- for (size_t dx = 0; dx < p.dstW; ++dx, ++dr)
- {
- uint16_t* row = a.macroK < a.bufK ? buf : dst + dr * a.bufK;
- for (size_t ky = 0, k = 0; ky < p.kernelY; ky++)
- {
- size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
- if (sy < p.srcH)
- {
- for (size_t kx = 0; kx < p.kernelX; kx++)
- {
- size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
- if (sx < p.srcW)
- {
- const float* ps = src + (sy * p.srcW + sx) * p.srcC;
- size_t sc = 0;
- for (; sc < srcC16; sc += 16)
- {
- __m256i d0 = Float32ToBFloat16(_mm256_loadu_ps(ps + sc + 0));
- __m256i d1 = Float32ToBFloat16(_mm256_loadu_ps(ps + sc + 8));
- _mm256_storeu_si256((__m256i*)(row + sc), _mm256_permute4x64_epi64(_mm256_packus_epi32(d0, d1), 0xD8));
- }
- for (; sc < srcC8; sc += 8)
- {
- __m128i d0 = Sse41::Float32ToBFloat16(_mm_loadu_ps(ps + sc + 0));
- __m128i d1 = Sse41::Float32ToBFloat16(_mm_loadu_ps(ps + sc + 4));
- _mm_storeu_si128((__m128i*)(row + sc), _mm_packus_epi32(d0, d1));
- }
- for (; sc < srcC4; sc += 4)
- {
- __m128i d0 = Sse41::Float32ToBFloat16(_mm_loadu_ps(ps + sc + 0));
- _mm_storel_epi64((__m128i*)(row + sc), _mm_packus_epi32(d0, Sse41::K_ZERO));
- }
- for (; sc < p.srcC; ++sc)
- row[sc] = Base::Float32ToBFloat16(ps[sc]);
- row += p.srcC;
- }
- else
- {
- memset(row, 0, p.srcC * 2);
- row += p.srcC;
- }
- }
- }
- else
- {
- memset(row, 0, p.kernelX * p.srcC * 2);
- row += p.kernelX * p.srcC;
- }
- }
- for (size_t g = 0; g < gap; ++g)
- *(row++) = 0;
- if (a.macroK < a.bufK)
- {
- for (size_t mak = 0; mak < a.bufK; mak += a.macroK)
- {
- size_t macroK = Simd::Min(a.bufK, mak + a.macroK) - mak;
- memcpy(dst + mak * a.bufM + dr * macroK, buf + mak, macroK * 2);
- }
- }
- }
- }
- }
-
- //-----------------------------------------------------------------------------------------
-
- template void ConvolutionBf16NhwcGemm_2xM(const uint16_t* src0,
- const ConvParam& p, size_t srcC, size_t dstC, int zero, const uint16_t* weight, const __m256* bias, const __m256* params, float* dst)
- {
- __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, s0, w00, w01, w10, w11, m = _mm256_castsi256_ps(Bf16::MASK);
- size_t dD = p.dstC;
- const uint16_t* src1 = src0 + 1 * srcC;
- const uint16_t* src2 = src0 + 2 * srcC;
- const uint16_t* src3 = src0 + 3 * srcC;
- const uint16_t* src4 = src0 + 4 * srcC;
- if (dstC > F)
- {
- if (zero)
- {
- if (M > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps();
- if (M > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps();
- if (M > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps();
- if (M > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps();
- if (M > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps();
- }
- else
- {
- if (M > 0) d00 = _mm256_loadu_ps(dst + 0 * dD + 0), d01 = _mm256_loadu_ps(dst + 0 * dD + F);
- if (M > 1) d10 = _mm256_loadu_ps(dst + 1 * dD + 0), d11 = _mm256_loadu_ps(dst + 1 * dD + F);
- if (M > 2) d20 = _mm256_loadu_ps(dst + 2 * dD + 0), d21 = _mm256_loadu_ps(dst + 2 * dD + F);
- if (M > 3) d30 = _mm256_loadu_ps(dst + 3 * dD + 0), d31 = _mm256_loadu_ps(dst + 3 * dD + F);
- if (M > 4) d40 = _mm256_loadu_ps(dst + 4 * dD + 0), d41 = _mm256_loadu_ps(dst + 4 * dD + F);
- }
- for (size_t offs = 0; offs < srcC; offs += 2)
- {
- w01 = _mm256_loadu_ps((float*)weight + 0);
- w00 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(w01), Base::Bf16::SHIFT));
- w01 = _mm256_and_ps(w01, m);
- w11 = _mm256_loadu_ps((float*)weight + F);
- w10 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(w11), Base::Bf16::SHIFT));
- w11 = _mm256_and_ps(w11, m);
- if (M > 0)
- {
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src0 + offs - 1)), m);
- d00 = _mm256_fmadd_ps(s0, w00, d00);
- d01 = _mm256_fmadd_ps(s0, w10, d01);
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src0 + offs - 0)), m);
- d00 = _mm256_fmadd_ps(s0, w01, d00);
- d01 = _mm256_fmadd_ps(s0, w11, d01);
- }
- if (M > 1)
- {
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src1 + offs - 1)), m);
- d10 = _mm256_fmadd_ps(s0, w00, d10);
- d11 = _mm256_fmadd_ps(s0, w10, d11);
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src1 + offs - 0)), m);
- d10 = _mm256_fmadd_ps(s0, w01, d10);
- d11 = _mm256_fmadd_ps(s0, w11, d11);
- }
- if (M > 2)
- {
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src2 + offs - 1)), m);
- d20 = _mm256_fmadd_ps(s0, w00, d20);
- d21 = _mm256_fmadd_ps(s0, w10, d21);
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src2 + offs - 0)), m);
- d20 = _mm256_fmadd_ps(s0, w01, d20);
- d21 = _mm256_fmadd_ps(s0, w11, d21);
- }
- if (M > 3)
- {
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src3 + offs - 1)), m);
- d30 = _mm256_fmadd_ps(s0, w00, d30);
- d31 = _mm256_fmadd_ps(s0, w10, d31);
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src3 + offs - 0)), m);
- d30 = _mm256_fmadd_ps(s0, w01, d30);
- d31 = _mm256_fmadd_ps(s0, w11, d31);
- }
- if (M > 4)
- {
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src4 + offs - 1)), m);
- d40 = _mm256_fmadd_ps(s0, w00, d40);
- d41 = _mm256_fmadd_ps(s0, w10, d41);
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src4 + offs - 0)), m);
- d40 = _mm256_fmadd_ps(s0, w01, d40);
- d41 = _mm256_fmadd_ps(s0, w11, d41);
- }
- weight += QF;
- }
- if (dstC == DF)
- {
- if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD;
- if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD;
- if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD;
- if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD;
- if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD;
- }
- else
- {
- dstC -= F;
- if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD;
- if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD;
- if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD;
- if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD;
- if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD;
- }
- }
- else
- {
- if (zero)
- {
- if (M > 0) d00 = _mm256_setzero_ps();
- if (M > 1) d10 = _mm256_setzero_ps();
- if (M > 2) d20 = _mm256_setzero_ps();
- if (M > 3) d30 = _mm256_setzero_ps();
- if (M > 4) d40 = _mm256_setzero_ps();
- }
- else
- {
- if (M > 0) d00 = _mm256_loadu_ps(dst + 0 * dD + 0);
- if (M > 1) d10 = _mm256_loadu_ps(dst + 1 * dD + 0);
- if (M > 2) d20 = _mm256_loadu_ps(dst + 2 * dD + 0);
- if (M > 3) d30 = _mm256_loadu_ps(dst + 3 * dD + 0);
- if (M > 4) d40 = _mm256_loadu_ps(dst + 4 * dD + 0);
- }
- for (size_t offs = 0; offs < srcC; offs += 2)
- {
- w01 = _mm256_loadu_ps((float*)weight + 0);
- w00 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(w01), Base::Bf16::SHIFT));
- w01 = _mm256_and_ps(w01, m);
- if (M > 0)
- {
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src0 + offs - 1)), m);
- d00 = _mm256_fmadd_ps(s0, w00, d00);
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src0 + offs - 0)), m);
- d00 = _mm256_fmadd_ps(s0, w01, d00);
- }
- if (M > 1)
- {
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src1 + offs - 1)), m);
- d10 = _mm256_fmadd_ps(s0, w00, d10);
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src1 + offs - 0)), m);
- d10 = _mm256_fmadd_ps(s0, w01, d10);
- }
- if (M > 2)
- {
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src2 + offs - 1)), m);
- d20 = _mm256_fmadd_ps(s0, w00, d20);
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src2 + offs - 0)), m);
- d20 = _mm256_fmadd_ps(s0, w01, d20);
- }
- if (M > 3)
- {
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src3 + offs - 1)), m);
- d30 = _mm256_fmadd_ps(s0, w00, d30);
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src3 + offs - 0)), m);
- d30 = _mm256_fmadd_ps(s0, w01, d30);
- }
- if (M > 4)
- {
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src4 + offs - 1)), m);
- d40 = _mm256_fmadd_ps(s0, w00, d40);
- s0 = _mm256_and_ps(_mm256_set1_ps(*(float*)(src4 + offs - 0)), m);
- d40 = _mm256_fmadd_ps(s0, w01, d40);
- }
- weight += QF;
- }
- if (dstC == F)
- {
- if (M > 0) Save1(dst, d00, bias, params), dst += dD;
- if (M > 1) Save1(dst, d10, bias, params), dst += dD;
- if (M > 2) Save1(dst, d20, bias, params), dst += dD;
- if (M > 3) Save1(dst, d30, bias, params), dst += dD;
- if (M > 4) Save1(dst, d40, bias, params), dst += dD;
- }
- else
- {
- if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD;
- if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD;
- if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD;
- if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD;
- if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD;
- }
- }
- }
-
- typedef void(*ConvolutionBf16NhwcGemm_2xM_Ptr)(const uint16_t* src0, const ConvParam& p, size_t srcC,
- size_t dstC, int zero, const uint16_t* weight, const __m256* bias, const __m256* params, float* dst);
-
- template ConvolutionBf16NhwcGemm_2xM_Ptr GetConvolutionBf16NhwcGemm_2xM(size_t M)
- {
- switch (M)
- {
- case 0: return NULL;
- case 1: return ConvolutionBf16NhwcGemm_2xM;
- case 2: return ConvolutionBf16NhwcGemm_2xM;
- case 3: return ConvolutionBf16NhwcGemm_2xM;
- case 4: return ConvolutionBf16NhwcGemm_2xM;
- case 5: return ConvolutionBf16NhwcGemm_2xM;
- }
- assert(0);
- return NULL;
- }
-
- template void ConvolutionBf16NhwcGemm_2(const uint16_t* src, const ConvParam& p,
- size_t dstC, size_t dstH, size_t srcC, int zero, const uint16_t* weight, const float* bias, const float* params, float* dst)
- {
- size_t n1 = dstH * p.dstW, n = 5;
- size_t nn = AlignLoAny(n1, n), m = n1 - nn, dW = AlignHi(srcC, 2) * DF;
- ConvolutionBf16NhwcGemm_2xM_Ptr convolution_2xN = GetConvolutionBf16NhwcGemm_2xM(n);
- ConvolutionBf16NhwcGemm_2xM_Ptr convolution_2xM = GetConvolutionBf16NhwcGemm_2xM(m);
-
- __m256 _params[2], _bias[2];
- _params[0] = _mm256_set1_ps(params[0]);
- if (type == SimdConvolutionActivationRestrictRange ||
- type == SimdConvolutionActivationHswish ||
- type == SimdConvolutionActivationHardSigmoid)
- _params[1] = _mm256_set1_ps(params[1]);
-
- for (size_t dc = 0; dc < dstC; dc += DF)
- {
- size_t dC = Simd::Min(DF, dstC - dc);
- _bias[0] = _mm256_loadu_ps(bias + dc + 0);
- _bias[1] = _mm256_loadu_ps(bias + dc + F);
- if (type == ::SimdConvolutionActivationPrelu)
- {
- _params[0] = _mm256_loadu_ps(params + dc + 0);
- _params[1] = _mm256_loadu_ps(params + dc + F);
- }
- float* d = dst;
- const uint16_t* s = src;
- size_t i = 0;
- for (; i < nn; i += n, s += n * srcC, d += n * p.dstC)
- convolution_2xN(s, p, srcC, dC, zero, weight, _bias, _params, d);
- for (; i < n1; i += m, s += m * srcC, d += m * p.dstC)
- convolution_2xM(s, p, srcC, dC, zero, weight, _bias, _params, d);
- weight += dW;
- dst += DF;
- }
- }
-
- //-----------------------------------------------------------------------------------------
-
- template SIMD_INLINE void Set(const ConvParam& p, const AlgParam& a, Convolution* convolutions)
- {
- convolutions[TermLast] = ConvolutionBf16NhwcGemm_2;
- convolutions[TermInterim] = ConvolutionBf16NhwcGemm_2;
- }
-
- SynetConvolution32fBf16NhwcGemm::SynetConvolution32fBf16NhwcGemm(const ConvParam& p)
- : Sse41::SynetConvolution32fBf16NhwcGemm(p)
- {
- SetAlgParam(F * 2, 5, 2, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3());
- _convert = ConvertBf16NhwcGemm;
- switch (p.activation)
- {
- case SimdConvolutionActivationIdentity: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationRelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationLeakyRelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationRestrictRange: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationPrelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationElu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationHswish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationMish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationHardSigmoid: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationSwish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationGelu: Set(p, _alg, _convolutions); break;
- default: assert(0);
- }
- }
- }
-#endif
-}
diff --git a/src/Simd/SimdAvx512bwSynetConvolution32f.cpp b/src/Simd/SimdAvx512bwSynetConvolution32f.cpp
index c3afb132d9..70e1f33c96 100644
--- a/src/Simd/SimdAvx512bwSynetConvolution32f.cpp
+++ b/src/Simd/SimdAvx512bwSynetConvolution32f.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "Simd/SimdSynetConvolution32f.h"
-#include "Simd/SimdSynetConvolution32fBf16.h"
#include "Simd/SimdAvx512bw.h"
#include "Simd/SimdSynet.h"
#include "Simd/SimdExp.h"
@@ -839,19 +838,12 @@ namespace Simd
//-----------------------------------------------------------------------------------------
- void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility)
+ void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv)
{
- ConvParam param(batch, conv, compatibility);
+ ConvParam param(batch, conv, SimdSynetCompatibilityDefault);
if (!param.Valid(SimdTensorData32f))
return NULL;
- else if (Base::Bf16Soft(compatibility))
- {
- if (Base::SynetConvolution32fBf16NhwcGemm::Preferable(param))
- return new Avx512bw::SynetConvolution32fBf16NhwcGemm(param);
- else
- return new Base::SynetConvolution32fBf16Gemm(param);
- }
- else if (Avx2::SynetConvolution32fDepthwiseDotProduct::Preferable(param))
+ if (Avx2::SynetConvolution32fDepthwiseDotProduct::Preferable(param))
return new Avx2::SynetConvolution32fDepthwiseDotProduct(param);
else if (SynetConvolution32fWinograd::Preferable(param))
return new SynetConvolution32fWinograd(param);
@@ -869,5 +861,5 @@ namespace Simd
return new SynetConvolution32fGemmNN(param);
}
}
-#endif//SIMD_AVX2_ENABLE
+#endif
}
diff --git a/src/Simd/SimdAvx512bwSynetConvolution32fBf16NhwcGemm.cpp b/src/Simd/SimdAvx512bwSynetConvolution32fBf16NhwcGemm.cpp
deleted file mode 100644
index 06c770e5e4..0000000000
--- a/src/Simd/SimdAvx512bwSynetConvolution32fBf16NhwcGemm.cpp
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2024 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdSynetConvolution32fBf16.h"
-#include "Simd/SimdSynetConvolution32fCommon.h"
-#include "Simd/SimdBFloat16.h"
-#include "Simd/SimdSynet.h"
-#include "Simd/SimdAvx512bw.h"
-#include "Simd/SimdAvx2.h"
-#include "Simd/SimdCpu.h"
-
-namespace Simd
-{
-#if defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_SYNET_ENABLE)
- namespace Avx512bw
- {
- typedef Base::SynetConvolution32fBf16NhwcGemm::AlgParam AlgParam;
- typedef Base::SynetConvolution32fBf16NhwcGemm::ConvolutionPtr Convolution;
-
- //-----------------------------------------------------------------------------------------
-
- static void ConvertBf16NhwcGemm(const float* src, const ConvParam& p, const SynetConvolution32fBf16NhwcGemm::AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
- {
- size_t srcC32 = AlignLo(p.srcC, 32);
- __mmask16 srcMask[2];
- __mmask32 dstMask[1];
- if (srcC32 < p.srcC)
- {
- srcMask[0] = TailMask16(p.srcC - srcC32 - F * 0);
- srcMask[1] = TailMask16(p.srcC - srcC32 - F * 1);
- dstMask[0] = TailMask32(p.srcC - srcC32);
- }
- uint16_t* buf = dst + a.bufM * a.bufK;
- size_t gap = a.bufK - a.K;
- for (size_t dy = yBeg, dr = (a.macroK < a.bufK ? dy * p.dstW : 0) + b * p.dstH * p.dstW; dy < yEnd; ++dy)
- {
- for (size_t dx = 0; dx < p.dstW; ++dx, ++dr)
- {
- uint16_t* row = a.macroK < a.bufK ? buf : dst + dr * a.bufK;
- for (size_t ky = 0, k = 0; ky < p.kernelY; ky++)
- {
- size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
- if (sy < p.srcH)
- {
- for (size_t kx = 0; kx < p.kernelX; kx++)
- {
- size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
- if (sx < p.srcW)
- {
- const float* ps = src + (sy * p.srcW + sx) * p.srcC;
- size_t sc = 0;
- for (; sc < srcC32; sc += 32)
- Float32ToBFloat16(ps + sc, row + sc, srcMask, dstMask);
- if (srcC32 < p.srcC)
- Float32ToBFloat16(ps + sc, row + sc, srcMask, dstMask);
- row += p.srcC;
- }
- else
- {
- memset(row, 0, p.srcC * 2);
- row += p.srcC;
- }
- }
- }
- else
- {
- memset(row, 0, p.kernelX * p.srcC * 2);
- row += p.kernelX * p.srcC;
- }
- }
- for (size_t g = 0; g < gap; ++g)
- *(row++) = 0;
- if (a.macroK < a.bufK)
- {
- for (size_t mak = 0; mak < a.bufK; mak += a.macroK)
- {
- size_t macroK = Simd::Min(a.bufK, mak + a.macroK) - mak;
- memcpy(dst + mak * a.bufM + dr * macroK, buf + mak, macroK * 2);
- }
- }
- }
- }
- }
-
- //-----------------------------------------------------------------------------------------
-
- template void ConvolutionBf16NhwcGemm_2xM(const uint16_t* src0, const ConvParam& p,
- size_t srcC, int zero, const uint16_t* weight, const __m512* bias, const __m512* params, float* dst, const __mmask16 tails[2])
- {
- __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51,
- d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1,
- s0, w00, w01, w10, w11, m = _mm512_castsi512_ps(Bf16::MASK);
- size_t dD = p.dstC;
- const uint16_t* src1 = src0 + 1 * srcC;
- const uint16_t* src2 = src0 + 2 * srcC;
- const uint16_t* src3 = src0 + 3 * srcC;
- const uint16_t* src4 = src0 + 4 * srcC;
- const uint16_t* src5 = src0 + 5 * srcC;
- if (tails[1])
- {
- if (zero)
- {
- if (M > 0x0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps();
- if (M > 0x1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps();
- if (M > 0x2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps();
- if (M > 0x3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps();
- if (M > 0x4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps();
- if (M > 0x5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps();
- if (M > 0x6) d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps();
- if (M > 0x7) d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps();
- if (M > 0x8) d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps();
- if (M > 0x9) d90 = _mm512_setzero_ps(), d91 = _mm512_setzero_ps();
- if (M > 0xa) da0 = _mm512_setzero_ps(), da1 = _mm512_setzero_ps();
- if (M > 0xb) db0 = _mm512_setzero_ps(), db1 = _mm512_setzero_ps();
- }
- else
- {
- if (M > 0x0) d00 = _mm512_loadu_ps(dst + 0x0 * dD + 0), d01 = _mm512_maskz_loadu_ps(tails[1], dst + 0x0 * dD + F);
- if (M > 0x1) d10 = _mm512_loadu_ps(dst + 0x1 * dD + 0), d11 = _mm512_maskz_loadu_ps(tails[1], dst + 0x1 * dD + F);
- if (M > 0x2) d20 = _mm512_loadu_ps(dst + 0x2 * dD + 0), d21 = _mm512_maskz_loadu_ps(tails[1], dst + 0x2 * dD + F);
- if (M > 0x3) d30 = _mm512_loadu_ps(dst + 0x3 * dD + 0), d31 = _mm512_maskz_loadu_ps(tails[1], dst + 0x3 * dD + F);
- if (M > 0x4) d40 = _mm512_loadu_ps(dst + 0x4 * dD + 0), d41 = _mm512_maskz_loadu_ps(tails[1], dst + 0x4 * dD + F);
- if (M > 0x5) d50 = _mm512_loadu_ps(dst + 0x5 * dD + 0), d51 = _mm512_maskz_loadu_ps(tails[1], dst + 0x5 * dD + F);
- if (M > 0x6) d60 = _mm512_loadu_ps(dst + 0x6 * dD + 0), d61 = _mm512_maskz_loadu_ps(tails[1], dst + 0x6 * dD + F);
- if (M > 0x7) d70 = _mm512_loadu_ps(dst + 0x7 * dD + 0), d71 = _mm512_maskz_loadu_ps(tails[1], dst + 0x7 * dD + F);
- if (M > 0x8) d80 = _mm512_loadu_ps(dst + 0x8 * dD + 0), d81 = _mm512_maskz_loadu_ps(tails[1], dst + 0x8 * dD + F);
- if (M > 0x9) d90 = _mm512_loadu_ps(dst + 0x9 * dD + 0), d91 = _mm512_maskz_loadu_ps(tails[1], dst + 0x9 * dD + F);
- if (M > 0xa) da0 = _mm512_loadu_ps(dst + 0xa * dD + 0), da1 = _mm512_maskz_loadu_ps(tails[1], dst + 0xa * dD + F);
- if (M > 0xb) db0 = _mm512_loadu_ps(dst + 0xb * dD + 0), db1 = _mm512_maskz_loadu_ps(tails[1], dst + 0xb * dD + F);
- }
- for (size_t offs0 = 0, offs6 = offs0 + 6 * srcC; offs0 < srcC; offs0 += 2, offs6 += 2)
- {
- w01 = _mm512_loadu_ps((float*)weight + 0);
- w00 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(w01), Base::Bf16::SHIFT));
- w01 = _mm512_and_ps(w01, m);
- w11 = _mm512_loadu_ps((float*)weight + F);
- w10 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(w11), Base::Bf16::SHIFT));
- w11 = _mm512_and_ps(w11, m);
- if (M > 0x0)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src0[offs0]));
- d00 = _mm512_fmadd_ps(s0, w00, d00); d01 = _mm512_fmadd_ps(s0, w10, d01);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src0 + offs0)), m);
- d00 = _mm512_fmadd_ps(s0, w01, d00); d01 = _mm512_fmadd_ps(s0, w11, d01);
- }
- if (M > 0x1)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src1[offs0]));
- d10 = _mm512_fmadd_ps(s0, w00, d10); d11 = _mm512_fmadd_ps(s0, w10, d11);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src1 + offs0)), m);
- d10 = _mm512_fmadd_ps(s0, w01, d10); d11 = _mm512_fmadd_ps(s0, w11, d11);
- }
- if (M > 0x2)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src2[offs0]));
- d20 = _mm512_fmadd_ps(s0, w00, d20); d21 = _mm512_fmadd_ps(s0, w10, d21);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src2 + offs0)), m);
- d20 = _mm512_fmadd_ps(s0, w01, d20); d21 = _mm512_fmadd_ps(s0, w11, d21);
- }
- if (M > 0x3)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src3[offs0]));
- d30 = _mm512_fmadd_ps(s0, w00, d30); d31 = _mm512_fmadd_ps(s0, w10, d31);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src3 + offs0)), m);
- d30 = _mm512_fmadd_ps(s0, w01, d30); d31 = _mm512_fmadd_ps(s0, w11, d31);
- }
- if (M > 0x4)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src4[offs0]));
- d40 = _mm512_fmadd_ps(s0, w00, d40); d41 = _mm512_fmadd_ps(s0, w10, d41);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src4 + offs0)), m);
- d40 = _mm512_fmadd_ps(s0, w01, d40); d41 = _mm512_fmadd_ps(s0, w11, d41);
- }
- if (M > 0x5)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src5[offs0]));
- d50 = _mm512_fmadd_ps(s0, w00, d50); d51 = _mm512_fmadd_ps(s0, w10, d51);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src5 + offs0)), m);
- d50 = _mm512_fmadd_ps(s0, w01, d50); d51 = _mm512_fmadd_ps(s0, w11, d51);
- }
- if (M > 0x6)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src0[offs6]));
- d60 = _mm512_fmadd_ps(s0, w00, d60); d61 = _mm512_fmadd_ps(s0, w10, d61);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src0 + offs6)), m);
- d60 = _mm512_fmadd_ps(s0, w01, d60); d61 = _mm512_fmadd_ps(s0, w11, d61);
- }
- if (M > 0x7)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src1[offs6]));
- d70 = _mm512_fmadd_ps(s0, w00, d70); d71 = _mm512_fmadd_ps(s0, w10, d71);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src1 + offs6)), m);
- d70 = _mm512_fmadd_ps(s0, w01, d70); d71 = _mm512_fmadd_ps(s0, w11, d71);
- }
- if (M > 0x8)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src2[offs6]));
- d80 = _mm512_fmadd_ps(s0, w00, d80); d81 = _mm512_fmadd_ps(s0, w10, d81);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src2 + offs6)), m);
- d80 = _mm512_fmadd_ps(s0, w01, d80); d81 = _mm512_fmadd_ps(s0, w11, d81);
- }
- if (M > 0x9)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src3[offs6]));
- d90 = _mm512_fmadd_ps(s0, w00, d90); d91 = _mm512_fmadd_ps(s0, w10, d91);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src3 + offs6)), m);
- d90 = _mm512_fmadd_ps(s0, w01, d90); d91 = _mm512_fmadd_ps(s0, w11, d91);
- }
- if (M > 0xa)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src4[offs6]));
- da0 = _mm512_fmadd_ps(s0, w00, da0); da1 = _mm512_fmadd_ps(s0, w10, da1);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src4 + offs6)), m);
- da0 = _mm512_fmadd_ps(s0, w01, da0); da1 = _mm512_fmadd_ps(s0, w11, da1);
- }
- if (M > 0xb)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src5[offs6]));
- db0 = _mm512_fmadd_ps(s0, w00, db0); db1 = _mm512_fmadd_ps(s0, w10, db1);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src5 + offs6)), m);
- db0 = _mm512_fmadd_ps(s0, w01, db0); db1 = _mm512_fmadd_ps(s0, w11, db1);
- }
- weight += QF;
- }
- if (M > 0x0) Save2(dst, d00, d01, bias, params, tails), dst += dD;
- if (M > 0x1) Save2(dst, d10, d11, bias, params, tails), dst += dD;
- if (M > 0x2) Save2(dst, d20, d21, bias, params, tails), dst += dD;
- if (M > 0x3) Save2(dst, d30, d31, bias, params, tails), dst += dD;
- if (M > 0x4) Save2(dst, d40, d41, bias, params, tails), dst += dD;
- if (M > 0x5) Save2(dst, d50, d51, bias, params, tails), dst += dD;
- if (M > 0x6) Save2(dst, d60, d61, bias, params, tails), dst += dD;
- if (M > 0x7) Save2(dst, d70, d71, bias, params, tails), dst += dD;
- if (M > 0x8) Save2(dst, d80, d81, bias, params, tails), dst += dD;
- if (M > 0x9) Save2(dst, d90, d91, bias, params, tails), dst += dD;
- if (M > 0xa) Save2(dst, da0, da1, bias, params, tails), dst += dD;
- if (M > 0xb) Save2(dst, db0, db1, bias, params, tails), dst += dD;
- }
- else
- {
- if (zero)
- {
- if (M > 0x0) d00 = _mm512_setzero_ps();
- if (M > 0x1) d10 = _mm512_setzero_ps();
- if (M > 0x2) d20 = _mm512_setzero_ps();
- if (M > 0x3) d30 = _mm512_setzero_ps();
- if (M > 0x4) d40 = _mm512_setzero_ps();
- if (M > 0x5) d50 = _mm512_setzero_ps();
- if (M > 0x6) d60 = _mm512_setzero_ps();
- if (M > 0x7) d70 = _mm512_setzero_ps();
- if (M > 0x8) d80 = _mm512_setzero_ps();
- if (M > 0x9) d90 = _mm512_setzero_ps();
- if (M > 0xa) da0 = _mm512_setzero_ps();
- if (M > 0xb) db0 = _mm512_setzero_ps();
- }
- else
- {
- if (M > 0x0) d00 = _mm512_maskz_loadu_ps(tails[0], dst + 0x0 * dD + 0);
- if (M > 0x1) d10 = _mm512_maskz_loadu_ps(tails[0], dst + 0x1 * dD + 0);
- if (M > 0x2) d20 = _mm512_maskz_loadu_ps(tails[0], dst + 0x2 * dD + 0);
- if (M > 0x3) d30 = _mm512_maskz_loadu_ps(tails[0], dst + 0x3 * dD + 0);
- if (M > 0x4) d40 = _mm512_maskz_loadu_ps(tails[0], dst + 0x4 * dD + 0);
- if (M > 0x5) d50 = _mm512_maskz_loadu_ps(tails[0], dst + 0x5 * dD + 0);
- if (M > 0x6) d60 = _mm512_maskz_loadu_ps(tails[0], dst + 0x6 * dD + 0);
- if (M > 0x7) d70 = _mm512_maskz_loadu_ps(tails[0], dst + 0x7 * dD + 0);
- if (M > 0x8) d80 = _mm512_maskz_loadu_ps(tails[0], dst + 0x8 * dD + 0);
- if (M > 0x9) d90 = _mm512_maskz_loadu_ps(tails[0], dst + 0x9 * dD + 0);
- if (M > 0xa) da0 = _mm512_maskz_loadu_ps(tails[0], dst + 0xa * dD + 0);
- if (M > 0xb) db0 = _mm512_maskz_loadu_ps(tails[0], dst + 0xb * dD + 0);
- }
- for (size_t offs0 = 0, offs6 = offs0 + 6 * srcC; offs0 < srcC; offs0 += 2, offs6 += 2)
- {
- w01 = _mm512_loadu_ps((float*)weight + 0);
- w00 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(w01), Base::Bf16::SHIFT));
- w01 = _mm512_and_ps(w01, m);
- if (M > 0x0)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src0[offs0]));
- d00 = _mm512_fmadd_ps(s0, w00, d00);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src0 + offs0)), m);
- d00 = _mm512_fmadd_ps(s0, w01, d00);
- }
- if (M > 0x1)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src1[offs0]));
- d10 = _mm512_fmadd_ps(s0, w00, d10);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src1 + offs0)), m);
- d10 = _mm512_fmadd_ps(s0, w01, d10);
- }
- if (M > 0x2)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src2[offs0]));
- d20 = _mm512_fmadd_ps(s0, w00, d20);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src2 + offs0)), m);
- d20 = _mm512_fmadd_ps(s0, w01, d20);
- }
- if (M > 0x3)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src3[offs0]));
- d30 = _mm512_fmadd_ps(s0, w00, d30);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src3 + offs0)), m);
- d30 = _mm512_fmadd_ps(s0, w01, d30);
- }
- if (M > 0x4)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src4[offs0]));
- d40 = _mm512_fmadd_ps(s0, w00, d40);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src4 + offs0)), m);
- d40 = _mm512_fmadd_ps(s0, w01, d40);
- }
- if (M > 0x5)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src5[offs0]));
- d50 = _mm512_fmadd_ps(s0, w00, d50);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src5 + offs0)), m);
- d50 = _mm512_fmadd_ps(s0, w01, d50);
- }
- if (M > 0x6)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src0[offs6]));
- d60 = _mm512_fmadd_ps(s0, w00, d60);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src0 + offs6)), m);
- d60 = _mm512_fmadd_ps(s0, w01, d60);
- }
- if (M > 0x7)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src1[offs6]));
- d70 = _mm512_fmadd_ps(s0, w00, d70);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src1 + offs6)), m);
- d70 = _mm512_fmadd_ps(s0, w01, d70);
- }
- if (M > 0x8)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src2[offs6]));
- d80 = _mm512_fmadd_ps(s0, w00, d80);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src2 + offs6)), m);
- d80 = _mm512_fmadd_ps(s0, w01, d80);
- }
- if (M > 0x9)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src3[offs6]));
- d90 = _mm512_fmadd_ps(s0, w00, d90);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src3 + offs6)), m);
- d90 = _mm512_fmadd_ps(s0, w01, d90);
- }
- if (M > 0xa)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src4[offs6]));
- da0 = _mm512_fmadd_ps(s0, w00, da0);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src4 + offs6)), m);
- da0 = _mm512_fmadd_ps(s0, w01, da0);
- }
- if (M > 0xb)
- {
- s0 = _mm512_castsi512_ps(_mm512_maskz_set1_epi16(0xAAAAAAAA, src5[offs6]));
- db0 = _mm512_fmadd_ps(s0, w00, db0);
- s0 = _mm512_and_ps(_mm512_set1_ps(*(float*)(src5 + offs6)), m);
- db0 = _mm512_fmadd_ps(s0, w01, db0);
- }
- weight += QF;
- }
- if (M > 0x0) Save1(dst, d00, bias, params, tails), dst += dD;
- if (M > 0x1) Save1(dst, d10, bias, params, tails), dst += dD;
- if (M > 0x2) Save1(dst, d20, bias, params, tails), dst += dD;
- if (M > 0x3) Save1(dst, d30, bias, params, tails), dst += dD;
- if (M > 0x4) Save1(dst, d40, bias, params, tails), dst += dD;
- if (M > 0x5) Save1(dst, d50, bias, params, tails), dst += dD;
- if (M > 0x6) Save1(dst, d60, bias, params, tails), dst += dD;
- if (M > 0x7) Save1(dst, d70, bias, params, tails), dst += dD;
- if (M > 0x8) Save1(dst, d80, bias, params, tails), dst += dD;
- if (M > 0x9) Save1(dst, d90, bias, params, tails), dst += dD;
- if (M > 0xa) Save1(dst, da0, bias, params, tails), dst += dD;
- if (M > 0xb) Save1(dst, db0, bias, params, tails), dst += dD;
- }
- }
-
- typedef void(*ConvolutionBf16NhwcGemm_2xM_Ptr)(const uint16_t* src0, const ConvParam& p, size_t srcC, int zero,
- const uint16_t* weight, const __m512* bias, const __m512* params, float* dst, const __mmask16 tails[2]);
-
- template ConvolutionBf16NhwcGemm_2xM_Ptr GetConvolutionBf16NhwcGemm_2xM(size_t M)
- {
- switch (M)
- {
- case 0x0: return NULL;
- case 0x1: return ConvolutionBf16NhwcGemm_2xM;
- case 0x2: return ConvolutionBf16NhwcGemm_2xM;
- case 0x3: return ConvolutionBf16NhwcGemm_2xM;
- case 0x4: return ConvolutionBf16NhwcGemm_2xM;
- case 0x5: return ConvolutionBf16NhwcGemm_2xM;
- case 0x6: return ConvolutionBf16NhwcGemm_2xM;
- case 0x7: return ConvolutionBf16NhwcGemm_2xM;
- case 0x8: return ConvolutionBf16NhwcGemm_2xM;
- case 0x9: return ConvolutionBf16NhwcGemm_2xM;
- case 0xa: return ConvolutionBf16NhwcGemm_2xM;
- case 0xb: return ConvolutionBf16NhwcGemm_2xM;
- case 0xc: return ConvolutionBf16NhwcGemm_2xM;
- }
- assert(0);
- return NULL;
- }
-
- template void ConvolutionBf16NhwcGemm_2(const uint16_t* src, const ConvParam& p,
- size_t dstC, size_t dstH, size_t srcC, int zero, const uint16_t* weight, const float* bias, const float* params, float* dst)
- {
- size_t n1 = dstH * p.dstW, n = 12;
- size_t nn = AlignLoAny(n1, n), m = n1 - nn, dW = AlignHi(srcC, 2) * DF;
- ConvolutionBf16NhwcGemm_2xM_Ptr convolution_2xN = GetConvolutionBf16NhwcGemm_2xM(n);
- ConvolutionBf16NhwcGemm_2xM_Ptr convolution_2xM = GetConvolutionBf16NhwcGemm_2xM(m);
-
- __m512 _params[2], _bias[2];
- _params[0] = _mm512_set1_ps(params[0]);
- if (type == SimdConvolutionActivationRestrictRange ||
- type == SimdConvolutionActivationHswish ||
- type == SimdConvolutionActivationHardSigmoid)
- _params[1] = _mm512_set1_ps(params[1]);
-
- for (size_t dc = 0; dc < dstC; dc += DF)
- {
- size_t dC = Simd::Min(DF, dstC - dc);
- __mmask16 tails[2] = { TailMask16(dC), TailMask16(dC - F) };
- _bias[0] = _mm512_loadu_ps(bias + dc + 0);
- _bias[1] = _mm512_loadu_ps(bias + dc + F);
- if (type == ::SimdConvolutionActivationPrelu)
- {
- _params[0] = _mm512_loadu_ps(params + dc + 0);
- _params[1] = _mm512_loadu_ps(params + dc + F);
- }
- float* d = dst;
- const uint16_t* s = src;
- size_t i = 0;
- for (; i < nn; i += n, s += n * srcC, d += n * p.dstC)
- convolution_2xN(s, p, srcC, zero, weight, _bias, _params, d, tails);
- for (; i < n1; i += m, s += m * srcC, d += m * p.dstC)
- convolution_2xM(s, p, srcC, zero, weight, _bias, _params, d, tails);
- weight += dW;
- dst += DF;
- }
- }
-
- //-----------------------------------------------------------------------------------------
-
- template SIMD_INLINE void Set(const ConvParam& p, const AlgParam& a, Convolution* convolutions)
- {
- convolutions[TermLast] = ConvolutionBf16NhwcGemm_2;
- convolutions[TermInterim] = ConvolutionBf16NhwcGemm_2;
- }
-
- SynetConvolution32fBf16NhwcGemm::SynetConvolution32fBf16NhwcGemm(const ConvParam& p)
- : Avx2::SynetConvolution32fBf16NhwcGemm(p)
- {
- SetAlgParam(F * 2, 12, 2, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3());
- _convert = ConvertBf16NhwcGemm;
- switch (p.activation)
- {
- case SimdConvolutionActivationIdentity: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationRelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationLeakyRelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationRestrictRange: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationPrelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationElu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationHswish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationMish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationHardSigmoid: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationSwish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationGelu: Set(p, _alg, _convolutions); break;
- default: assert(0);
- }
- }
- }
-#endif
-}
diff --git a/src/Simd/SimdBaseSynetConvolution32f.cpp b/src/Simd/SimdBaseSynetConvolution32f.cpp
index 5611175fa0..8cc1bb6930 100644
--- a/src/Simd/SimdBaseSynetConvolution32f.cpp
+++ b/src/Simd/SimdBaseSynetConvolution32f.cpp
@@ -23,7 +23,6 @@
*/
#include "Simd/SimdSynetConvolution32f.h"
#include "Simd/SimdSynetConvolution32fCommon.h"
-#include "Simd/SimdSynetConvolution32fBf16.h"
#include "Simd/SimdSynet.h"
#include "Simd/SimdBase.h"
#include "Simd/SimdCpu.h"
@@ -1768,17 +1767,13 @@ namespace Simd
//#define SIMD_BASE_ONLY_GEMM_NN
- void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility)
+ void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv)
{
- ConvParam param(batch, conv, compatibility);
+ ConvParam param(batch, conv, SimdSynetCompatibilityDefault);
if (!param.Valid(SimdTensorData32f))
return NULL;
- else if (Bf16Soft(compatibility))
- {
- return new SynetConvolution32fBf16Gemm(param);
- }
#if !defined(SIMD_BASE_ONLY_GEMM_NN)
- else if (SynetConvolution32fDepthwiseDotProduct::Preferable(param))
+ if (SynetConvolution32fDepthwiseDotProduct::Preferable(param))
return new SynetConvolution32fDepthwiseDotProduct(param);
else if (SynetConvolution32fWinograd::Preferable(param))
return new SynetConvolution32fWinograd(param);
diff --git a/src/Simd/SimdBaseSynetConvolution32fBf16.cpp b/src/Simd/SimdBaseSynetConvolution32fBf16.cpp
deleted file mode 100644
index ceb3d031e0..0000000000
--- a/src/Simd/SimdBaseSynetConvolution32fBf16.cpp
+++ /dev/null
@@ -1,443 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2024 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdSynetConvolution32fBf16.h"
-#include "Simd/SimdSynetConvolution32fCommon.h"
-#include "Simd/SimdSynet.h"
-#include "Simd/SimdBFloat16.h"
-#include "Simd/SimdBase.h"
-#include "Simd/SimdCpu.h"
-
-namespace Simd
-{
-#if defined(SIMD_SYNET_ENABLE)
- namespace Base
- {
- SynetConvolution32fBf16Gemm::SynetConvolution32fBf16Gemm(const ConvParam & p)
- : SynetConvolution32f(p)
- {
- if (p.trans)
- {
- _M = p.dstH * p.dstW;
- _N = p.dstC / p.group;
- _K = p.srcC * p.kernelY * p.kernelX / p.group;
- _ldS = _K;
- _ldW = p.dstC;
- _ldD = p.dstC;
- _grW = _N;
- _grS = _K * _M;
- _grD = _N;
- _weight.Resize(_K * _N);
- }
- else
- {
- _M = p.dstC / p.group;
- _N = p.dstH * p.dstW;
- _K = p.srcC * p.kernelY * p.kernelX / p.group;
- _ldW = _K;
- _ldS = _N;
- _ldD = _N;
- _grW = _M * _K;
- _grS = _K * _N;
- _grD = _M * _N;
- _weight.Resize(_K * _M);
- }
- _batch = p.batch;
- _sizeS = p.srcC * p.srcH * p.srcW;
- _sizeB = p.srcC * p.kernelY * p.kernelX * p.dstH * p.dstW;
- _sizeD = p.dstC * p.dstH * p.dstW;
- }
-
- size_t SynetConvolution32fBf16Gemm::ExternalBufferSize() const
- {
- return _sizeB;
- };
-
- void SynetConvolution32fBf16Gemm::SetParams(const float * weight, SimdBool * internal, const float * bias, const float * params)
- {
- Simd::SynetConvolution32f::SetParams(weight, internal, bias, params);
- Float32ToBFloat16(weight, _weight.size, _weight.data);
- if (internal)
- *internal = SimdTrue;
- }
-
- void SynetConvolution32fBf16Gemm::Forward(const float * src, float * buf_, float * dst)
- {
- const ConvParam & p = _param;
- uint16_t * buf = (uint16_t*)Buffer(buf_);
- const uint16_t* wgt = _weight.data;
- for (size_t b = 0; b < _batch; ++b)
- {
- if (_param.trans)
- {
- ImgToRow(src, buf);
- for (size_t g = 0; g < p.group; ++g)
- GemmNN(_M, _N, _K, buf + _grS * g, _ldS, wgt + _grW * g, _ldW, dst + _grD * g, _ldD);
- }
- else
- {
- ImgToCol(src, buf);
- for (size_t g = 0; g < p.group; ++g)
- GemmNN(_M, _N, _K, wgt + _grW * g, _ldW, buf + _grS * g, _ldS, dst + _grD * g, _ldD);
- }
- ConvolutionBiasAndActivation(_bias, p.dstC, p.dstH * p.dstW, p.activation, _params, p.trans, dst);
- src += _sizeS;
- dst += _sizeD;
- }
- }
-
- void SynetConvolution32fBf16Gemm::ImgToCol(const float* src, uint16_t* dst)
- {
- const ConvParam& p = _param;
- assert(!p.trans);
- size_t srcSize = p.srcW * p.srcH;
- for (size_t c = 0; c < p.srcC; ++c)
- {
- for (size_t ky = 0; ky < p.kernelY; ky++)
- {
- for (size_t kx = 0; kx < p.kernelX; kx++)
- {
- size_t sy = ky * p.dilationY - p.padY;
- for (size_t dy = 0; dy < p.dstH; ++dy)
- {
- if (sy < p.srcH)
- {
- size_t sx = kx * p.dilationX - p.padX;
- for (size_t dx = 0; dx < p.dstW; ++dx)
- {
- if (sx < p.srcW)
- *(dst++) = Float32ToBFloat16(src[sy * p.srcW + sx]);
- else
- *(dst++) = 0;
- sx += p.strideX;
- }
- }
- else
- {
- for (size_t dx = 0; dx < p.dstW; ++dx)
- *(dst++) = 0;
- }
- sy += p.strideY;
- }
- }
- }
- src += srcSize;
- }
- }
-
- void SynetConvolution32fBf16Gemm::ImgToRow(const float* src, uint16_t* dst)
- {
- const ConvParam& p = _param;
- assert(p.trans);
- size_t size = p.srcC / p.group;
- for (size_t g = 0; g < p.group; ++g)
- {
- for (size_t dy = 0; dy < p.dstH; ++dy)
- {
- for (size_t dx = 0; dx < p.dstW; ++dx)
- {
- for (size_t ky = 0; ky < p.kernelY; ky++)
- {
- size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
- if (sy < p.srcH)
- {
- for (size_t kx = 0; kx < p.kernelX; kx++)
- {
- size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
- if (sx < p.srcW)
- {
- Float32ToBFloat16(src + (sy * p.srcW + sx) * p.srcC, size, dst);
- dst += size;
- }
- else
- {
- memset(dst, 0, size * sizeof(uint16_t));
- dst += size;
- }
- }
- }
- else
- {
- memset(dst, 0, p.kernelX * size * sizeof(uint16_t));
- dst += p.kernelX * size;
- }
- }
- }
- }
- src += size;
- }
- }
-
- void SynetConvolution32fBf16Gemm::GemmNN(size_t M, size_t N, size_t K, const uint16_t* A, size_t lda, const uint16_t* B, size_t ldb, float* C, size_t ldc)
- {
- for (size_t i = 0; i < M; ++i)
- {
- float* pC = C + i * ldc;
- for (size_t j = 0; j < N; ++j)
- pC[j] = 0.0f;
- for (size_t k = 0; k < K; ++k)
- {
- const uint16_t* pB = B + k * ldb;
- float a = BFloat16ToFloat32(A[i * lda + k]);
- for (size_t j = 0; j < N; ++j)
- pC[j] += a * BFloat16ToFloat32(pB[j]);
- }
- }
- }
-
- //-------------------------------------------------------------------------------------------------
-
- SynetConvolution32fBf16Nhwc::SynetConvolution32fBf16Nhwc(const ConvParam& p)
- : SynetConvolution32f(p)
- {
- }
-
- size_t SynetConvolution32fBf16Nhwc::InternalBufferSize() const
- {
- return _buffer.size + _weight.size / 2 + _bias.size + _params.size;
- }
-
- void SynetConvolution32fBf16Nhwc::SetBias(const float* bias, size_t align)
- {
- const ConvParam& p = _param;
- _bias.Resize(AlignHiAny(p.dstC, align), true);
- if (bias)
- memcpy(_bias.data, bias, p.dstC * sizeof(float));
- }
-
- void SynetConvolution32fBf16Nhwc::SetParams(const float* params, size_t align)
- {
- const ConvParam& p = _param;
- if (p.activation == SimdConvolutionActivationLeakyRelu || p.activation == SimdConvolutionActivationPrelu)
- _params.Resize(AlignHiAny(p.dstC, align), true);
- else
- _params.Resize(2, true);
- switch (p.activation)
- {
- case SimdConvolutionActivationIdentity:
- _params.data[0] = -FLT_MAX;
- _params.data[1] = FLT_MAX;
- break;
- case SimdConvolutionActivationRelu:
- _params.data[0] = 0;
- _params.data[1] = FLT_MAX;
- break;
- case SimdConvolutionActivationLeakyRelu:
- for (size_t d = 0; d < p.dstC; ++d)
- _params.data[d] = params[0];
- break;
- case SimdConvolutionActivationRestrictRange:
- _params.data[0] = params[0];
- _params.data[1] = params[1];
- break;
- case SimdConvolutionActivationPrelu:
- for (size_t d = 0; d < p.dstC; ++d)
- _params.data[d] = params[d];
- break;
- case SimdConvolutionActivationElu:
- _params.data[0] = params[0];
- break;
- case SimdConvolutionActivationHswish:
- _params.data[0] = params[0];
- _params.data[1] = params[1];
- break;
- case SimdConvolutionActivationMish:
- _params.data[0] = params[0];
- break;
- case SimdConvolutionActivationHardSigmoid:
- _params.data[0] = params[0];
- _params.data[1] = params[1];
- break;
- case SimdConvolutionActivationSwish:
- _params.data[0] = params[0];
- break;
- case SimdConvolutionActivationGelu:
- break;
- default:
- assert(0);
- }
- }
-
- //-------------------------------------------------------------------------------------------------
-
- SynetConvolution32fBf16NhwcGemm::SynetConvolution32fBf16NhwcGemm(const ConvParam& p)
- : SynetConvolution32fBf16Nhwc(p)
- {
- _convert = 0;
- _convolutions[0] = 0;
- _convolutions[1] = 0;
- }
-
- String SynetConvolution32fBf16NhwcGemm::Desc() const
- {
- std::stringstream desc;
- desc << Ext() << "::Bf16NhwcGemm";
- if (_alg.batch > 1)
- desc << "-" << _alg.batch;
- return desc.str();
- }
-
- void SynetConvolution32fBf16NhwcGemm::SetAlgParam(size_t microD, size_t microM, size_t microK, size_t L1, size_t L2, size_t L3)
- {
- const ConvParam& p = _param;
- AlgParam& a = _alg;
-
- a.M = p.dstW * p.dstH;
- a.K = p.srcC * p.kernelY * p.kernelX;
- a.microD = microD;
- a.microM = microM;
- a.microK = microK;
- a.bufD = AlignHiAny(p.dstC, a.microD);
- a.bufK = AlignHi(a.K, a.microK);
- a.macroK = Simd::RestrictRange(AlignLo(L1 / a.microD / 2, a.microK), a.microK, a.bufK);
- a.batch = 1;
- size_t bufSize = a.M * a.bufK * 2;
- if (bufSize * 2 <= L2 && p.batch > 1)
- {
- for (size_t batch = 1; batch <= p.batch; ++batch)
- if (p.batch % batch == 0 && batch * bufSize <= L2)
- a.batch = batch;
- }
- a.bufM = a.batch * a.M;
- a.macroH = Simd::RestrictRange(L2 / a.macroK / p.dstW / 2, size_t(1), p.dstH * a.batch);
- a.macroD = Simd::RestrictRange(AlignLoAny(L3 / a.macroK / 2, a.microD), a.microD, a.bufD);
- }
-
- size_t SynetConvolution32fBf16NhwcGemm::ExternalBufferSize() const
- {
- const AlgParam& a = _alg;
- return (a.bufM + 1) * a.bufK / 2;
- }
-
- void SynetConvolution32fBf16NhwcGemm::SetParams(const float* weight, SimdBool* internal, const float* bias, const float* params)
- {
- SetWeight(weight);
- if (internal)
- *internal = SimdTrue;
- SynetConvolution32fBf16Nhwc::SetBias(bias, _alg.microD);
- SynetConvolution32fBf16Nhwc::SetParams(params, _alg.microD);
- }
-
- void SynetConvolution32fBf16NhwcGemm::SetWeight(const float* weight)
- {
- const ConvParam& p = _param;
- const AlgParam& a = _alg;
- Array16u buffer(a.bufD * a.bufK, true);
- uint16_t* buf = buffer.data;
- for (size_t k = 0; k < a.K; k += 2)
- {
- for (size_t d = 0; d < p.dstC; ++d)
- {
- *(buf++) = Float32ToBFloat16(weight[d]);
- *(buf++) = k + 1 < a.K ? Float32ToBFloat16(weight[d + p.dstC]) : 0;
- }
- buf += 2 * (a.bufD - p.dstC);
- weight += 2 * p.dstC;
- }
- _weight.Resize(a.bufK * a.bufD, true);
- size_t bufK = a.bufK / 2, macK = a.macroK / 2, bufD = a.bufD * 2, macD = a.macroD * 2, micD = a.microD * 2;
- const uint16_t * src = buffer.data;
- uint16_t* dst = _weight.data;
- for (size_t mad = 0; mad < bufD; mad += macD)
- {
- size_t macroD = Simd::Min(bufD, mad + macD) - mad;
- for (size_t mak = 0; mak < bufK; mak += macK)
- {
- size_t macroK = Simd::Min(bufK, mak + macK) - mak;
- for (size_t mid = 0; mid < macroD; mid += micD)
- {
- for (size_t k = 0; k < macroK; ++k)
- {
- memcpy(dst, src + (mak + k) * bufD + mad + mid, micD * 2);
- dst += micD;
- }
- }
- }
- }
- }
-
- void SynetConvolution32fBf16NhwcGemm::Forward(const float* src, float* buf, float* dst)
- {
- const ConvParam& p = _param;
- const AlgParam& a = _alg;
- buf = Buffer(buf);
- for (size_t b = 0; b < p.batch; b += a.batch)
- {
- Forward(src, (uint16_t*)buf, dst);
- src += p.srcH * p.srcW * p.srcC * a.batch;
- dst += p.dstH * p.dstW * p.dstC * a.batch;
- }
- }
-
- void SynetConvolution32fBf16NhwcGemm::Forward(const float* src, uint16_t* buf, float* dst)
- {
- const ConvParam& p = _param;
- const AlgParam& a = _alg;
- const uint16_t* weight = _weight.data;
- const float* bias = _bias.data, * params = _params.data;
- size_t dstH = p.dstH * a.batch;
- for (size_t dc = 0; dc < p.dstC; dc += a.macroD)
- {
- size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc;
- for (size_t mak = 0; mak < a.K; mak += a.macroK)
- {
- size_t macroK = Simd::Min(a.bufK, mak + a.macroK) - mak;
- for (size_t yBeg = 0; yBeg < dstH;)
- {
- size_t yEnd = Simd::Min(yBeg + a.macroH, dstH);
- size_t offs = a.macroK < a.bufK ? mak * a.bufM + yBeg * p.dstW * macroK : 0;
- if (dc == 0 && mak == 0)
- {
- if (a.batch > 1)
- {
- size_t dS = p.srcH * p.srcW * p.srcC;
- for (size_t b = 0; b < a.batch; ++b)
- _convert(src + b * dS, p, a, b, 0, p.dstH, buf);
- }
- else
- _convert(src, p, a, 0, yBeg, yEnd, buf);
- }
- if (mak + macroK == a.bufK)
- _convolutions[TermLast](buf + offs, p, macroD, yEnd - yBeg, macroK, macroK == a.bufK ? 1 : 0,
- weight, bias, params, dst + yBeg * p.dstW * p.dstC);
- else
- _convolutions[TermInterim](buf + offs, p, macroD, yEnd - yBeg, macroK, mak == 0 ? 1 : 0,
- weight, bias, params, dst + yBeg * p.dstW * p.dstC);
- yBeg = yEnd;
- }
- weight += AlignHi(macroK, a.microK) * AlignHiAny(macroD, a.microD);
- }
- bias += macroD;
- if (p.activation == ::SimdConvolutionActivationPrelu)
- params += macroD;
- dst += macroD;
- }
- }
-
- bool SynetConvolution32fBf16NhwcGemm::Preferable(const ConvParam& p)
- {
- return p.trans != 0 && p.group == 1;
- }
- }
-#endif
-}
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index cdd44e0795..314d82e693 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -4866,14 +4866,14 @@ SIMD_API void SimdSynetConvert8uTo32f(const uint8_t* src, size_t batch, size_t c
#endif
}
-SIMD_API void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * params, SimdSynetCompatibilityType compatibility)
+SIMD_API void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * params)
{
SIMD_EMPTY();
#if defined(SIMD_SYNET_ENABLE)
- typedef void* (*SimdSynetConvolution32fInitPtr) (size_t batch, const SimdConvolutionParameters * params, SimdSynetCompatibilityType compatibility);
- const static SimdSynetConvolution32fInitPtr simdSynetConvolution32fInit = SIMD_FUNC5(SynetConvolution32fInit, SIMD_AMXBF16_FUNC, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC);
+ typedef void* (*SimdSynetConvolution32fInitPtr) (size_t batch, const SimdConvolutionParameters * params);
+ const static SimdSynetConvolution32fInitPtr simdSynetConvolution32fInit = SIMD_FUNC4(SynetConvolution32fInit, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC);
- return simdSynetConvolution32fInit(batch, params, compatibility);
+ return simdSynetConvolution32fInit(batch, params);
#else
assert(0);
return 0;
diff --git a/src/Simd/SimdLib.h b/src/Simd/SimdLib.h
index e887353f02..b6930002d3 100644
--- a/src/Simd/SimdLib.h
+++ b/src/Simd/SimdLib.h
@@ -269,7 +269,7 @@ typedef enum
SimdCpuInfoAvx2, /*!< Availability of AVX, FMA, AVX2 (x86). */
SimdCpuInfoAvx512bw, /*!< Availability of AVX-512F, AVX-512BW (x86). */
SimdCpuInfoAvx512vnni, /*!< Availability of AVX-512VNNI (x86). */
- SimdCpuInfoAmxBf16, /*!< Availability of AMX-BF16, AMX-INT8 (x86). */
+ SimdCpuInfoAmxBf16, /*!< Availability of AVX-512VBMI, AVX-512FP16, AMX-BF16, AMX-INT8 (x86). */
SimdCpuInfoNeon, /*!< Availability of NEON (ARM). */
SimdCpuInfoCurrentFrequency, /*!< Gets CPU current frequency (for current CPU core). */
} SimdCpuInfoType;
@@ -6090,18 +6090,17 @@ extern "C"
/*! @ingroup synet_convolution_fp32
- \fn void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
+ \fn void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv);
\short Initilizes FP32 convolution algorithm.
\param [in] batch - a batch size.
\param [in] conv - a pointer to convolution parameters.
- \param [in] compatibility - a flags of calculation compatibility.
\return a pointer to FP32 convolution context. On error it returns NULL. It must be released with using of function ::SimdRelease.
This pointer is used in functions ::SimdSynetConvolution32fExternalBufferSize, ::SimdSynetConvolution32fInternalBufferSize,
::SimdSynetConvolution32fInfo, ::SimdSynetConvolution32fSetParams and ::SimdSynetConvolution32fForward.
*/
- SIMD_API void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
+ SIMD_API void * SimdSynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv);
/*! @ingroup synet_convolution_fp32
diff --git a/src/Simd/SimdLib.hpp b/src/Simd/SimdLib.hpp
index 72377a50a9..671f33c536 100644
--- a/src/Simd/SimdLib.hpp
+++ b/src/Simd/SimdLib.hpp
@@ -57,7 +57,7 @@ namespace Simd
os << ", L3: " << SimdCpuInfo(SimdCpuInfoCacheL3) / 1024 << " KB";
os << ", RAM: " << SimdCpuInfo(SimdCpuInfoRam) / 1024 / 1024 << " MB";
os << "; Available SIMD:";
- os << (SimdCpuInfo(SimdCpuInfoAmxBf16) ? " AMX-BF16 AMX-INT8 AVX-512BF16" : "");
+ os << (SimdCpuInfo(SimdCpuInfoAmxBf16) ? " AMX-BF16 AMX-INT8 AVX-512VBMI AVX-512FP16" : "");
os << (SimdCpuInfo(SimdCpuInfoAvx512vnni) ? " AVX-512VNNI" : "");
os << (SimdCpuInfo(SimdCpuInfoAvx512bw) ? " AVX-512BW AVX-512F" : "");
os << (SimdCpuInfo(SimdCpuInfoAvx2) ? " AVX2 FMA AVX" : "");
diff --git a/src/Simd/SimdNeonSynetConvolution32f.cpp b/src/Simd/SimdNeonSynetConvolution32f.cpp
index 91c18aac0d..d2c0375aa1 100644
--- a/src/Simd/SimdNeonSynetConvolution32f.cpp
+++ b/src/Simd/SimdNeonSynetConvolution32f.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "Simd/SimdSynetConvolution32f.h"
-#include "Simd/SimdSynetConvolution32fBf16.h"
#include "Simd/SimdExtract.h"
#include "Simd/SimdStore.h"
#include "Simd/SimdSynet.h"
@@ -749,16 +748,12 @@ namespace Simd
//---------------------------------------------------------------------
- void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility)
+ void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv)
{
- ConvParam param(batch, conv, compatibility);
+ ConvParam param(batch, conv, SimdSynetCompatibilityDefault);
if (!param.Valid(SimdTensorData32f))
return NULL;
- else if (Base::Bf16Soft(compatibility))
- {
- return new Base::SynetConvolution32fBf16Gemm(param);
- }
- else if (SynetConvolution32fDepthwiseDotProduct::Preferable(param))
+ if (SynetConvolution32fDepthwiseDotProduct::Preferable(param))
return new SynetConvolution32fDepthwiseDotProduct(param);
else if (SynetConvolution32fWinograd::Preferable(param))
return new SynetConvolution32fWinograd(param);
@@ -776,5 +771,5 @@ namespace Simd
return new SynetConvolution32fGemmNN(param);
}
}
-#endif// SIMD_NEON_ENABLE
+#endif
}
diff --git a/src/Simd/SimdSse41SynetConvolution32f.cpp b/src/Simd/SimdSse41SynetConvolution32f.cpp
index 30ea11900b..f7d2ba4d74 100644
--- a/src/Simd/SimdSse41SynetConvolution32f.cpp
+++ b/src/Simd/SimdSse41SynetConvolution32f.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "Simd/SimdSynetConvolution32f.h"
-#include "Simd/SimdSynetConvolution32fBf16.h"
#include "Simd/SimdSynet.h"
#include "Simd/SimdGemm.h"
#include "Simd/SimdExp.h"
@@ -700,19 +699,12 @@ namespace Simd
//-------------------------------------------------------------------------------------------------
- void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility)
+ void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv)
{
- ConvParam param(batch, conv, compatibility);
+ ConvParam param(batch, conv, SimdSynetCompatibilityDefault);
if (!param.Valid(SimdTensorData32f))
return NULL;
- else if (Base::Bf16Soft(compatibility))
- {
- if(Base::SynetConvolution32fBf16NhwcGemm::Preferable(param))
- return new SynetConvolution32fBf16NhwcGemm(param);
- else
- return new Base::SynetConvolution32fBf16Gemm(param);
- }
- else if (SynetConvolution32fDepthwiseDotProduct::Preferable(param))
+ if (SynetConvolution32fDepthwiseDotProduct::Preferable(param))
return new SynetConvolution32fDepthwiseDotProduct(param);
else if (SynetConvolution32fWinograd::Preferable(param))
return new SynetConvolution32fWinograd(param);
diff --git a/src/Simd/SimdSse41SynetConvolution32fBf16NhwcGemm.cpp b/src/Simd/SimdSse41SynetConvolution32fBf16NhwcGemm.cpp
deleted file mode 100644
index 6025c22c80..0000000000
--- a/src/Simd/SimdSse41SynetConvolution32fBf16NhwcGemm.cpp
+++ /dev/null
@@ -1,373 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2024 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdSynetConvolution32fBf16.h"
-#include "Simd/SimdSynetConvolution32fCommon.h"
-#include "Simd/SimdBFloat16.h"
-#include "Simd/SimdSynet.h"
-#include "Simd/SimdSse41.h"
-#include "Simd/SimdCpu.h"
-
-namespace Simd
-{
-#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)
- namespace Sse41
- {
- typedef Base::SynetConvolution32fBf16NhwcGemm::AlgParam AlgParam;
- typedef Base::SynetConvolution32fBf16NhwcGemm::ConvolutionPtr Convolution;
-
- //-----------------------------------------------------------------------------------------
-
- static void ConvertBf16NhwcGemm(const float* src, const ConvParam& p, const SynetConvolution32fBf16NhwcGemm::AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
- {
- size_t srcC8 = Simd::AlignLo(p.srcC, 8);
- size_t srcC4 = Simd::AlignLo(p.srcC, 4);
- uint16_t* buf = dst + a.bufM * a.bufK;
- size_t gap = a.bufK - a.K;
- for (size_t dy = yBeg, dr = (a.macroK < a.bufK ? dy * p.dstW : 0) + b * p.dstH * p.dstW; dy < yEnd; ++dy)
- {
- for (size_t dx = 0; dx < p.dstW; ++dx, ++dr)
- {
- uint16_t* row = a.macroK < a.bufK ? buf : dst + dr * a.bufK;
- for (size_t ky = 0, k = 0; ky < p.kernelY; ky++)
- {
- size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
- if (sy < p.srcH)
- {
- for (size_t kx = 0; kx < p.kernelX; kx++)
- {
- size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
- if (sx < p.srcW)
- {
- const float* ps = src + (sy * p.srcW + sx) * p.srcC;
- size_t sc = 0;
- for (; sc < srcC8; sc += 8)
- {
- __m128i d0 = Float32ToBFloat16(_mm_loadu_ps(ps + sc + 0));
- __m128i d1 = Float32ToBFloat16(_mm_loadu_ps(ps + sc + 4));
- _mm_storeu_si128((__m128i*)(row + sc), _mm_packus_epi32(d0, d1));
- }
- for (; sc < srcC4; sc += 4)
- {
- __m128i d0 = Float32ToBFloat16(_mm_loadu_ps(ps + sc + 0));
- _mm_storel_epi64((__m128i*)(row + sc), _mm_packus_epi32(d0, K_ZERO));
- }
- for (; sc < p.srcC; ++sc)
- row[sc] = Base::Float32ToBFloat16(ps[sc]);
- row += p.srcC;
- }
- else
- {
- memset(row, 0, p.srcC * 2);
- row += p.srcC;
- }
- }
- }
- else
- {
- memset(row, 0, p.kernelX * p.srcC * 2);
- row += p.kernelX * p.srcC;
- }
- }
- for (size_t g = 0; g < gap; ++g)
- *(row++) = 0;
- if (a.macroK < a.bufK)
- {
- for (size_t mak = 0; mak < a.bufK; mak += a.macroK)
- {
- size_t macroK = Simd::Min(a.bufK, mak + a.macroK) - mak;
- memcpy(dst + mak * a.bufM + dr * macroK, buf + mak, macroK * 2);
- }
- }
- }
- }
- }
-
- //-----------------------------------------------------------------------------------------
-
- template void ConvolutionBf16NhwcGemm_2xM(const uint16_t* src0,
- const ConvParam& p, size_t srcC, size_t dstC, int zero, const uint16_t* weight, const __m128* bias, const __m128* params, float* dst)
- {
- __m128 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, s0, w00, w01, w10, w11, m = _mm_castsi128_ps(Bf16::MASK);
- size_t dD = p.dstC;
- const uint16_t* src1 = src0 + 1 * srcC;
- const uint16_t* src2 = src0 + 2 * srcC;
- const uint16_t* src3 = src0 + 3 * srcC;
- const uint16_t* src4 = src0 + 4 * srcC;
- if (dstC > F)
- {
- if (zero)
- {
- if (M > 0) d00 = _mm_setzero_ps(), d01 = _mm_setzero_ps();
- if (M > 1) d10 = _mm_setzero_ps(), d11 = _mm_setzero_ps();
- if (M > 2) d20 = _mm_setzero_ps(), d21 = _mm_setzero_ps();
- if (M > 3) d30 = _mm_setzero_ps(), d31 = _mm_setzero_ps();
- if (M > 4) d40 = _mm_setzero_ps(), d41 = _mm_setzero_ps();
- }
- else
- {
- if (M > 0) d00 = _mm_loadu_ps(dst + 0 * dD + 0), d01 = _mm_loadu_ps(dst + 0 * dD + F);
- if (M > 1) d10 = _mm_loadu_ps(dst + 1 * dD + 0), d11 = _mm_loadu_ps(dst + 1 * dD + F);
- if (M > 2) d20 = _mm_loadu_ps(dst + 2 * dD + 0), d21 = _mm_loadu_ps(dst + 2 * dD + F);
- if (M > 3) d30 = _mm_loadu_ps(dst + 3 * dD + 0), d31 = _mm_loadu_ps(dst + 3 * dD + F);
- if (M > 4) d40 = _mm_loadu_ps(dst + 4 * dD + 0), d41 = _mm_loadu_ps(dst + 4 * dD + F);
- }
- for (size_t offs = 0; offs < srcC; offs += 2)
- {
- w01 = _mm_loadu_ps((float*)weight + 0);
- w00 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(w01), Base::Bf16::SHIFT));
- w01 = _mm_and_ps(w01, m);
- w11 = _mm_loadu_ps((float*)weight + F);
- w10 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(w11), Base::Bf16::SHIFT));
- w11 = _mm_and_ps(w11, m);
- if (M > 0)
- {
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src0 + offs - 1)), m);
- d00 = _mm_add_ps(_mm_mul_ps(s0, w00), d00);
- d01 = _mm_add_ps(_mm_mul_ps(s0, w10), d01);
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src0 + offs - 0)), m);
- d00 = _mm_add_ps(_mm_mul_ps(s0, w01), d00);
- d01 = _mm_add_ps(_mm_mul_ps(s0, w11), d01);
- }
- if (M > 1)
- {
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src1 + offs - 1)), m);
- d10 = _mm_add_ps(_mm_mul_ps(s0, w00), d10);
- d11 = _mm_add_ps(_mm_mul_ps(s0, w10), d11);
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src1 + offs - 0)), m);
- d10 = _mm_add_ps(_mm_mul_ps(s0, w01), d10);
- d11 = _mm_add_ps(_mm_mul_ps(s0, w11), d11);
- }
- if (M > 2)
- {
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src2 + offs - 1)), m);
- d20 = _mm_add_ps(_mm_mul_ps(s0, w00), d20);
- d21 = _mm_add_ps(_mm_mul_ps(s0, w10), d21);
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src2 + offs - 0)), m);
- d20 = _mm_add_ps(_mm_mul_ps(s0, w01), d20);
- d21 = _mm_add_ps(_mm_mul_ps(s0, w11), d21);
- }
- if (M > 3)
- {
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src3 + offs - 1)), m);
- d30 = _mm_add_ps(_mm_mul_ps(s0, w00), d30);
- d31 = _mm_add_ps(_mm_mul_ps(s0, w10), d31);
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src3 + offs - 0)), m);
- d30 = _mm_add_ps(_mm_mul_ps(s0, w01), d30);
- d31 = _mm_add_ps(_mm_mul_ps(s0, w11), d31);
- }
- if (M > 4)
- {
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src4 + offs - 1)), m);
- d40 = _mm_add_ps(_mm_mul_ps(s0, w00), d40);
- d41 = _mm_add_ps(_mm_mul_ps(s0, w10), d41);
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src4 + offs - 0)), m);
- d40 = _mm_add_ps(_mm_mul_ps(s0, w01), d40);
- d41 = _mm_add_ps(_mm_mul_ps(s0, w11), d41);
- }
- weight += QF;
- }
- if (dstC == DF)
- {
- if (M > 0) Save2(dst, d00, d01, bias, params), dst += dD;
- if (M > 1) Save2(dst, d10, d11, bias, params), dst += dD;
- if (M > 2) Save2(dst, d20, d21, bias, params), dst += dD;
- if (M > 3) Save2(dst, d30, d31, bias, params), dst += dD;
- if (M > 4) Save2(dst, d40, d41, bias, params), dst += dD;
- }
- else
- {
- dstC -= F;
- if (M > 0) Save2(dst, d00, d01, bias, params, dstC), dst += dD;
- if (M > 1) Save2(dst, d10, d11, bias, params, dstC), dst += dD;
- if (M > 2) Save2(dst, d20, d21, bias, params, dstC), dst += dD;
- if (M > 3) Save2(dst, d30, d31, bias, params, dstC), dst += dD;
- if (M > 4) Save2(dst, d40, d41, bias, params, dstC), dst += dD;
- }
- }
- else
- {
- if (zero)
- {
- if (M > 0) d00 = _mm_setzero_ps();
- if (M > 1) d10 = _mm_setzero_ps();
- if (M > 2) d20 = _mm_setzero_ps();
- if (M > 3) d30 = _mm_setzero_ps();
- if (M > 4) d40 = _mm_setzero_ps();
- }
- else
- {
- if (M > 0) d00 = _mm_loadu_ps(dst + 0 * dD + 0);
- if (M > 1) d10 = _mm_loadu_ps(dst + 1 * dD + 0);
- if (M > 2) d20 = _mm_loadu_ps(dst + 2 * dD + 0);
- if (M > 3) d30 = _mm_loadu_ps(dst + 3 * dD + 0);
- if (M > 4) d40 = _mm_loadu_ps(dst + 4 * dD + 0);
- }
- for (size_t offs = 0; offs < srcC; offs += 2)
- {
- w01 = _mm_loadu_ps((float*)weight + 0);
- w00 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(w01), Base::Bf16::SHIFT));
- w01 = _mm_and_ps(w01, m);
- if (M > 0)
- {
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src0 + offs - 1)), m);
- d00 = _mm_add_ps(_mm_mul_ps(s0, w00), d00);
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src0 + offs - 0)), m);
- d00 = _mm_add_ps(_mm_mul_ps(s0, w01), d00);
- }
- if (M > 1)
- {
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src1 + offs - 1)), m);
- d10 = _mm_add_ps(_mm_mul_ps(s0, w00), d10);
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src1 + offs - 0)), m);
- d10 = _mm_add_ps(_mm_mul_ps(s0, w01), d10);
- }
- if (M > 2)
- {
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src2 + offs - 1)), m);
- d20 = _mm_add_ps(_mm_mul_ps(s0, w00), d20);
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src2 + offs - 0)), m);
- d20 = _mm_add_ps(_mm_mul_ps(s0, w01), d20);
- }
- if (M > 3)
- {
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src3 + offs - 1)), m);
- d30 = _mm_add_ps(_mm_mul_ps(s0, w00), d30);
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src3 + offs - 0)), m);
- d30 = _mm_add_ps(_mm_mul_ps(s0, w01), d30);
- }
- if (M > 4)
- {
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src4 + offs - 1)), m);
- d40 = _mm_add_ps(_mm_mul_ps(s0, w00), d40);
- s0 = _mm_and_ps(_mm_set1_ps(*(float*)(src4 + offs - 0)), m);
- d40 = _mm_add_ps(_mm_mul_ps(s0, w01), d40);
- }
- weight += QF;
- }
- if (dstC == F)
- {
- if (M > 0) Save1(dst, d00, bias, params), dst += dD;
- if (M > 1) Save1(dst, d10, bias, params), dst += dD;
- if (M > 2) Save1(dst, d20, bias, params), dst += dD;
- if (M > 3) Save1(dst, d30, bias, params), dst += dD;
- if (M > 4) Save1(dst, d40, bias, params), dst += dD;
- }
- else
- {
- if (M > 0) Save1(dst, d00, bias, params, dstC), dst += dD;
- if (M > 1) Save1(dst, d10, bias, params, dstC), dst += dD;
- if (M > 2) Save1(dst, d20, bias, params, dstC), dst += dD;
- if (M > 3) Save1(dst, d30, bias, params, dstC), dst += dD;
- if (M > 4) Save1(dst, d40, bias, params, dstC), dst += dD;
- }
- }
- }
-
- typedef void(*ConvolutionBf16NhwcGemm_2xM_Ptr)(const uint16_t* src0, const ConvParam& p, size_t srcC,
- size_t dstC, int zero, const uint16_t* weight, const __m128* bias, const __m128* params, float* dst);
-
- template ConvolutionBf16NhwcGemm_2xM_Ptr GetConvolutionBf16NhwcGemm_2xM(size_t M)
- {
- switch (M)
- {
- case 0: return NULL;
- case 1: return ConvolutionBf16NhwcGemm_2xM;
- case 2: return ConvolutionBf16NhwcGemm_2xM;
- case 3: return ConvolutionBf16NhwcGemm_2xM;
- case 4: return ConvolutionBf16NhwcGemm_2xM;
- case 5: return ConvolutionBf16NhwcGemm_2xM;
- }
- assert(0);
- return NULL;
- }
-
- template void ConvolutionBf16NhwcGemm_2(const uint16_t* src, const ConvParam& p,
- size_t dstC, size_t dstH, size_t srcC, int zero, const uint16_t* weight, const float* bias, const float* params, float* dst)
- {
- size_t n1 = dstH * p.dstW, n = 5;
- size_t nn = AlignLoAny(n1, n), m = n1 - nn, dW = AlignHi(srcC, 2) * DF;
- ConvolutionBf16NhwcGemm_2xM_Ptr convolution_2xN = GetConvolutionBf16NhwcGemm_2xM(n);
- ConvolutionBf16NhwcGemm_2xM_Ptr convolution_2xM = GetConvolutionBf16NhwcGemm_2xM(m);
-
- __m128 _params[2], _bias[2];
- _params[0] = _mm_set1_ps(params[0]);
- if (type == SimdConvolutionActivationRestrictRange ||
- type == SimdConvolutionActivationHswish ||
- type == SimdConvolutionActivationHardSigmoid)
- _params[1] = _mm_set1_ps(params[1]);
-
- for (size_t dc = 0; dc < dstC; dc += DF)
- {
- size_t dC = Simd::Min(DF, dstC - dc);
- _bias[0] = _mm_loadu_ps(bias + dc + 0);
- _bias[1] = _mm_loadu_ps(bias + dc + F);
- if (type == ::SimdConvolutionActivationPrelu)
- {
- _params[0] = _mm_loadu_ps(params + dc + 0);
- _params[1] = _mm_loadu_ps(params + dc + F);
- }
- float* d = dst;
- const uint16_t* s = src;
- size_t i = 0;
- for (; i < nn; i += n, s += n * srcC, d += n * p.dstC)
- convolution_2xN(s, p, srcC, dC, zero, weight, _bias, _params, d);
- for (; i < n1; i += m, s += m * srcC, d += m * p.dstC)
- convolution_2xM(s, p, srcC, dC, zero, weight, _bias, _params, d);
- weight += dW;
- dst += DF;
- }
- }
-
- //-----------------------------------------------------------------------------------------
-
- template SIMD_INLINE void Set(const ConvParam& p, const AlgParam & a, Convolution* convolutions)
- {
- convolutions[TermLast] = ConvolutionBf16NhwcGemm_2;
- convolutions[TermInterim] = ConvolutionBf16NhwcGemm_2;
- }
-
- SynetConvolution32fBf16NhwcGemm::SynetConvolution32fBf16NhwcGemm(const ConvParam & p)
- : Base::SynetConvolution32fBf16NhwcGemm(p)
- {
- SetAlgParam(F * 2, 5, 2, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3());
- _convert = ConvertBf16NhwcGemm;
- switch (p.activation)
- {
- case SimdConvolutionActivationIdentity: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationRelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationLeakyRelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationRestrictRange: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationPrelu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationElu: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationHswish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationMish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationHardSigmoid: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationSwish: Set(p, _alg, _convolutions); break;
- case SimdConvolutionActivationGelu: Set(p, _alg, _convolutions); break;
- default: assert(0);
- }
- }
- }
-#endif
-}
diff --git a/src/Simd/SimdSynetConvolution32f.h b/src/Simd/SimdSynetConvolution32f.h
index cf658692df..6e64aed59f 100644
--- a/src/Simd/SimdSynetConvolution32f.h
+++ b/src/Simd/SimdSynetConvolution32f.h
@@ -383,7 +383,7 @@ namespace Simd
//-------------------------------------------------------------------------------------------------
- void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
+ void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv);
}
#ifdef SIMD_SSE41_ENABLE
@@ -466,7 +466,7 @@ namespace Simd
//-------------------------------------------------------------------------------------------------
- void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
+ void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv);
}
#endif
@@ -553,7 +553,7 @@ namespace Simd
//-----------------------------------------------------------------------------------------
- void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
+ void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv);
}
#endif
@@ -632,14 +632,7 @@ namespace Simd
//-----------------------------------------------------------------------------------------
- void* SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters* conv, SimdSynetCompatibilityType compatibility);
- }
-#endif
-
-#if (defined(SIMD_AMXBF16_ENABLE) || (defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_AMX_EMULATE)))
- namespace AmxBf16
- {
- void* SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters* conv, SimdSynetCompatibilityType compatibility);
+ void* SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters* conv);
}
#endif
@@ -723,7 +716,7 @@ namespace Simd
//-----------------------------------------------------------------------------------------
- void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
+ void * SynetConvolution32fInit(size_t batch, const SimdConvolutionParameters * conv);
}
#endif
}
diff --git a/src/Simd/SimdSynetConvolution32fBf16.h b/src/Simd/SimdSynetConvolution32fBf16.h
deleted file mode 100644
index 7935188a4a..0000000000
--- a/src/Simd/SimdSynetConvolution32fBf16.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2024 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdSynetConvolution32fBf16_h__
-#define __SimdSynetConvolution32fBf16_h__
-
-#include "Simd/SimdSynetConvolution32f.h"
-
-namespace Simd
-{
- namespace Base
- {
- class SynetConvolution32fBf16Gemm : public SynetConvolution32f
- {
- public:
- SynetConvolution32fBf16Gemm(const ConvParam& p);
- virtual String Ext() const { return "Base"; }
- virtual String Desc() const { return Ext() + "::Bf16Gemm"; }
- virtual size_t ExternalBufferSize() const;
- virtual void SetParams(const float* weight, SimdBool* internal, const float* bias, const float* params);
- virtual void Forward(const float* src, float* buf, float* dst);
-
- protected:
- void ImgToCol(const float* src, uint16_t* dst);
- void ImgToRow(const float* src, uint16_t* dst);
- void GemmNN(size_t M, size_t N, size_t K, const uint16_t* A, size_t lda, const uint16_t* B, size_t ldb, float* C, size_t ldc);
-
- Array16u _weight;
- size_t _M, _N, _K, _ldW, _ldS, _ldD, _grW, _grS, _grD, _batch, _sizeS, _sizeB, _sizeD;
- };
-
- //-------------------------------------------------------------------------------------------------
-
- class SynetConvolution32fBf16Nhwc : public SynetConvolution32f
- {
- public:
- SynetConvolution32fBf16Nhwc(const ConvParam& p);
- virtual size_t InternalBufferSize() const;
-
- protected:
- void SetBias(const float* bias, size_t align);
- void SetParams(const float* params, size_t align);
-
- Array16u _weight;
- Array32f _bias, _params;
- };
-
- //-------------------------------------------------------------------------------------------------
-
- class SynetConvolution32fBf16NhwcGemm : public SynetConvolution32fBf16Nhwc
- {
- public:
- SynetConvolution32fBf16NhwcGemm(const ConvParam& p);
- virtual String Ext() const { return "Base"; }
- virtual String Desc() const;
- virtual size_t ExternalBufferSize() const;
- virtual void SetParams(const float* weight, SimdBool* internal, const float* bias, const float* params);
- virtual void Forward(const float* src, float* buf, float* dst);
-
- static bool Preferable(const ConvParam& p);
-
- struct AlgParam
- {
- size_t batch, K, M;
- size_t microD, microM, microK;
- size_t macroD, macroH, macroK;
- size_t bufD, bufM, bufK;
- };
-
- typedef void(*ConvertPtr)(const float* src, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst);
-
- typedef void(*ConvolutionPtr)(const uint16_t* src, const ConvParam& p, size_t dstC, size_t dstH,
- size_t srcC, int zero, const uint16_t* weight, const float* bias, const float* params, float* dst);
-
- protected:
- void SetAlgParam(size_t microD, size_t microM, size_t microK, size_t L1, size_t L2, size_t L3);
- void SetWeight(const float* weight);
- void Forward(const float* src, uint16_t* buf, float* dst);
-
- AlgParam _alg;
- ConvertPtr _convert;
- ConvolutionPtr _convolutions[2];
- };
- }
-
-#ifdef SIMD_SSE41_ENABLE
- namespace Sse41
- {
- class SynetConvolution32fBf16NhwcGemm : public Base::SynetConvolution32fBf16NhwcGemm
- {
- public:
- SynetConvolution32fBf16NhwcGemm(const ConvParam& p);
-
- virtual String Ext() const { return "Sse41"; }
- };
- }
-#endif
-
-#ifdef SIMD_AVX2_ENABLE
- namespace Avx2
- {
- class SynetConvolution32fBf16NhwcGemm : public Sse41::SynetConvolution32fBf16NhwcGemm
- {
- public:
- SynetConvolution32fBf16NhwcGemm(const ConvParam& p);
-
- virtual String Ext() const { return "Avx2"; }
- };
- }
-#endif
-
-#ifdef SIMD_AVX512BW_ENABLE
- namespace Avx512bw
- {
- class SynetConvolution32fBf16NhwcGemm : public Avx2::SynetConvolution32fBf16NhwcGemm
- {
- public:
- SynetConvolution32fBf16NhwcGemm(const ConvParam& p);
-
- virtual String Ext() const { return "Avx512bw"; }
- };
-
- //-----------------------------------------------------------------------------------------
-
- void ConvolutionBf16NhwcConvertConv(const float* src, const ConvParam& p, size_t yBeg, size_t yEnd, size_t srcC, size_t micC, uint16_t* dst);
-
- void ConvolutionBf16NhwcConvertGemm(const float* src, const ConvParam& p, size_t yBeg, size_t yEnd, size_t srcC, size_t micC, uint16_t* dst);
- }
-#endif
-
-#if (defined(SIMD_AMXBF16_ENABLE) || (defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_AMX_EMULATE)))
- namespace AmxBf16
- {
- class SynetConvolution32fBf16NhwcGemm : public Avx512bw::SynetConvolution32fBf16NhwcGemm
- {
- public:
- SynetConvolution32fBf16NhwcGemm(const ConvParam& p);
-
- virtual String Ext() const { return "AmxBf16"; }
- };
- }
-#endif
-
-#ifdef SIMD_NEON_ENABLE
- namespace Neon
- {
- }
-#endif
-}
-
-#endif
diff --git a/src/Test/TestSynetConvolution16b.cpp b/src/Test/TestSynetConvolution16b.cpp
index 553066be7c..2db673530a 100644
--- a/src/Test/TestSynetConvolution16b.cpp
+++ b/src/Test/TestSynetConvolution16b.cpp
@@ -159,7 +159,7 @@ namespace Test
c.srcT = SimdTensorData32f;
c.dstT = SimdTensorData32f;
- void* context3 = SimdSynetConvolution32fInit(p.batch, &c, SimdSynetCompatibilityDefault);
+ void* context3 = SimdSynetConvolution32fInit(p.batch, &c);
Tensor32f dst32f3(p.DstShape(), p.conv.dstF), buf32f(Shp( ::SimdSynetConvolution32fExternalBufferSize(context3)));
diff --git a/src/Test/TestSynetConvolution32f.cpp b/src/Test/TestSynetConvolution32f.cpp
index e482af6fa8..e3d0581dde 100644
--- a/src/Test/TestSynetConvolution32f.cpp
+++ b/src/Test/TestSynetConvolution32f.cpp
@@ -39,16 +39,16 @@ namespace Test
struct FuncC
{
- typedef void*(*FuncPtr)(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility);
+ typedef void*(*FuncPtr)(size_t batch, const SimdConvolutionParameters * conv);
FuncPtr func;
String desc;
FuncC(const FuncPtr & f, const String & d) : func(f), desc(d) {}
- void Update(const Param & p, SimdSynetCompatibilityType c)
+ void Update(const Param & p)
{
- desc = desc + p.Decription(Simd::Base::Bf16Soft(c) ? "-bf16" : "-fp32");
+ desc = desc + p.Decription();
}
void Call(void * context, const Tensor32f & src, Tensor32f & buf, Tensor32f & dst) const
@@ -62,12 +62,12 @@ namespace Test
#define FUNC_C(function) \
FuncC(function, std::string(#function))
- bool SynetConvolution32fForwardAutoTest(float eps, const Param & p, SimdSynetCompatibilityType comp, FuncC f1, FuncC f2)
+ bool SynetConvolution32fForwardAutoTest(float eps, const Param & p, FuncC f1, FuncC f2)
{
bool result = true;
- f1.Update(p, comp);
- f2.Update(p, comp);
+ f1.Update(p);
+ f2.Update(p);
TEST_LOG_SS(Info, "Test [" << f1.desc << " & " << f2.desc << "].");
@@ -111,8 +111,8 @@ namespace Test
::SimdFill32f(dst1.Data(), dst1.Size(), params.Data() + 0);
::SimdFill32f(dst2.Data(), dst2.Size(), params.Data() + 1);
- void * context1 = f1.func(p.batch, &p.conv, comp);
- void * context2 = f2.func(p.batch, &p.conv, comp);
+ void * context1 = f1.func(p.batch, &p.conv);
+ void * context2 = f2.func(p.batch, &p.conv);
buf1.Extend({ ::SimdSynetConvolution32fExternalBufferSize(context1) });
buf2.Extend({ ::SimdSynetConvolution32fExternalBufferSize(context2) });
@@ -137,7 +137,7 @@ namespace Test
return result;
}
- bool SynetConvolution32fForwardAutoTest(float eps, SimdConvolutionActivationType a, SimdBool t, SimdSynetCompatibilityType c, const FuncC& f1, const FuncC& f2)
+ bool SynetConvolution32fForwardAutoTest(float eps, SimdConvolutionActivationType a, SimdBool t, const FuncC& f1, const FuncC& f2)
{
bool result = true;
@@ -146,465 +146,464 @@ namespace Test
aLr = SimdConvolutionActivationLeakyRelu, aRr = SimdConvolutionActivationRestrictRange, aPr = SimdConvolutionActivationPrelu,
aEl = SimdConvolutionActivationElu, aHs = SimdConvolutionActivationHswish, aMi = SimdConvolutionActivationMish,
aHi = SimdConvolutionActivationHardSigmoid, aSw = SimdConvolutionActivationSwish, aGe = SimdConvolutionActivationGelu;
- const SimdSynetCompatibilityType fp32 = SimdSynetCompatibilityDefault, bf16 = SimdSynetCompatibility16bfSoft;
const SimdBool tF = SimdFalse, tT = SimdTrue;
#ifdef NDEBUG
#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 112, 96, 32, _3, _1, _3, Size(1, 0), Size(1, 0), 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 114, 96, 32, _3, _1, _3, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 19, 16, 64, _3, _1, _3, _1, _1, 64, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 19, 16, 64, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 7, 7, 128, _7, _1, _1, _0, _0, 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 56, 56, 32, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 112, 112, 16, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 7, 6, 128, _3, _1, _2, Size(0, 1), Size(1, 1), 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 4, 3, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 38, 32, 32, _3, _1, _2, _0, _1, 32, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 112, 96, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 112, 96, 32, _3, _1, _3, Size(1, 0), Size(1, 0), 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 38, 32, 32, _3, _1, _2, _0, _1, 32, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 19, 16, 64, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 19, 16, 64, _3, _1, _3, _1, _1, 64, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 7, 6, 128, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 7, 6, 128, _3, _1, _2, Size(0, 1), Size(1, 1), 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 4, 3, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1024, 13, 13, 1024, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 10, 10, 1024, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 10, 10, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 20, 20, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 150, 150, 96, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 17, 150, 150, 96, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 150, 150, 96, _2, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 17, 150, 150, 96, _2, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 150, 150, 96, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 17, 150, 150, 96, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _1, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _1, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _2, _1, _1, _1, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _2, _1, _2, _1, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _4, _1, _1, _2, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _4, _1, _2, _2, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _5, _1, _2, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _7, _1, _1, _3, _3, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _7, _1, _2, _3, _3, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 150, 150, 16, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 150, 150, 96, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 75, 75, 24, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 75, 75, 144, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 75, 75, 24, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 38, 38, 32, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 38, 38, 192, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 38, 38, 32, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 19, 19, 64, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 19, 19, 384, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 19, 19, 64, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 19, 19, 96, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 19, 19, 576, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 96, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 10, 10, 160, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 160, 10, 10, 960, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 960, 10, 10, 160, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 960, 10, 10, 320, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 320, 10, 10, 1280, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1280, 10, 10, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 5, 5, 512, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 5, 5, 128, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 3, 3, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 3, 3, 128, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 2, 2, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 2, 2, 128, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 2, 2, 64, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 1, 1, 128, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 12, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1280, 10, 10, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 5, 5, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 3, 3, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 2, 2, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 1, 1, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
-
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 300, 300, 32, _3, _1, _2, _0, _1, 1, a, t), c, f1, f2);
-
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 150, 150, 32, _3, _1, _1, _1, _1, 32, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 75, 75, 144, _3, _1, _1, _1, _1, 144, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 38, 38, 192, _3, _1, _1, _1, _1, 192, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 19, 19, 384, _3, _1, _1, _1, _1, 384, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 576, _3, _1, _1, _1, _1, 576, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 960, 10, 10, 960, _3, _1, _1, _1, _1, 960, a, t), c, f1, f2);
-
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 150, 150, 96, _3, _1, _2, _0, _1, 96, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 75, 75, 144, _3, _1, _2, _1, _1, 144, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 38, 38, 192, _3, _1, _2, _0, _1, 192, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 576, _3, _1, _2, _1, _1, 576, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 10, 10, 256, _3, _1, _2, _0, _1, 256, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 5, 5, 128, _3, _1, _2, _1, _1, 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 3, 3, 128, _3, _1, _2, _1, _1, 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 2, 2, 64, _3, _1, _2, _0, _1, 64, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 256, 256, 48, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 128, 128, 96, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 64, 64, 192, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 32, 32, 384, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 768, 16, 16, 768, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1536, 8, 8, 1536, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3072, 4, 4, 3072, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);//slow
-
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 256, 256, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);//slow
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 128, 128, 32, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 64, 64, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 32, 32, 128, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 8, 8, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1024, 4, 4, 1024, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
-
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 10, 256, 256, 10, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 20, 128, 128, 20, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 40, 64, 64, 40, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 80, 32, 32, 80, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 160, 16, 16, 160, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 320, 8, 8, 320, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 640, 4, 4, 640, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 300, 300, 32, _3, _1, _2, _0, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 300, 300, 16, _3, _1, _2, _0, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 112, 112, 16, _3, _1, _2, _0, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 180, 320, 10, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 10, 89, 159, 16, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 87, 157, 32, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _5, _1, _2, _2, _2, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 150, 150, 32, _3, _1, _1, _1, _1, 32, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 75, 75, 144, _3, _1, _1, _1, _1, 144, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 38, 38, 192, _3, _1, _1, _1, _1, 192, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 19, 19, 384, _3, _1, _1, _1, _1, 384, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 576, _3, _1, _1, _1, _1, 576, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 960, 10, 10, 960, _3, _1, _1, _1, _1, 960, a, t), c, f1, f2);
-
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 150, 150, 96, _3, _1, _2, _0, _1, 96, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 75, 75, 144, _3, _1, _2, _1, _1, 144, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 38, 38, 192, _3, _1, _2, _0, _1, 192, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 576, _3, _1, _2, _1, _1, 576, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 10, 10, 256, _3, _1, _2, _0, _1, 256, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 5, 5, 128, _3, _1, _2, _1, _1, 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 3, 3, 128, _3, _1, _2, _1, _1, 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 2, 2, 64, _3, _1, _2, _0, _1, 64, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 38, 32, 32, _3, _1, _2, _0, _1, 32, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 19, 16, 64, _3, _1, _3, _1, _1, 64, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 728, 14, 14, 728, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 48, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 28, 24, 128, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 12, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 6, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 300, 300, 32, _3, _1, _2, _0, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 150, 150, 32, _3, _1, _1, _1, _1, 32, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 150, 150, 16, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 150, 150, 96, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 150, 150, 96, _3, _1, _2, _0, _1, 96, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 75, 75, 24, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 112, 96, 64, _3, _1, _2, _0, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 48, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 48, 128, _3, _1, _2, _0, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 28, 24, 128, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 28, 24, 256, _3, _1, _2, _0, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 12, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 12, 512, _3, _1, _2, _0, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 6, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 1024, 1024, 24, _7, _1, _4, _3, _3, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 128, 128, 64, _5, _1, _2, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 116, 8, 8, 116, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 160, 160, 16, _3, _1, _1, _1, _1, 16, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 160, 160, 48, _3, _1, _2, _1, _0, 48, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 80, 80, 48, _3, _1, _2, _1, _0, 48, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 80, 80, 48, _3, _1, _1, _1, _1, 48, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 20, 20, 144, _3, _1, _1, _1, _1, 144, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 20, 20, 192, _3, _1, _1, _1, _1, 192, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 40, 40, 96, _3, _1, _1, _1, _1, 96, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 40, 40, 96, _3, _1, _2, _1, _0, 96, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 160, 160, 8, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 8, 160, 160, 48, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 16, 160, 8, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 8, 16, 160, 48, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 8, 80, 80, 48, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 80, 80, 8, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 8, 80, 80, 48, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 115, 63, 4, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 115, 63, 2, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 7, 7, 128, _7, _1, _1, _0, _0, 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 5, 5, 128, _5, _1, _1, _0, _0, 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 3, 3, 128, _3, _1, _1, _0, _0, 128, a, t), c, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 112, 96, 32, _3, _1, _3, Size(1, 0), Size(1, 0), 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 114, 96, 32, _3, _1, _3, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 19, 16, 64, _3, _1, _3, _1, _1, 64, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 19, 16, 64, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 7, 7, 128, _7, _1, _1, _0, _0, 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 56, 56, 32, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 112, 112, 16, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 7, 6, 128, _3, _1, _2, Size(0, 1), Size(1, 1), 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 4, 3, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 38, 32, 32, _3, _1, _2, _0, _1, 32, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 112, 96, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 112, 96, 32, _3, _1, _3, Size(1, 0), Size(1, 0), 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 38, 32, 32, _3, _1, _2, _0, _1, 32, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 19, 16, 64, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 19, 16, 64, _3, _1, _3, _1, _1, 64, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 7, 6, 128, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 7, 6, 128, _3, _1, _2, Size(0, 1), Size(1, 1), 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 4, 3, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1024, 13, 13, 1024, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 10, 10, 1024, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 10, 10, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 20, 20, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 150, 150, 96, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 17, 150, 150, 96, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 150, 150, 96, _2, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 17, 150, 150, 96, _2, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 150, 150, 96, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 17, 150, 150, 96, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _1, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _1, _1, _2, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _2, _1, _1, _1, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _2, _1, _2, _1, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _4, _1, _1, _2, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _4, _1, _2, _2, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _5, _1, _2, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _7, _1, _1, _3, _3, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _7, _1, _2, _3, _3, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 150, 150, 16, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 150, 150, 96, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 75, 75, 24, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 75, 75, 144, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 75, 75, 24, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 38, 38, 32, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 38, 38, 192, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 38, 38, 32, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 19, 19, 64, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 19, 19, 384, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 19, 19, 64, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 19, 19, 96, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 19, 19, 576, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 96, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 10, 10, 160, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 160, 10, 10, 960, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 960, 10, 10, 160, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 960, 10, 10, 320, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 320, 10, 10, 1280, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1280, 10, 10, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 5, 5, 512, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 5, 5, 128, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 3, 3, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 3, 3, 128, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 2, 2, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 2, 2, 128, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 2, 2, 64, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 1, 1, 128, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 12, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1280, 10, 10, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 5, 5, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 3, 3, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 2, 2, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 1, 1, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 300, 300, 32, _3, _1, _2, _0, _1, 1, a, t), f1, f2);
+
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 150, 150, 32, _3, _1, _1, _1, _1, 32, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 75, 75, 144, _3, _1, _1, _1, _1, 144, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 38, 38, 192, _3, _1, _1, _1, _1, 192, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 19, 19, 384, _3, _1, _1, _1, _1, 384, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 576, _3, _1, _1, _1, _1, 576, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 960, 10, 10, 960, _3, _1, _1, _1, _1, 960, a, t), f1, f2);
+
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 150, 150, 96, _3, _1, _2, _0, _1, 96, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 75, 75, 144, _3, _1, _2, _1, _1, 144, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 38, 38, 192, _3, _1, _2, _0, _1, 192, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 576, _3, _1, _2, _1, _1, 576, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 10, 10, 256, _3, _1, _2, _0, _1, 256, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 5, 5, 128, _3, _1, _2, _1, _1, 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 3, 3, 128, _3, _1, _2, _1, _1, 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 2, 2, 64, _3, _1, _2, _0, _1, 64, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 256, 256, 48, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 128, 128, 96, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 64, 64, 192, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 32, 32, 384, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 768, 16, 16, 768, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1536, 8, 8, 1536, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3072, 4, 4, 3072, _1, _1, _1, _0, _0, 1, a, t), f1, f2);//slow
+
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 256, 256, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);//slow
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 128, 128, 32, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 64, 64, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 32, 32, 128, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 8, 8, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1024, 4, 4, 1024, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 10, 256, 256, 10, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 20, 128, 128, 20, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 40, 64, 64, 40, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 80, 32, 32, 80, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 160, 16, 16, 160, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 320, 8, 8, 320, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 640, 4, 4, 640, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 300, 300, 32, _3, _1, _2, _0, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 300, 300, 16, _3, _1, _2, _0, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 112, 112, 16, _3, _1, _2, _0, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 180, 320, 10, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 10, 89, 159, 16, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 87, 157, 32, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 224, 224, 16, _5, _1, _2, _2, _2, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 150, 150, 32, _3, _1, _1, _1, _1, 32, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 75, 75, 144, _3, _1, _1, _1, _1, 144, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 38, 38, 192, _3, _1, _1, _1, _1, 192, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 19, 19, 384, _3, _1, _1, _1, _1, 384, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 576, _3, _1, _1, _1, _1, 576, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 960, 10, 10, 960, _3, _1, _1, _1, _1, 960, a, t), f1, f2);
+
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 150, 150, 96, _3, _1, _2, _0, _1, 96, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 75, 75, 144, _3, _1, _2, _1, _1, 144, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 38, 38, 192, _3, _1, _2, _0, _1, 192, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 576, _3, _1, _2, _1, _1, 576, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 10, 10, 256, _3, _1, _2, _0, _1, 256, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 5, 5, 128, _3, _1, _2, _1, _1, 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 3, 3, 128, _3, _1, _2, _1, _1, 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 2, 2, 64, _3, _1, _2, _0, _1, 64, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 38, 32, 32, _3, _1, _2, _0, _1, 32, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 19, 16, 64, _3, _1, _3, _1, _1, 64, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 728, 14, 14, 728, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 48, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 28, 24, 128, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 12, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 6, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 300, 300, 32, _3, _1, _2, _0, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 150, 150, 32, _3, _1, _1, _1, _1, 32, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 150, 150, 16, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 150, 150, 96, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 150, 150, 96, _3, _1, _2, _0, _1, 96, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 75, 75, 24, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 112, 96, 64, _3, _1, _2, _0, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 48, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 48, 128, _3, _1, _2, _0, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 28, 24, 128, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 28, 24, 256, _3, _1, _2, _0, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 12, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 12, 512, _3, _1, _2, _0, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 6, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 1024, 1024, 24, _7, _1, _4, _3, _3, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 128, 128, 64, _5, _1, _2, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 116, 8, 8, 116, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 160, 160, 16, _3, _1, _1, _1, _1, 16, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 160, 160, 48, _3, _1, _2, _1, _0, 48, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 80, 80, 48, _3, _1, _2, _1, _0, 48, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 80, 80, 48, _3, _1, _1, _1, _1, 48, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 20, 20, 144, _3, _1, _1, _1, _1, 144, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 20, 20, 192, _3, _1, _1, _1, _1, 192, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 40, 40, 96, _3, _1, _1, _1, _1, 96, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 40, 40, 96, _3, _1, _2, _1, _0, 96, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 160, 160, 8, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 8, 160, 160, 48, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 16, 160, 8, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 8, 16, 160, 48, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 8, 80, 80, 48, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 80, 80, 8, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 8, 80, 80, 48, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 115, 63, 4, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 115, 63, 2, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 7, 7, 128, _7, _1, _1, _0, _0, 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 5, 5, 128, _5, _1, _1, _0, _0, 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 3, 3, 128, _3, _1, _1, _0, _0, 128, a, t), f1, f2);
#endif
#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 14, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 14, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 12, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 7, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 6, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 3, 3, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 23, 23, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 256, 14, 14, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 256, 14, 14, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 256, 14, 12, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 512, 7, 7, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 512, 7, 6, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 512, 3, 3, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 32, 23, 23, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 128, 24, 24, 8, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 128, 7, 7, 128, _7, _1, _1, _0, _0, 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 256, 2, 2, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 128, 64, 64, 128, _3, _1, _1, _1, _1, 2, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 128, 64, 64, 128, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 128, 64, 64, 128, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 48, 160, 160, 48, _3, _1, _2, _1, _0, 48, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 8, 8, 48, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 12, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 21, 192, 192, 21, _3, _1, _1, _1, _1, 21, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 90, 192, 192, 90, _3, _1, _2, _0, _1, 90, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1152, 12, 12, 12, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1156, 12, 12, 12, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 192, 192, 96, _3, _1, _2, _1, _1, 96, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 96, 96, 144, _3, _1, _1, _1, _1, 144, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 96, 96, 24, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 96, 96, 24, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 96, 96, 96, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 96, 96, 144, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 17, 17, 128, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 17, 17, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 17, 17, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 20, 20, 512, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 24, 42, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1280, 7, 12, 36, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 4, 6, 24, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
-
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 24, 42, 16, _3, _2, _1, _2, _2, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 24, 42, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 25, 43, 16, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 24, 42, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 20, 20, 512, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 10, 10, 512, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 5, 5, 512, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 4, 6, 18, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 4, 6, 24, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 320, 7, 12, 1280, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1280, 7, 12, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 7, 12, 160, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 14, 24, 576, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 240, 135, 10, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 10, 119, 67, 16, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 117, 65, 32, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 115, 63, 4, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 115, 63, 2, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps * 10, Param(1, 1024, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps * 10, Param(1, 256, 128, 128, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps*10, Param(1, 1024, 128, 128, 1024, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
-#endif
-#if 0
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 120, 12, 12, 120, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 240, 135, 10, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 768, 10, 4, 128, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 240, 135, 27, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 34, 32, 32, 34, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 34, 32, 32, 34, _3, _1, _1, _1, _1, 34, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 34, 32, 32, 34, _5, _1, _1, _2, _2, 34, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 34, 32, 32, 34, _7, _1, _1, _3, _3, 34, a, t), c, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 14, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 14, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 12, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 7, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 6, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 3, 3, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 23, 23, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 256, 14, 14, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 256, 14, 14, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 256, 14, 12, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 512, 7, 7, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 512, 7, 6, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 512, 3, 3, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 32, 23, 23, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 128, 24, 24, 8, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 128, 7, 7, 128, _7, _1, _1, _0, _0, 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 256, 2, 2, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 128, 64, 64, 128, _3, _1, _1, _1, _1, 2, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 128, 64, 64, 128, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 128, 64, 64, 128, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 48, 160, 160, 48, _3, _1, _2, _1, _0, 48, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 8, 8, 48, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 19, 19, 12, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 21, 192, 192, 21, _3, _1, _1, _1, _1, 21, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 90, 192, 192, 90, _3, _1, _2, _0, _1, 90, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1152, 12, 12, 12, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1156, 12, 12, 12, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 192, 192, 96, _3, _1, _2, _1, _1, 96, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 96, 96, 144, _3, _1, _1, _1, _1, 144, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 96, 96, 24, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 144, 96, 96, 24, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 96, 96, 96, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 96, 96, 144, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 17, 17, 128, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 17, 17, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 17, 17, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 20, 20, 512, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 24, 42, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1280, 7, 12, 36, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 4, 6, 24, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 24, 42, 16, _3, _2, _1, _2, _2, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 24, 42, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 25, 43, 16, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 24, 42, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 20, 20, 512, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 10, 10, 512, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 5, 5, 512, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 4, 6, 18, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 4, 6, 24, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 320, 7, 12, 1280, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1280, 7, 12, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 576, 7, 12, 160, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 14, 24, 576, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 240, 135, 10, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 10, 119, 67, 16, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 117, 65, 32, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 115, 63, 4, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 115, 63, 2, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps * 10, Param(1, 1024, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps * 10, Param(1, 256, 128, 128, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps*10, Param(1, 1024, 128, 128, 1024, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+#endif
+#if 0
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 120, 12, 12, 120, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 240, 135, 10, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 768, 10, 4, 128, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 240, 135, 27, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 34, 32, 32, 34, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 34, 32, 32, 34, _3, _1, _1, _1, _1, 34, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 34, 32, 32, 34, _5, _1, _1, _2, _2, 34, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 34, 32, 32, 34, _7, _1, _1, _3, _3, 34, a, t), f1, f2);
#endif
#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 96, 96, 24, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 48, 48, 24, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 24, 24, 48, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 12, 12, 96, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 24, 96, 96, 24, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 24, 48, 48, 24, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 48, 24, 24, 48, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 96, 12, 12, 96, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 96, 96, 24, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 48, 48, 24, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 24, 24, 48, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 12, 12, 96, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 24, 96, 96, 24, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 24, 48, 48, 24, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 48, 24, 24, 48, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 96, 12, 12, 96, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
#endif
#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 13, 13, 1152, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1152, 13, 13, 128, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1152, 13, 13, 192, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 448, 6, 6, 2048, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 2048, 6, 6, 192, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 64, 128, 1, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 8, 60, 256, _2, _1, _1, _0, _0, 1, a, t), c, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 13, 13, 1152, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1152, 13, 13, 128, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1152, 13, 13, 192, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 448, 6, 6, 2048, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 2048, 6, 6, 192, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 64, 128, 1, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 8, 60, 256, _2, _1, _1, _0, _0, 1, a, t), f1, f2);
#endif
#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 96, 160, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 80, 160, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 64, 160, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 48, 160, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 32, 160, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 16, 160, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 96, 160, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 80, 160, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 64, 160, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 48, 160, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 32, 160, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 16, 160, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 89, 159, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 150, 150, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 150, 150, 128, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 150, 150, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 150, 150, 32, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps * 10, Param(1, 1024, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps * 10, Param(1, 256, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps * 10, Param(1, 512, 64, 64, 512, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 75, 75, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 75, 75, 192, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 17, 17, 128, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 17, 17, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 17, 17, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 240, 135, 27, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 37, 47, 48, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 16, 26, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 96, 96, 48, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 12, 1, 1, 192, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 1, 1, 384, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 1, 1, 48, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 6, 1, 1, 96, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 40, 23, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 40, 23, 16, _3, _2, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 40, 23, 16, _3, _3, _1, _3, _3, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 40, 23, 16, _3, _5, _1, _5, _5, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 59, 256, Size(9, 3), _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 6, 6, 2048, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 2048, 6, 6, 512, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 20, 12, 20, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 20, 12, 128, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 20, 12, 128, _3, _1, _1, _1, _1, 128, a, t), c, f1, f2);
-#endif
-#if 0
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 448, 6, 6, 2048, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 13, 13, 1152, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 96, 12, 12, 96, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 12, 12, 96, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 14, 14, 96, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 2048, 6, 6, 192, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 10, 1, 192, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 1, 1, 192, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 5, 2, 192, _1, _1, _1, _0, _0, 1, a, SimdFalse), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 1, 1, 192, _1, _1, _1, _0, _0, 1, a, SimdFalse), c, f1, f2);
-#endif
-#if 0
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 728, 14, 14, 728, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 728, 14, 14, 728, _1, _1, _1, _0, _0, 728, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 27, 27, 32, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 384, 27, 27, 32, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 270, 27, 32, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 109, 109, 128, _1, _1, _2, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 14, 14, 128, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 14, 14, 96, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1280, 14, 14, 1536, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 14, 14, 1536, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 14, 14, 512, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 35, 15, 20, 63, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 35, 32, 32, 63, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 32, 47, 47, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 600, 30, 30, 235, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 120, 65, 63, 135, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 256, 8, 8, 256, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 128, 4, 4, 128, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1023, 32, 32, 1023, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 32, 32, 512, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 32, 32, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 96, 128, 128, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 511, 15, 20, 1023, _5, _1, _2, _2, _2, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 150, 100, 128, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 37, 47, 48, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 24, 32, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 104, 104, 16, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 28, 24, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 640, 540, 16, _6, _1, _2, _2, _2, 1, a, t), c, f1, f2);
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 89, 159, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 150, 150, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 150, 150, 128, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 150, 150, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 150, 150, 32, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps * 10, Param(1, 1024, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps * 10, Param(1, 256, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps * 10, Param(1, 512, 64, 64, 512, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 75, 75, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 75, 75, 192, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 17, 17, 128, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 17, 17, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 17, 17, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 240, 135, 27, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 37, 47, 48, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 16, 26, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 96, 96, 48, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 12, 1, 1, 192, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 24, 1, 1, 384, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 1, 1, 48, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 6, 1, 1, 96, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 40, 23, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 40, 23, 16, _3, _2, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 40, 23, 16, _3, _3, _1, _3, _3, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 40, 23, 16, _3, _5, _1, _5, _5, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 59, 256, Size(9, 3), _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 6, 6, 2048, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 2048, 6, 6, 512, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 20, 12, 20, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 20, 12, 128, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 20, 12, 128, _3, _1, _1, _1, _1, 128, a, t), f1, f2);
+#endif
+#if 0
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 448, 6, 6, 2048, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 13, 13, 1152, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 16, 16, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 96, 12, 12, 96, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 12, 12, 96, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 14, 14, 96, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 2048, 6, 6, 192, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 10, 1, 192, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 1, 1, 192, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 5, 2, 192, _1, _1, _1, _0, _0, 1, a, SimdFalse), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 1, 1, 192, _1, _1, _1, _0, _0, 1, a, SimdFalse), f1, f2);
+#endif
+#if 0
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 728, 14, 14, 728, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 728, 14, 14, 728, _1, _1, _1, _0, _0, 728, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 27, 27, 32, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 384, 27, 27, 32, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 270, 27, 32, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 109, 109, 128, _1, _1, _2, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 14, 14, 128, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 14, 14, 96, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1280, 14, 14, 1536, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 14, 14, 1536, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 14, 14, 512, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 35, 15, 20, 63, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 35, 32, 32, 63, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 32, 47, 47, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 600, 30, 30, 235, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 120, 65, 63, 135, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 256, 8, 8, 256, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(10, 128, 4, 4, 128, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1023, 32, 32, 1023, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 32, 32, 512, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 32, 32, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 96, 128, 128, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 511, 15, 20, 1023, _5, _1, _2, _2, _2, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 150, 100, 128, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 37, 47, 48, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 24, 32, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 104, 104, 16, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 28, 24, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3, 640, 540, 16, _6, _1, _2, _2, _2, 1, a, t), f1, f2);
#endif
#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 7, 7, 768, _3, _1, _1, _1, _1, 384, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 14, 14, 384, _7, _1, _2, _3, _3, 192, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 28, 28, 192, _7, _1, _2, _3, _3, 96, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 56, 56, 96, _7, _1, _2, _3, _3, 48, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 7, 1024, _3, _1, _1, _1, _1, 512, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 14, 512, _7, _1, _2, _3, _3, 256, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 28, 28, 256, _7, _1, _2, _3, _3, 128, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 56, 128, _7, _1, _2, _3, _3, 64, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 49, 29, 29, 98, _7, _1, _2, _3, _3, 49, a, t), c, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 7, 7, 768, _3, _1, _1, _1, _1, 384, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 192, 14, 14, 384, _7, _1, _2, _3, _3, 192, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 28, 28, 192, _7, _1, _2, _3, _3, 96, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 56, 56, 96, _7, _1, _2, _3, _3, 48, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 7, 7, 1024, _3, _1, _1, _1, _1, 512, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 14, 14, 512, _7, _1, _2, _3, _3, 256, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 128, 28, 28, 256, _7, _1, _2, _3, _3, 128, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 56, 128, _7, _1, _2, _3, _3, 64, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 49, 29, 29, 98, _7, _1, _2, _3, _3, 49, a, t), f1, f2);
#endif
-#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 160, 14, 14, 160, _3, _1, _1, _1, _1, 10, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 160, 28, 28, 160, _3, _1, _2, _1, _1, 10, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 112, 112, 32, _3, _1, _2, _1, _1, 2, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 14, 14, 384, _3, _1, _2, _1, _1, 24, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 7, 7, 384, _3, _1, _1, _1, _1, 24, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 28, 28, 64, _3, _1, _1, _1, _1, 4, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 56, 64, _3, _1, _2, _1, _1, 4, a, t), c, f1, f2);
+#if 0
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 160, 14, 14, 160, _3, _1, _1, _1, _1, 10, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 160, 28, 28, 160, _3, _1, _2, _1, _1, 10, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 112, 112, 32, _3, _1, _2, _1, _1, 2, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 14, 14, 384, _3, _1, _2, _1, _1, 24, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 384, 7, 7, 384, _3, _1, _1, _1, _1, 24, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 28, 28, 64, _3, _1, _1, _1, _1, 4, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 56, 64, _3, _1, _2, _1, _1, 4, a, t), f1, f2);
#endif
#if 0
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 56, 128, _1, _1, _2, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 24, 24, 96, _1, _1, _2, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 56, 48, 48, 112, _1, _1, _2, _0, _0, 1, a, t), c, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 56, 56, 128, _1, _1, _2, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 24, 24, 96, _1, _1, _2, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 56, 48, 48, 112, _1, _1, _2, _0, _0, 1, a, t), f1, f2);
#endif
#if 0
result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 24, 24, 96, _1, _1, _2, _0, _0, 1, aGe, tT), fp32, f1, f2);
@@ -637,37 +636,37 @@ namespace Test
//result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 512, 32, 32, 256, _1, _1, _1, _0, _0, 1, aRe, tT), fp32, f1, f2);
#endif
#if 0
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3072, 64, 64, 768, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 768, 64, 64, 768, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _7, _1, _1, _3, _3, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 128, 128, 256, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 16, 16, 256, _5, _1, _1, _2, _2, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 768, 16, 16, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 16, 16, 256, Size(3, 4), _1, _1, _1, Size(1, 2), 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 24, 24, 64, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3072, 24, 24, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 600, 24, 24, 64, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(12, 2000, 5, 5, 255, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(24, 2048, 6, 6, 255, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
-#endif
-#if 0
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 160, 160, 16, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 160, 160, 32, _3, _1, _2, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 160, 160, 32, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 80, 80, 32, _3, _1, _1, _1, _1, 1, a, t), c, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 80, 80, 64, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 256, 1, 1, 512, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1664, 40, 40, 512, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 192, 5, 5, 256, _3, _1, _1, _0, _0, 1, a, t), c, f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3072, 64, 64, 768, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 768, 64, 64, 768, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 64, 64, 256, _7, _1, _1, _3, _3, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 128, 128, 256, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 16, 16, 256, _5, _1, _1, _2, _2, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 768, 16, 16, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 64, 16, 16, 256, Size(3, 4), _1, _1, _1, Size(1, 2), 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 24, 24, 64, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 3072, 24, 24, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 600, 24, 24, 64, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(12, 2000, 5, 5, 255, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(24, 2048, 6, 6, 255, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
+#endif
+#if 0
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 160, 160, 16, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 16, 160, 160, 32, _3, _1, _2, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 48, 160, 160, 32, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 32, 80, 80, 32, _3, _1, _1, _1, _1, 1, a, t), f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 96, 80, 80, 64, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 256, 1, 1, 512, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 1664, 40, 40, 512, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(2, 192, 5, 5, 256, _3, _1, _1, _0, _0, 1, a, t), f1, f2);
#endif
#if 1
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 48, 48, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 48, 48, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
#endif
#else
- result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 44, 44, 256, _1, _1, _1, _0, _0, 1, a, t), c, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, Param(1, 256, 44, 44, 256, _1, _1, _1, _0, _0, 1, a, t), f1, f2);
#endif
return result;
}
@@ -676,35 +675,18 @@ namespace Test
{
bool result = true;
- SimdSynetCompatibilityType fp32 = SimdSynetCompatibilityDefault;
- SimdSynetCompatibilityType bf16 = SimdSynetCompatibility16bfSoft;
+ const SimdConvolutionActivationType aId = SimdConvolutionActivationIdentity, aRe = SimdConvolutionActivationRelu,
+ aLr = SimdConvolutionActivationLeakyRelu, aRr = SimdConvolutionActivationRestrictRange, aPr = SimdConvolutionActivationPrelu,
+ aEl = SimdConvolutionActivationElu, aHs = SimdConvolutionActivationHswish, aMi = SimdConvolutionActivationMish,
+ aHi = SimdConvolutionActivationHardSigmoid, aSw = SimdConvolutionActivationSwish, aGe = SimdConvolutionActivationGelu;
+ const SimdBool tF = SimdFalse, tT = SimdTrue;
#ifdef NDEBUG
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationIdentity, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationRelu, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationRelu, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationLeakyRelu, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationRestrictRange, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationPrelu, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationElu, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationHswish, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationMish, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationHardSigmoid, SimdFalse, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationHardSigmoid, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationSwish, SimdFalse, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationPrelu, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationSwish, SimdFalse, bf16, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationIdentity, SimdTrue, bf16, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationGelu, SimdFalse, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationLeakyRelu, SimdTrue, fp32, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationGelu, SimdFalse, bf16, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationLeakyRelu, SimdTrue, bf16, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationSwish, SimdTrue, fp32, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationSwish, SimdFalse, fp32, f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationGelu, tF, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationRelu, tT, f1, f2);
#else
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationGelu, SimdFalse, bf16, f1, f2);
- //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationRelu, SimdTrue, bf16, f1, f2);
- result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationRelu, SimdTrue, fp32, f1, f2);
+ //result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationGelu, tF, f1, f2);
+ result = result && SynetConvolution32fForwardAutoTest(eps, SimdConvolutionActivationRelu, tT, f1, f2);
#endif
return result;
@@ -733,11 +715,6 @@ namespace Test
result = result && SynetConvolution32fForwardAutoTest(2 * EPS, FUNC_C(Simd::Avx512bw::SynetConvolution32fInit), FUNC_C(SimdSynetConvolution32fInit));
#endif
-#if (defined(SIMD_AMXBF16_ENABLE) || (defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_AMX_EMULATE)))
- if (Simd::AmxBf16::Enable && TestAmxBf16())
- result = result && SynetConvolution32fForwardAutoTest(2 * EPS, FUNC_C(Simd::AmxBf16::SynetConvolution32fInit), FUNC_C(SimdSynetConvolution32fInit));
-#endif
-
#ifdef SIMD_NEON_ENABLE
if (Simd::Neon::Enable && TestNeon())
result = result && SynetConvolution32fForwardAutoTest(2 * EPS, FUNC_C(Simd::Neon::SynetConvolution32fInit), FUNC_C(SimdSynetConvolution32fInit));
diff --git a/src/Test/TestSynetConvolution8i.cpp b/src/Test/TestSynetConvolution8i.cpp
index a50f56f5d1..93b95c54fa 100644
--- a/src/Test/TestSynetConvolution8i.cpp
+++ b/src/Test/TestSynetConvolution8i.cpp
@@ -74,7 +74,7 @@ namespace Test
{
p.conv.srcT = SimdTensorData32f;
p.conv.dstT = SimdTensorData32f;
- void * context = SimdSynetConvolution32fInit(p.batch, &p.conv, SimdSynetCompatibilityDefault);
+ void * context = SimdSynetConvolution32fInit(p.batch, &p.conv);
buf.Extend({ SimdSynetConvolution32fExternalBufferSize(context) });
SimdSynetConvolution32fSetParams(context, weight.Data(), NULL, bias.Data(), params.Data());
SimdSynetConvolution32fForward(context, src.Data(), buf.Data(), dst.Data());
diff --git a/src/Test/TestSynetMergedConvolution8i.cpp b/src/Test/TestSynetMergedConvolution8i.cpp
index 8542abacb4..1d562baa3c 100644
--- a/src/Test/TestSynetMergedConvolution8i.cpp
+++ b/src/Test/TestSynetMergedConvolution8i.cpp
@@ -142,7 +142,7 @@ namespace Test
SimdConvolutionParameters conv = p.conv[i];
conv.srcT = SimdTensorData32f;
conv.dstT = SimdTensorData32f;
- void* context = SimdSynetConvolution32fInit(p.batch, &conv, SimdSynetCompatibilityDefault);
+ void* context = SimdSynetConvolution32fInit(p.batch, &conv);
buf.Extend({ SimdSynetConvolution32fExternalBufferSize(context) });
dst.Reshape(Shp(p.batch, conv.dstH, conv.dstW, conv.dstC), conv.dstF);
SimdSynetConvolution32fSetParams(context, weight.Data(), NULL, bias.Data(), params.Data());