Skip to content

Commit

Permalink
fix bug: Error in SSE4.1, AVX, AVX2, AVX-512BW optimizations of class SynetMergedConvolution32fCd.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Jul 24, 2023
1 parent 0235068 commit 15b7297
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 103 deletions.
1 change: 1 addition & 0 deletions docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ <h5>Bug fixing</h5>
<li>Error in AVX-512VNNI optimizations of class SynetMergedConvolution8iCd.</li>
<li>Error in AVX-512VNNI optimizations of class SynetMergedConvolution8iDc.</li>
<li>Error (assert) in Base implementation of class ResizerNearest.</li>
<li>Error in SSE4.1, AVX, AVX2, AVX-512BW optimizations of class SynetMergedConvolution32fCd.</li>
</ul>

<h4>Test framework</h4>
Expand Down
51 changes: 25 additions & 26 deletions src/Simd/SimdAvx1SynetMergedConvolution32fCd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ namespace Simd
for (size_t dy = yBeg; dy < yEnd; ++dy)
{
float* pd = dst + dy * dstS;
for (size_t dx = 0; dx < p.dstW; ++dx, pd += srcC)
for (size_t dx = 0; dx < p.dstW; ++dx, pd += p.dstC)
{
__m256 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand Down Expand Up @@ -106,7 +106,7 @@ namespace Simd
if (dy >= noseY && dy < bodyY)
{
size_t dx = 0;
for (; dx < noseX; ++dx, pd += srcC)
for (; dx < noseX; ++dx, pd += p.dstC)
{
__m256 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand All @@ -125,7 +125,7 @@ namespace Simd
}
_mm256_storeu_ps(pd, Activate<type>(sum, _params, 0));
}
for (; dx < bodyX8; dx += 8, pd += 8 * srcC)
for (; dx < bodyX8; dx += 8, pd += 8 * p.dstC)
{
__m256 sum0 = _bias;
__m256 sum1 = _bias;
Expand Down Expand Up @@ -153,16 +153,16 @@ namespace Simd
sum7 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 7 * strideXF), w0), sum7);
}
}
_mm256_storeu_ps(pd + 0 * srcC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * srcC, Activate<type>(sum1, _params, 0));
_mm256_storeu_ps(pd + 2 * srcC, Activate<type>(sum2, _params, 0));
_mm256_storeu_ps(pd + 3 * srcC, Activate<type>(sum3, _params, 0));
_mm256_storeu_ps(pd + 4 * srcC, Activate<type>(sum4, _params, 0));
_mm256_storeu_ps(pd + 5 * srcC, Activate<type>(sum5, _params, 0));
_mm256_storeu_ps(pd + 6 * srcC, Activate<type>(sum6, _params, 0));
_mm256_storeu_ps(pd + 7 * srcC, Activate<type>(sum7, _params, 0));
_mm256_storeu_ps(pd + 0 * p.dstC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * p.dstC, Activate<type>(sum1, _params, 0));
_mm256_storeu_ps(pd + 2 * p.dstC, Activate<type>(sum2, _params, 0));
_mm256_storeu_ps(pd + 3 * p.dstC, Activate<type>(sum3, _params, 0));
_mm256_storeu_ps(pd + 4 * p.dstC, Activate<type>(sum4, _params, 0));
_mm256_storeu_ps(pd + 5 * p.dstC, Activate<type>(sum5, _params, 0));
_mm256_storeu_ps(pd + 6 * p.dstC, Activate<type>(sum6, _params, 0));
_mm256_storeu_ps(pd + 7 * p.dstC, Activate<type>(sum7, _params, 0));
}
for (; dx < bodyX4; dx += 4, pd += 4 * srcC)
for (; dx < bodyX4; dx += 4, pd += 4 * p.dstC)
{
__m256 sum0 = _bias;
__m256 sum1 = _bias;
Expand All @@ -182,12 +182,12 @@ namespace Simd
sum3 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 3 * strideXF), w0), sum3);
}
}
_mm256_storeu_ps(pd + 0 * srcC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * srcC, Activate<type>(sum1, _params, 0));
_mm256_storeu_ps(pd + 2 * srcC, Activate<type>(sum2, _params, 0));
_mm256_storeu_ps(pd + 3 * srcC, Activate<type>(sum3, _params, 0));
_mm256_storeu_ps(pd + 0 * p.dstC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * p.dstC, Activate<type>(sum1, _params, 0));
_mm256_storeu_ps(pd + 2 * p.dstC, Activate<type>(sum2, _params, 0));
_mm256_storeu_ps(pd + 3 * p.dstC, Activate<type>(sum3, _params, 0));
}
for (; dx < bodyX2; dx += 2, pd += 2 * srcC)
for (; dx < bodyX2; dx += 2, pd += 2 * p.dstC)
{
__m256 sum0 = _bias;
__m256 sum1 = _bias;
Expand All @@ -203,10 +203,10 @@ namespace Simd
sum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(ps + 1 * strideXF), w0), sum1);
}
}
_mm256_storeu_ps(pd + 0 * srcC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * srcC, Activate<type>(sum1, _params, 0));
_mm256_storeu_ps(pd + 0 * p.dstC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * p.dstC, Activate<type>(sum1, _params, 0));
}
for (; dx < bodyX; ++dx, pd += srcC)
for (; dx < bodyX; ++dx, pd += p.dstC)
{
__m256 sum = _bias;
const float* pw = weight;
Expand All @@ -222,7 +222,7 @@ namespace Simd
}
_mm256_storeu_ps(pd, Activate<type>(sum, _params, 0));
}
for (; dx < p.dstW; ++dx, pd += srcC)
for (; dx < p.dstW; ++dx, pd += p.dstC)
{
__m256 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand All @@ -244,7 +244,7 @@ namespace Simd
}
else
{
for (size_t dx = 0; dx < p.dstW; ++dx, pd += srcC)
for (size_t dx = 0; dx < p.dstW; ++dx, pd += p.dstC)
{
__m256 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand All @@ -263,18 +263,17 @@ namespace Simd
}
}
}
}
}
_mm256_storeu_ps(pd, Activate<type>(sum, _params, 0));
}
}
}
}
}

src += srcS;
dst += F;
weight += weightS;
}
}
}

template<SimdConvolutionActivationType type> SIMD_INLINE void ConvolutionDepthwise3x3Edge2x2(
const float* src0, const float* src1, const __m256* weight, const __m256& bias, const __m256* params, float* dst)
Expand Down
51 changes: 25 additions & 26 deletions src/Simd/SimdAvx2SynetMergedConvolution32fCd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ namespace Simd
for (size_t dy = yBeg; dy < yEnd; ++dy)
{
float* pd = dst + dy * dstS;
for (size_t dx = 0; dx < p.dstW; ++dx, pd += srcC)
for (size_t dx = 0; dx < p.dstW; ++dx, pd += p.dstC)
{
__m256 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand Down Expand Up @@ -106,7 +106,7 @@ namespace Simd
if (dy >= noseY && dy < bodyY)
{
size_t dx = 0;
for (; dx < noseX; ++dx, pd += srcC)
for (; dx < noseX; ++dx, pd += p.dstC)
{
__m256 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand All @@ -125,7 +125,7 @@ namespace Simd
}
_mm256_storeu_ps(pd, Activate<type>(sum, _params, 0));
}
for (; dx < bodyX8; dx += 8, pd += 8 * srcC)
for (; dx < bodyX8; dx += 8, pd += 8 * p.dstC)
{
__m256 sum0 = _bias;
__m256 sum1 = _bias;
Expand Down Expand Up @@ -153,16 +153,16 @@ namespace Simd
sum7 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 7 * strideXF), w0, sum7);
}
}
_mm256_storeu_ps(pd + 0 * srcC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * srcC, Activate<type>(sum1, _params, 0));
_mm256_storeu_ps(pd + 2 * srcC, Activate<type>(sum2, _params, 0));
_mm256_storeu_ps(pd + 3 * srcC, Activate<type>(sum3, _params, 0));
_mm256_storeu_ps(pd + 4 * srcC, Activate<type>(sum4, _params, 0));
_mm256_storeu_ps(pd + 5 * srcC, Activate<type>(sum5, _params, 0));
_mm256_storeu_ps(pd + 6 * srcC, Activate<type>(sum6, _params, 0));
_mm256_storeu_ps(pd + 7 * srcC, Activate<type>(sum7, _params, 0));
_mm256_storeu_ps(pd + 0 * p.dstC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * p.dstC, Activate<type>(sum1, _params, 0));
_mm256_storeu_ps(pd + 2 * p.dstC, Activate<type>(sum2, _params, 0));
_mm256_storeu_ps(pd + 3 * p.dstC, Activate<type>(sum3, _params, 0));
_mm256_storeu_ps(pd + 4 * p.dstC, Activate<type>(sum4, _params, 0));
_mm256_storeu_ps(pd + 5 * p.dstC, Activate<type>(sum5, _params, 0));
_mm256_storeu_ps(pd + 6 * p.dstC, Activate<type>(sum6, _params, 0));
_mm256_storeu_ps(pd + 7 * p.dstC, Activate<type>(sum7, _params, 0));
}
for (; dx < bodyX4; dx += 4, pd += 4 * srcC)
for (; dx < bodyX4; dx += 4, pd += 4 * p.dstC)
{
__m256 sum0 = _bias;
__m256 sum1 = _bias;
Expand All @@ -182,12 +182,12 @@ namespace Simd
sum3 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 3 * strideXF), w0, sum3);
}
}
_mm256_storeu_ps(pd + 0 * srcC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * srcC, Activate<type>(sum1, _params, 0));
_mm256_storeu_ps(pd + 2 * srcC, Activate<type>(sum2, _params, 0));
_mm256_storeu_ps(pd + 3 * srcC, Activate<type>(sum3, _params, 0));
_mm256_storeu_ps(pd + 0 * p.dstC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * p.dstC, Activate<type>(sum1, _params, 0));
_mm256_storeu_ps(pd + 2 * p.dstC, Activate<type>(sum2, _params, 0));
_mm256_storeu_ps(pd + 3 * p.dstC, Activate<type>(sum3, _params, 0));
}
for (; dx < bodyX2; dx += 2, pd += 2 * srcC)
for (; dx < bodyX2; dx += 2, pd += 2 * p.dstC)
{
__m256 sum0 = _bias;
__m256 sum1 = _bias;
Expand All @@ -203,10 +203,10 @@ namespace Simd
sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(ps + 1 * strideXF), w0, sum1);
}
}
_mm256_storeu_ps(pd + 0 * srcC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * srcC, Activate<type>(sum1, _params, 0));
_mm256_storeu_ps(pd + 0 * p.dstC, Activate<type>(sum0, _params, 0));
_mm256_storeu_ps(pd + 1 * p.dstC, Activate<type>(sum1, _params, 0));
}
for (; dx < bodyX; ++dx, pd += srcC)
for (; dx < bodyX; ++dx, pd += p.dstC)
{
__m256 sum = _bias;
const float* pw = weight;
Expand All @@ -222,7 +222,7 @@ namespace Simd
}
_mm256_storeu_ps(pd, Activate<type>(sum, _params, 0));
}
for (; dx < p.dstW; ++dx, pd += srcC)
for (; dx < p.dstW; ++dx, pd += p.dstC)
{
__m256 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand All @@ -244,7 +244,7 @@ namespace Simd
}
else
{
for (size_t dx = 0; dx < p.dstW; ++dx, pd += srcC)
for (size_t dx = 0; dx < p.dstW; ++dx, pd += p.dstC)
{
__m256 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand All @@ -263,18 +263,17 @@ namespace Simd
}
}
}
}
}
_mm256_storeu_ps(pd, Activate<type>(sum, _params, 0));
}
}
}
}
}

src += srcS;
dst += F;
weight += weightS;
}
}
}

template<SimdConvolutionActivationType type> SIMD_INLINE void ConvolutionDepthwise3x3Edge2x2(
const float* src0, const float* src1, const __m256* weight, const __m256& bias, const __m256* params, float* dst)
Expand Down
43 changes: 21 additions & 22 deletions src/Simd/SimdAvx512bwSynetMergedConvolution32fCd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ namespace Simd
if (dy >= noseY && dy < bodyY)
{
size_t dx = 0;
for (; dx < noseX; ++dx, pd += srcC)
for (; dx < noseX; ++dx, pd += p.dstC)
{
__m512 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand All @@ -85,7 +85,7 @@ namespace Simd
}
_mm512_mask_storeu_ps(pd, tail, Activate<type>(sum, _params, 0));
}
for (; dx < bodyX8; dx += 8, pd += 8 * srcC)
for (; dx < bodyX8; dx += 8, pd += 8 * p.dstC)
{
__m512 sum0 = _bias;
__m512 sum1 = _bias;
Expand Down Expand Up @@ -113,16 +113,16 @@ namespace Simd
sum7 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 7 * strideXF), w0, sum7);
}
}
_mm512_mask_storeu_ps(pd + 0 * srcC, tail, Activate<type>(sum0, _params, 0));
_mm512_mask_storeu_ps(pd + 1 * srcC, tail, Activate<type>(sum1, _params, 0));
_mm512_mask_storeu_ps(pd + 2 * srcC, tail, Activate<type>(sum2, _params, 0));
_mm512_mask_storeu_ps(pd + 3 * srcC, tail, Activate<type>(sum3, _params, 0));
_mm512_mask_storeu_ps(pd + 4 * srcC, tail, Activate<type>(sum4, _params, 0));
_mm512_mask_storeu_ps(pd + 5 * srcC, tail, Activate<type>(sum5, _params, 0));
_mm512_mask_storeu_ps(pd + 6 * srcC, tail, Activate<type>(sum6, _params, 0));
_mm512_mask_storeu_ps(pd + 7 * srcC, tail, Activate<type>(sum7, _params, 0));
_mm512_mask_storeu_ps(pd + 0 * p.dstC, tail, Activate<type>(sum0, _params, 0));
_mm512_mask_storeu_ps(pd + 1 * p.dstC, tail, Activate<type>(sum1, _params, 0));
_mm512_mask_storeu_ps(pd + 2 * p.dstC, tail, Activate<type>(sum2, _params, 0));
_mm512_mask_storeu_ps(pd + 3 * p.dstC, tail, Activate<type>(sum3, _params, 0));
_mm512_mask_storeu_ps(pd + 4 * p.dstC, tail, Activate<type>(sum4, _params, 0));
_mm512_mask_storeu_ps(pd + 5 * p.dstC, tail, Activate<type>(sum5, _params, 0));
_mm512_mask_storeu_ps(pd + 6 * p.dstC, tail, Activate<type>(sum6, _params, 0));
_mm512_mask_storeu_ps(pd + 7 * p.dstC, tail, Activate<type>(sum7, _params, 0));
}
for (; dx < bodyX4; dx += 4, pd += 4 * srcC)
for (; dx < bodyX4; dx += 4, pd += 4 * p.dstC)
{
__m512 sum0 = _bias;
__m512 sum1 = _bias;
Expand All @@ -142,12 +142,12 @@ namespace Simd
sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 3 * strideXF), w0, sum3);
}
}
_mm512_mask_storeu_ps(pd + 0 * srcC, tail, Activate<type>(sum0, _params, 0));
_mm512_mask_storeu_ps(pd + 1 * srcC, tail, Activate<type>(sum1, _params, 0));
_mm512_mask_storeu_ps(pd + 2 * srcC, tail, Activate<type>(sum2, _params, 0));
_mm512_mask_storeu_ps(pd + 3 * srcC, tail, Activate<type>(sum3, _params, 0));
_mm512_mask_storeu_ps(pd + 0 * p.dstC, tail, Activate<type>(sum0, _params, 0));
_mm512_mask_storeu_ps(pd + 1 * p.dstC, tail, Activate<type>(sum1, _params, 0));
_mm512_mask_storeu_ps(pd + 2 * p.dstC, tail, Activate<type>(sum2, _params, 0));
_mm512_mask_storeu_ps(pd + 3 * p.dstC, tail, Activate<type>(sum3, _params, 0));
}
for (; dx < bodyX2; dx += 2, pd += 2 * srcC)
for (; dx < bodyX2; dx += 2, pd += 2 * p.dstC)
{
__m512 sum0 = _bias;
__m512 sum1 = _bias;
Expand All @@ -163,10 +163,10 @@ namespace Simd
sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * strideXF), w0, sum1);
}
}
_mm512_mask_storeu_ps(pd + 0 * srcC, tail, Activate<type>(sum0, _params, 0));
_mm512_mask_storeu_ps(pd + 1 * srcC, tail, Activate<type>(sum1, _params, 0));
_mm512_mask_storeu_ps(pd + 0 * p.dstC, tail, Activate<type>(sum0, _params, 0));
_mm512_mask_storeu_ps(pd + 1 * p.dstC, tail, Activate<type>(sum1, _params, 0));
}
for (; dx < bodyX; ++dx, pd += srcC)
for (; dx < bodyX; ++dx, pd += p.dstC)
{
__m512 sum = _bias;
const float* pw = weight;
Expand All @@ -182,7 +182,7 @@ namespace Simd
}
_mm512_mask_storeu_ps(pd, tail, Activate<type>(sum, _params, 0));
}
for (; dx < p.dstW; ++dx, pd += srcC)
for (; dx < p.dstW; ++dx, pd += p.dstC)
{
__m512 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand All @@ -204,7 +204,7 @@ namespace Simd
}
else
{
for (size_t dx = 0; dx < p.dstW; ++dx, pd += srcC)
for (size_t dx = 0; dx < p.dstW; ++dx, pd += p.dstC)
{
__m512 sum = _bias;
for (size_t ky = 0; ky < p.kernelY; ++ky)
Expand All @@ -228,7 +228,6 @@ namespace Simd
}
}
}

src += srcS;
dst += F;
weight += weightS;
Expand Down
Loading

0 comments on commit 15b7297

Please sign in to comment.