Skip to content

Commit

Permalink
Merge branch 'master' of github.com:ermig1979/Simd
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Jun 23, 2023
2 parents 5f3e8c4 + 6007715 commit 3393d41
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 6 deletions.
2 changes: 1 addition & 1 deletion docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ <h5>New features</h5>
<li>Support of 4-bit and 5-bit depth in Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function DescrIntCosineDistance.</li>
<li>Support of 4-bit and 5-bit depth in Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function DescrIntCosineDistancesMxNp.</li>
<li>Support of 4-bit and 5-bit depth in Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function DescrIntCosineDistancesMxNa.</li>
<li>Base implementation, SSE4.1, AVX2 optimizations of function SynetNormalizeLayerForwardV3.</li>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function SynetNormalizeLayerForwardV3.</li>
</ul>
<h5>Bug fixing</h5>
<ul>
Expand Down
3 changes: 3 additions & 0 deletions src/Simd/SimdAvx512bw.h
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,9 @@ namespace Simd
void SynetNormalizeLayerForwardV2(const float* src, size_t batch, size_t channels, size_t spatial,
const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst);

void SynetNormalizeLayerForwardV3(const float* src, size_t batch, size_t channels, size_t spatial,
const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst);

void SynetPoolingAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format);

Expand Down
162 changes: 162 additions & 0 deletions src/Simd/SimdAvx512bwSynetNormalize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,168 @@ namespace Simd
else
assert(0);
}

//-------------------------------------------------------------------------------------------------

// AVX-512BW kernel of SynetNormalizeLayerForwardV3 for NCHW layout.
// For every (batch, channel) plane of 'spatial' floats it computes
//     dst = (src - mean) / sqrt(variance + eps) * scale[c] + shift[c],
// where mean and variance are taken over the plane's spatial extent.
// Full 16-float vectors are processed in the unrolled loops; the ragged
// tail (spatial % F elements) is handled with the lane mask 'spatialM'.
void NormalizeNchwV3(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* dst)
{
float k = 1.0f / float(spatial); // reciprocal element count: turns sums into means
size_t spatialF = AlignLo(spatial, F), s; // spatial rounded down to a whole number of F-float vectors
__mmask16 spatialM = TailMask16(spatial - spatialF); // mask for the tail lanes (empty when spatial is aligned)
for (size_t b = 0; b < batch; ++b)
{
for (size_t c = 0; c < channels; ++c)
{
// Pass 1: vector-accumulate the plane, then reduce to a scalar sum -> mean.
__m512 _sum = _mm512_setzero_ps();
for (s = 0; s < spatialF; s += F)
_sum = _mm512_add_ps(_mm512_loadu_ps(src + s), _sum);
if(s < spatial)
_sum = _mm512_add_ps(_mm512_maskz_loadu_ps(spatialM, src + s), _sum);
float sum = ExtractSum(_sum);
__m512 mean = _mm512_set1_ps(sum * k);
// Pass 2: write the centered values (src - mean) into dst.
for (s = 0; s < spatialF; s += F)
_mm512_storeu_ps(dst + s, _mm512_sub_ps(_mm512_loadu_ps(src + s), mean));
if (s < spatial)
_mm512_mask_storeu_ps(dst + s, spatialM, _mm512_sub_ps(_mm512_maskz_loadu_ps(spatialM, src + s), mean));

// Pass 3: sum of squares of the centered values; sqsum * k is the variance.
__m512 _sqsum = _mm512_setzero_ps();
for (s = 0; s < spatialF; s += F)
{
__m512 _dst = _mm512_loadu_ps(dst + s);
_sqsum = _mm512_fmadd_ps(_dst, _dst, _sqsum);
}
if (s < spatial)
{
__m512 _dst = _mm512_maskz_loadu_ps(spatialM, dst + s);
_sqsum = _mm512_fmadd_ps(_dst, _dst, _sqsum);
}
float sqsum = ExtractSum(_sqsum);
// norm = 1 / sqrt(variance + eps), broadcast together with this channel's scale and shift.
__m512 norm = _mm512_set1_ps(1.0f / ::sqrt(sqsum * k + eps));
__m512 _scale = _mm512_set1_ps(scale[c]);
__m512 _shift = _mm512_set1_ps(shift[c]);
// Pass 4: dst = dst * norm * scale[c] + shift[c] (in place over the centered values).
for (s = 0; s < spatialF; s += F)
_mm512_storeu_ps(dst + s, _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_loadu_ps(dst + s), norm), _scale), _shift));
if (s < spatial)
_mm512_mask_storeu_ps(dst + s, spatialM, _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_maskz_loadu_ps(spatialM, dst + s), norm), _scale), _shift));

// Advance both pointers to the next (batch, channel) plane.
dst += spatial;
src += spatial;
}
}
}

// AVX-512BW kernel of SynetNormalizeLayerForwardV3 for NHWC layout.
// Same math as the NCHW variant — per-channel normalization over the
// spatial extent:
//     dst = (src - mean[c]) / sqrt(variance[c] + eps) * scale[c] + shift[c]
// — but here channels are interleaved, so 'buf' holds one running
// accumulator per channel (mean, then variance, then the final norm
// factor), updated spatial row by spatial row. Full 16-float vectors use
// the unrolled loops; the ragged tail (channels % F) uses mask 'channelsM'.
// 'buf' must hold at least 'channels' floats; it may be NULL, in which
// case a temporary buffer is allocated.
void NormalizeNhwcV3(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst)
{
float k = 1.0f / float(spatial); // reciprocal of spatial extent: turns sums into means
Array32f _buf;
if (buf == NULL)
{
// Fix: the buffer is indexed by channel (buf + c, c < channels) in every
// loop below, so it needs 'channels' elements. The previous
// _buf.Resize(spatial) under-allocated whenever channels > spatial,
// causing out-of-bounds writes.
_buf.Resize(channels);
buf = _buf.data;
}
size_t channelsF = AlignLo(channels, F); // channels rounded down to a whole number of F-float vectors
__mmask16 channelsM = TailMask16(channels - channelsF); // mask for the tail lanes (empty when channels is aligned)
__m512 _eps = _mm512_set1_ps(eps), _k = _mm512_set1_ps(k), _1 = _mm512_set1_ps(1.0f);
for (size_t b = 0, c; b < batch; ++b)
{
// Pass 1: zero per-channel accumulators, sum src over spatial, scale by k -> buf[c] = mean[c].
for (c = 0; c < channelsF; c += F)
_mm512_storeu_ps(buf + c, _mm512_setzero_ps());
if(c < channels)
_mm512_mask_storeu_ps(buf + c, channelsM, _mm512_setzero_ps());
for (size_t s = 0; s < spatial; ++s)
{
const float* ps = src + s * channels;
for (c = 0; c < channelsF; c += F)
{
__m512 _src = _mm512_loadu_ps(ps + c);
__m512 _sum = _mm512_loadu_ps(buf + c);
_mm512_storeu_ps(buf + c, _mm512_add_ps(_sum, _src));
}
if (c < channels)
{
__m512 _src = _mm512_maskz_loadu_ps(channelsM, ps + c);
__m512 _sum = _mm512_maskz_loadu_ps(channelsM, buf + c);
_mm512_mask_storeu_ps(buf + c, channelsM, _mm512_add_ps(_sum, _src));
}
}
for (c = 0; c < channelsF; c += F)
_mm512_storeu_ps(buf + c, _mm512_mul_ps(_mm512_loadu_ps(buf + c), _k));
if (c < channels)
_mm512_mask_storeu_ps(buf + c, channelsM, _mm512_mul_ps(_mm512_maskz_loadu_ps(channelsM, buf + c), _k));
// Pass 2: write the centered values dst = src - mean[c].
for (size_t s = 0; s < spatial; ++s)
{
const float* ps = src + s * channels;
float* pd = dst + s * channels;
for (c = 0; c < channelsF; c += F)
{
__m512 _src = _mm512_loadu_ps(ps + c);
__m512 mean = _mm512_loadu_ps(buf + c);
_mm512_storeu_ps(pd + c, _mm512_sub_ps(_src, mean));
}
if (c < channels)
{
__m512 _src = _mm512_maskz_loadu_ps(channelsM, ps + c);
__m512 mean = _mm512_maskz_loadu_ps(channelsM, buf + c);
_mm512_mask_storeu_ps(pd + c, channelsM, _mm512_sub_ps(_src, mean));
}
}

// Pass 3: reuse buf for per-channel sums of squares of the centered values.
for (c = 0; c < channelsF; c += F)
_mm512_storeu_ps(buf + c, _mm512_setzero_ps());
if (c < channels)
_mm512_mask_storeu_ps(buf + c, channelsM, _mm512_setzero_ps());
for (size_t s = 0; s < spatial; ++s)
{
const float* pd = dst + s * channels;
for (c = 0; c < channelsF; c += F)
{
__m512 _dst = _mm512_loadu_ps(pd + c);
__m512 _sum = _mm512_loadu_ps(buf + c);
_mm512_storeu_ps(buf + c, _mm512_fmadd_ps(_dst, _dst, _sum));
}
if (c < channels)
{
__m512 _dst = _mm512_maskz_loadu_ps(channelsM, pd + c);
__m512 _sum = _mm512_maskz_loadu_ps(channelsM, buf + c);
_mm512_mask_storeu_ps(buf + c, channelsM, _mm512_fmadd_ps(_dst, _dst, _sum));
}
}
// Convert to the normalization factor: buf[c] = 1 / sqrt(variance[c] + eps).
for (c = 0; c < channelsF; c += F)
_mm512_storeu_ps(buf + c, _mm512_div_ps(_1, _mm512_sqrt_ps(_mm512_add_ps(_mm512_mul_ps(_mm512_loadu_ps(buf + c), _k), _eps))));
if (c < channels)
_mm512_mask_storeu_ps(buf + c, channelsM, _mm512_div_ps(_1, _mm512_sqrt_ps(_mm512_add_ps(_mm512_mul_ps(_mm512_maskz_loadu_ps(channelsM, buf + c), _k), _eps))));
// Pass 4: dst = dst * buf[c] * scale[c] + shift[c] (in place over the centered values).
for (size_t s = 0; s < spatial; ++s)
{
float* pd = dst + s * channels;
for (c = 0; c < channelsF; c += F)
{
__m512 _dst = _mm512_loadu_ps(pd + c);
__m512 norm = _mm512_loadu_ps(buf + c);
_mm512_storeu_ps(pd + c, _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_dst, norm), _mm512_loadu_ps(scale + c)), _mm512_loadu_ps(shift + c)));
}
if (c < channels)
{
__m512 _dst = _mm512_maskz_loadu_ps(channelsM, pd + c);
__m512 norm = _mm512_maskz_loadu_ps(channelsM, buf + c);
_mm512_mask_storeu_ps(pd + c, channelsM, _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_dst, norm), _mm512_maskz_loadu_ps(channelsM, scale + c)), _mm512_maskz_loadu_ps(channelsM, shift + c)));
}
}

// Advance both pointers to the next batch item.
src += channels * spatial;
dst += channels * spatial;
}
}

// Dispatches SynetNormalizeLayerForwardV3 to the layout-specific kernel.
// 'eps' is passed by pointer by the public API and dereferenced here;
// 'buf' is an optional per-channel work buffer used only by the NHWC path.
// Unsupported tensor formats are a programming error and assert.
void SynetNormalizeLayerForwardV3(const float* src, size_t batch, size_t channels, size_t spatial,
const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst)
{
switch (format)
{
case SimdTensorFormatNchw:
NormalizeNchwV3(src, batch, channels, spatial, scale, shift, *eps, dst);
break;
case SimdTensorFormatNhwc:
NormalizeNhwcV3(src, batch, channels, spatial, scale, shift, *eps, buf, dst);
break;
default:
assert(0);
}
}
}
#endif
}
2 changes: 1 addition & 1 deletion src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6590,7 +6590,7 @@ SIMD_API void SimdSynetNormalizeLayerForwardV3(const float* src, size_t batch, s
#if defined(SIMD_SYNET_ENABLE)
typedef void(*SimdSynetNormalizeLayerForwardV3Ptr) (const float* src, size_t batch, size_t channels, size_t spatial,
const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst);
const static SimdSynetNormalizeLayerForwardV3Ptr simdSynetNormalizeLayerForwardV3 = SIMD_FUNC2(SynetNormalizeLayerForwardV3, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC);// , SIMD_AVX512BW_FUNC, SIMD_NEON_FUNC);
const static SimdSynetNormalizeLayerForwardV3Ptr simdSynetNormalizeLayerForwardV3 = SIMD_FUNC3(SynetNormalizeLayerForwardV3, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC);// , SIMD_NEON_FUNC);

simdSynetNormalizeLayerForwardV3(src, batch, channels, spatial, scale, shift, eps, format, buf, dst);
#else
Expand Down
8 changes: 4 additions & 4 deletions src/Test/TestSynetNormalize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,10 +255,10 @@ namespace Test
result = result && SynetNormalizeLayerForwardV2AutoTest(FUNC_SNLF2(Simd::Avx2::SynetNormalizeLayerForwardV3), FUNC_SNLF2(SimdSynetNormalizeLayerForwardV3));
#endif

//#ifdef SIMD_AVX512BW_ENABLE
// if (Simd::Avx512bw::Enable)
// result = result && SynetNormalizeLayerForwardV2AutoTest(FUNC_SNLF2(Simd::Avx512bw::SynetNormalizeLayerForwardV3), FUNC_SNLF2(SimdSynetNormalizeLayerForwardV3));
//#endif
#ifdef SIMD_AVX512BW_ENABLE
if (Simd::Avx512bw::Enable)
result = result && SynetNormalizeLayerForwardV2AutoTest(FUNC_SNLF2(Simd::Avx512bw::SynetNormalizeLayerForwardV3), FUNC_SNLF2(SimdSynetNormalizeLayerForwardV3));
#endif

return result;
}
Expand Down

0 comments on commit 3393d41

Please sign in to comment.