+add Sse41::Convolution16bNchwGemm_2.
ermig1979 committed Jul 23, 2024
1 parent 253654a commit 6fe04b5
Showing 7 changed files with 493 additions and 1 deletion.
1 change: 1 addition & 0 deletions prj/vs2019/Sse41.vcxproj
@@ -93,6 +93,7 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetAdd16b.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConversion.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16b.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNchwGemm.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcDirect.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcGemm.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution32f.cpp" />
3 changes: 3 additions & 0 deletions prj/vs2019/Sse41.vcxproj.filters
@@ -415,6 +415,9 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetAdd16b.cpp">
<Filter>Sse41</Filter>
</ClCompile>
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNchwGemm.cpp">
<Filter>Sse41</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Sse41">
1 change: 1 addition & 0 deletions prj/vs2022/Sse41.vcxproj
@@ -93,6 +93,7 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetAdd16b.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConversion.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16b.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNchwGemm.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcDirect.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcGemm.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution32f.cpp" />
3 changes: 3 additions & 0 deletions prj/vs2022/Sse41.vcxproj.filters
@@ -415,6 +415,9 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetAdd16b.cpp">
<Filter>Sse41</Filter>
</ClCompile>
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNchwGemm.cpp">
<Filter>Sse41</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Sse41">
408 changes: 408 additions & 0 deletions src/Simd/SimdSse41SynetConvolution16bNchwGemm.cpp

Large diffs are not rendered by default.
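The new 408-line kernel is not reproduced in this view. As orientation only, the scalar sketch below illustrates the standard NCHW GEMM formulation such a kernel implements: the output is treated as a dstC x (dstH*dstW) matrix computed as weight (dstC x K) times a repacked source buffer (K x dstH*dstW), with K = srcC * kernelY * kernelX, which matches the renaming of the reduction argument from srcC to K in SimdSynetConvolution16b.h below. The function name and the float types are illustrative assumptions, not code from this commit; the real kernel consumes bfloat16 (uint16_t) inputs and accumulates in float with SSE4.1 intrinsics.

#include <cstddef>

// Hypothetical scalar reference of the NCHW GEMM step (illustration only):
// dst[dstC x N] = weight[dstC x K] * src[K x N], where N = dstH * dstW.
static void NchwGemmReference(const float* weight, const float* src,
    size_t dstC, size_t N, size_t K, float* dst)
{
    for (size_t d = 0; d < dstC; ++d)
    {
        for (size_t j = 0; j < N; ++j)
        {
            float sum = 0.0f;
            for (size_t k = 0; k < K; ++k)
                sum += weight[d * K + k] * src[k * N + j];
            dst[d * N + j] = sum; // bias add and activation follow in the postprocess step
        }
    }
}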

10 changes: 9 additions & 1 deletion src/Simd/SimdSynetConvolution16b.h
@@ -224,7 +224,7 @@ namespace Simd
typedef void(*ConvertPtr)(const uint8_t* src, const ConvParam& p, const AlgParam& a, size_t yBeg, size_t yEnd, size_t cBeg, size_t cEnd, uint16_t* dst);

typedef void(*ConvolutionPtr)(const uint16_t* weight, const ConvParam& p, const AlgParam& a, size_t dstC, size_t dstH,
size_t srcC, int zero, const uint16_t* src, const float* bias, const float* params, float* sum, uint8_t* dst);
size_t K, int zero, const uint16_t* src, const float* bias, const float* params, float* sum, uint8_t* dst);

protected:
void SetAlgParam(size_t F, size_t microD, size_t microN, size_t microK, size_t L1, size_t L2, size_t L3);
@@ -260,6 +260,14 @@
virtual String Ext() const { return "Sse41"; }
};

class SynetConvolution16bNchwGemm : public Base::SynetConvolution16bNchwGemm
{
public:
SynetConvolution16bNchwGemm(const ConvParam& p);

virtual String Ext() const { return "Sse41"; }
};

//-------------------------------------------------------------------------------------------------

void* SynetConvolution16bInit(size_t batch, const SimdConvolutionParameters* conv, SimdSynetCompatibilityType compatibility);
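For context, a minimal dispatch sketch, assuming the library's usual factory pattern; the Preferable() predicate and the Valid() check are assumptions here and are not shown in this commit:

// Assumed wiring of the new class inside the Sse41 SynetConvolution16bInit
// (hypothetical sketch, not the commit's code):
void* SynetConvolution16bInit(size_t batch, const SimdConvolutionParameters* conv, SimdSynetCompatibilityType compatibility)
{
    ConvParam param(batch, conv, compatibility);
    if (!param.Valid(SimdTensorData32f, SimdTensorData16b)) // assumed validity check
        return NULL;
    else if (SynetConvolution16bNchwGemm::Preferable(param)) // assumed selection predicate
        return new SynetConvolution16bNchwGemm(param);
    else
        return new SynetConvolution16bNhwcGemm(param);
}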
68 changes: 68 additions & 0 deletions src/Simd/SimdSynetConvolution16bCommon.h
@@ -50,6 +50,9 @@ namespace Simd

template<SimdConvolutionActivationType type> static SIMD_INLINE void Postprocess(const float* src, const float* bias, const float* params, size_t offset, uint8_t* dst);
template<SimdConvolutionActivationType type> static SIMD_INLINE void Postprocess(const float* src, const float* bias, const float* params, size_t offset, uint8_t* dst, size_t tail);

template<SimdConvolutionActivationType type, int index> static SIMD_INLINE void Save(uint8_t* ptr, float* buf, __m128 value, const float* bias, const float* params, size_t offset);
template<SimdConvolutionActivationType type, int index> static SIMD_INLINE void Save(uint8_t* ptr, float* buf, __m128 value, const float* bias, const float* params, size_t offset, size_t tail);
};

template <> struct Term16b<Term16bLast16b>
@@ -98,6 +101,21 @@
for (size_t i = 0; i < tail; ++i)
((uint16_t*)dst)[offset + i] = tmp[i];
}

template<SimdConvolutionActivationType type, int index> static SIMD_INLINE void Save(uint8_t* ptr, float* buf, __m128 value, const float* bias, const float* params, size_t offset)
{
__m128 f32 = Activate<type>(_mm_add_ps(value, _mm_set1_ps(bias[offset])), params, offset);
_mm_storel_epi64((__m128i*)(ptr + index * DF), _mm_packus_epi32(Float32ToBFloat16(f32), K_ZERO));
}

template<SimdConvolutionActivationType type, int index> static SIMD_INLINE void Save(uint8_t* ptr, float* buf, __m128 value, const float* bias, const float* params, size_t offset, size_t tail)
{
__m128 f32 = Activate<type>(_mm_add_ps(value, _mm_set1_ps(bias[offset])), params, offset);
uint16_t tmp[F];
_mm_storel_epi64((__m128i*)tmp, _mm_packus_epi32(Float32ToBFloat16(f32), K_ZERO));
for (size_t i = 0; i < tail; ++i)
((uint16_t*)ptr)[i + index * F] = tmp[i];
}
};

template <> struct Term16b<Term16bLast32f>
@@ -142,6 +160,19 @@
for (size_t i = 0; i < tail; ++i)
((float*)dst)[offset + i] = tmp[i];
}

template<SimdConvolutionActivationType type, int index> static SIMD_INLINE void Save(uint8_t* ptr, float* buf, __m128 value, const float* bias, const float* params, size_t offset)
{
_mm_storeu_ps((float*)ptr + index * F, Activate<type>(_mm_add_ps(value, _mm_set1_ps(bias[offset])), params, offset));
}

template<SimdConvolutionActivationType type, int index> static SIMD_INLINE void Save(uint8_t* ptr, float* buf, __m128 value, const float* bias, const float* params, size_t offset, size_t tail)
{
float tmp[F];
_mm_storeu_ps(tmp, Activate<type>(_mm_add_ps(value, _mm_set1_ps(bias[offset])), params, offset));
for (size_t i = 0; i < tail; ++i)
((float*)ptr)[i + index * F] = tmp[i];
}
};

template <> struct Term16b<Term16bInterim>
@@ -179,6 +210,19 @@
template<SimdConvolutionActivationType type> static SIMD_INLINE void Postprocess(const float* src, const float* bias, const float* params, size_t offset, uint8_t* dst, size_t tail)
{
}

template<SimdConvolutionActivationType type, int index> static SIMD_INLINE void Save(uint8_t* ptr, float* buf, __m128 value, const float* bias, const float* params, size_t offset)
{
_mm_storeu_ps(buf + index * F, value);
}

template<SimdConvolutionActivationType type, int index> static SIMD_INLINE void Save(uint8_t* ptr, float* buf, __m128 value, const float* bias, const float* params, size_t offset, size_t tail)
{
float tmp[F];
_mm_storeu_ps(tmp, value);
for (size_t i = 0; i < tail; ++i)
buf[i + index * F] = tmp[i];
}
};

//-------------------------------------------------------------------------------------------------
@@ -260,6 +304,30 @@
Term16b<term>::template Save<SimdConvolutionActivationIdentity, 0>(ptr, buf, val0, bias, NULL);
Term16b<term>::template Save<SimdConvolutionActivationIdentity, 1>(ptr, buf, val1, bias, NULL, tail);
}

//-------------------------------------------------------------------------------------------------

template<Term16bType term, SimdConvolutionActivationType type> SIMD_INLINE void Save1(uint8_t* ptr, float* buf, __m128 val0, const float* bias, const float* params, size_t offset)
{
Term16b<term>::template Save<type, 0>(ptr, buf, val0, bias, params, offset);
}

template<Term16bType term, SimdConvolutionActivationType type> SIMD_INLINE void Save1(uint8_t* ptr, float* buf, __m128 val0, const float* bias, const float* params, size_t offset, size_t tail)
{
Term16b<term>::template Save<type, 0>(ptr, buf, val0, bias, params, offset, tail);
}

template<Term16bType term, SimdConvolutionActivationType type> SIMD_INLINE void Save2(uint8_t* ptr, float* buf, __m128 val0, __m128 val1, const float* bias, const float* params, size_t offset)
{
Term16b<term>::template Save<type, 0>(ptr, buf, val0, bias, params, offset);
Term16b<term>::template Save<type, 1>(ptr, buf, val1, bias, params, offset);
}

template<Term16bType term, SimdConvolutionActivationType type> SIMD_INLINE void Save2(uint8_t* ptr, float* buf, __m128 val0, __m128 val1, const float* bias, const float* params, size_t offset, size_t tail)
{
Term16b<term>::template Save<type, 0>(ptr, buf, val0, bias, params, offset);
Term16b<term>::template Save<type, 1>(ptr, buf, val1, bias, params, offset, tail);
}
}
#endif

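Not part of the diff: a sketch of how a micro-kernel might flush two 4-float accumulators with the new offset-aware Save1/Save2 overloads. Flush8 and its parameter names are hypothetical; dstc is the output-channel index whose bias[dstc]/params[dstc] are broadcast across the four spatial lanes, consistent with the _mm_set1_ps(bias[offset]) broadcast in the added Save methods.

// Hypothetical caller (not from this commit): store up to 8 consecutive NCHW
// outputs of channel dstc held in accumulators d0 and d1; dstS is the number
// of valid spatial positions remaining (1..DF).
template<Term16bType term, SimdConvolutionActivationType type>
SIMD_INLINE void Flush8(uint8_t* dst, float* buf, __m128 d0, __m128 d1,
    const float* bias, const float* params, size_t dstc, size_t dstS)
{
    if (dstS >= DF)
        Save2<term, type>(dst, buf, d0, d1, bias, params, dstc);           // both registers full
    else if (dstS > F)
        Save2<term, type>(dst, buf, d0, d1, bias, params, dstc, dstS - F); // second register partial
    else
        Save1<term, type>(dst, buf, d0, bias, params, dstc, dstS);         // only first register, partial
}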
