Skip to content

Commit

Permalink
+add SSE4.1 optimizations of SynetGridSample2d32fBlZ class.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Jul 25, 2023
1 parent ba61f1a commit 52ddfac
Show file tree
Hide file tree
Showing 10 changed files with 165 additions and 14 deletions.
2 changes: 1 addition & 1 deletion docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ <h5>New features</h5>
<li>Support of SimdCpuInfoRam in function Simd::PrintInfo.</li>
<li>Base implementation of function SimdCpuDesc.</li>
<li>Base implementation of SynetGridSample2dRef class.</li>
<li>Base implementation of SynetGridSample2d32fBlZ class.</li>
<li>Base implementation, SSE4.1 optimizations of SynetGridSample2d32fBlZ class.</li>
</ul>
<h5>Bug fixing</h5>
<ul>
Expand Down
1 change: 1 addition & 0 deletions prj/vs2019/Sse41.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetDeconvolution32f.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetFused.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetGridSample.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetGridSample2d32fBlZ.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetInnerProduct32f.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetInnerProduct8i.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetMergedConvolution32fBf16.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions prj/vs2019/Sse41.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,9 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetGridSample.cpp">
<Filter>Sse41</Filter>
</ClCompile>
<ClCompile Include="..\..\src\Simd\SimdSse41SynetGridSample2d32fBlZ.cpp">
<Filter>Sse41</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Sse41">
Expand Down
1 change: 1 addition & 0 deletions prj/vs2022/Sse41.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetDeconvolution32f.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetFused.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetGridSample.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetGridSample2d32fBlZ.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetInnerProduct32f.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetInnerProduct8i.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetMergedConvolution32fBf16.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions prj/vs2022/Sse41.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,9 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetGridSample.cpp">
<Filter>Sse41</Filter>
</ClCompile>
<ClCompile Include="..\..\src\Simd\SimdSse41SynetGridSample2d32fBlZ.cpp">
<Filter>Sse41</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Sse41">
Expand Down
21 changes: 11 additions & 10 deletions src/Simd/SimdBaseSynetGridSample2d32fBlZ.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,21 @@ namespace Simd

//-------------------------------------------------------------------------------------------------

void BilinearInterp32fBlZ(const float* pad, size_t dstS, int padW, uint32_t* idx, float* dy, float* dx, float* dst)
void BilinearInterp32fBlZ(const float* pad0, size_t dstS, int padW, uint32_t* idx, float* dy, float* dx, float* dst)
{
const float* pad1 = pad0 + padW;
for (size_t d = 0; d < dstS; ++d)
{
int offs = idx[d];
float p00 = pad[offs];
float p01 = pad[offs + 1];
float p10 = pad[offs + padW];
float p11 = pad[offs + padW + 1];
float dy0 = dy[d];
float dy1 = 1.0f - dy0;
float dx0 = dx[d];
float dx1 = 1.0f - dx0;
dst[d] = dy1 * (dx1 * p00 + dx0 * p01) + dy0 * (dx1 * p10 + dx0 * p11);
float p00 = pad0[offs + 0];
float p01 = pad0[offs + 1];
float p10 = pad1[offs + 0];
float p11 = pad1[offs + 1];
float dy1 = dy[d];
float dy0 = 1.0f - dy1;
float dx1 = dx[d];
float dx0 = 1.0f - dx1;
dst[d] = dy0 * (dx0 * p00 + dx1 * p01) + dy1 * (dx0 * p10 + dx1 * p11);
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdSse41SynetGridSample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ namespace Simd
if (!param.Valid())
return NULL;
if(param.Is32fBlZ())
return new Base::SynetGridSample2d32fBlZ(param);
return new Sse41::SynetGridSample2d32fBlZ(param);
else
return new Base::SynetGridSample2dRef(param);
}
Expand Down
134 changes: 134 additions & 0 deletions src/Simd/SimdSse41SynetGridSample2d32fBlZ.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2023 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#include "Simd/SimdSynetGridSample.h"

#include "Simd/SimdLoad.h"
#include "Simd/SimdSet.h"

namespace Simd
{
#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)
namespace Sse41
{
template <SimdBool align> SIMD_INLINE float Denormalize32f(float pos, int dim)
{
if (align)
return float((pos + 1) / 2.0f * (dim - 1));
else
return float(((pos + 1) * dim - 1) / 2.0f);
}

template<SimdBool align> void IndexCoeffs32fBlZ(const float* grd, size_t dstS, int srcH, int srcW, int padW, uint32_t* idx, float* dy, float* dx)
{
size_t dstSF = AlignLo(dstS, F), d = 0;
const __m128 a = SetFloat((srcW - align) / 2.0f, (srcH - align) / 2.0f);
const __m128 b = SetFloat((srcW - 1) / 2.0f, (srcH - 1) / 2.0f);
const __m128i _0 = _mm_setzero_si128();
const __m128i _2 = _mm_set1_epi32(2);
const __m128i _srcH = _mm_set1_epi32(srcH + 2);
const __m128i _srcW = _mm_set1_epi32(srcW + 2);
const __m128i _padW = _mm_set1_epi32(padW);
for (; d < dstSF; d += F)
{
__m128 xy0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(grd + 0), a), b);
__m128 xy1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(grd + F), a), b);
__m128 x = _mm_shuffle_ps(xy0, xy1, 0x88);
__m128 y = _mm_shuffle_ps(xy0, xy1, 0xDD);
__m128 xf = _mm_round_ps(x, _MM_FROUND_FLOOR);
__m128 yf = _mm_round_ps(y, _MM_FROUND_FLOOR);
_mm_storeu_ps(dy + d, _mm_sub_ps(y, yf));
_mm_storeu_ps(dx + d, _mm_sub_ps(x, xf));
__m128i xi = _mm_min_epi32(_mm_max_epi32(_mm_add_epi32(_mm_cvtps_epi32(xf), _2), _0), _srcW);
__m128i yi = _mm_min_epi32(_mm_max_epi32(_mm_add_epi32(_mm_cvtps_epi32(yf), _2), _0), _srcH);
_mm_storeu_si128((__m128i*)(idx + d), _mm_add_epi32(_mm_mullo_epi32(_padW, yi), xi));
grd += 2 * F;
}
for (; d < dstS; ++d)
{
float x = Denormalize32f<align>(grd[0], srcW);
float y = Denormalize32f<align>(grd[1], srcH);
int x0 = int(std::floor(x));
int y0 = int(std::floor(y));
dy[d] = y - float(y0);
dx[d] = x - float(x0);
x0 = Simd::RestrictRange(x0, -2, srcW) + 2;
y0 = Simd::RestrictRange(y0, -2, srcH) + 2;
idx[d] = padW * y0 + x0;
grd += 2;
}
}

//-------------------------------------------------------------------------------------------------

void BilinearInterp32fBlZ(const float* pad0, size_t dstS, int padW, uint32_t* idx, float* dy, float* dx, float* dst)
{
size_t dstSF = AlignLo(dstS, F), d = 0;
const float* pad1 = pad0 + padW;
__m128 p0, p1, _1 = _mm_set1_ps(1.0f);
for (; d < dstSF; d += F)
{
int i0 = idx[d + 0], i1 = idx[d + 1], i2 = idx[d + 2], i3 = idx[d + 3];
p0 = Load(pad0 + i0, pad0 + i1);
p1 = Load(pad0 + i2, pad0 + i3);
__m128 p00 = _mm_shuffle_ps(p0, p1, 0x88);
__m128 p01 = _mm_shuffle_ps(p0, p1, 0xDD);
p0 = Load(pad1 + i0, pad1 + i1);
p1 = Load(pad1 + i2, pad1 + i3);
__m128 p10 = _mm_shuffle_ps(p0, p1, 0x88);
__m128 p11 = _mm_shuffle_ps(p0, p1, 0xDD);
__m128 dy1 = _mm_loadu_ps(dy + d);
__m128 dy0 = _mm_sub_ps(_1, dy1);
__m128 dx1 = _mm_loadu_ps(dx + d);
__m128 dx0 = _mm_sub_ps(_1, dx1);
__m128 d0 = _mm_add_ps(_mm_mul_ps(dx0, p00), _mm_mul_ps(dx1, p01));
__m128 d1 = _mm_add_ps(_mm_mul_ps(dx0, p10), _mm_mul_ps(dx1, p11));
_mm_storeu_ps(dst + d, _mm_add_ps(_mm_mul_ps(dy0, d0), _mm_mul_ps(dy1, d1)));
}
for (; d < dstS; ++d)
{
int offs = idx[d];
float p00 = pad0[offs + 0];
float p01 = pad0[offs + 1];
float p10 = pad1[offs + 0];
float p11 = pad1[offs + 1];
float dy1 = dy[d];
float dy0 = 1.0f - dy1;
float dx1 = dx[d];
float dx0 = 1.0f - dx1;
dst[d] = dy0 * (dx0 * p00 + dx1 * p01) + dy1 * (dx0 * p10 + dx1 * p11);
}
}

//-------------------------------------------------------------------------------------------------

SynetGridSample2d32fBlZ::SynetGridSample2d32fBlZ(const GridSample2dParam& param)
: Base::SynetGridSample2d32fBlZ(param)
{
_indexCoeffs = _param.align ? IndexCoeffs32fBlZ<SimdTrue> : IndexCoeffs32fBlZ<SimdFalse>;
_bilinearInterp = BilinearInterp32fBlZ;
}
}
#endif
}
8 changes: 8 additions & 0 deletions src/Simd/SimdSynetGridSample.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,14 @@ namespace Simd
#ifdef SIMD_SSE41_ENABLE
namespace Sse41
{
class SynetGridSample2d32fBlZ : public Base::SynetGridSample2d32fBlZ
{
public:
SynetGridSample2d32fBlZ(const GridSample2dParam& param);
};

//-------------------------------------------------------------------------------------------------

void* SynetGridSample2dInit(size_t batch, size_t channels, size_t srcH, size_t srcW, size_t dstH, size_t dstW,
SimdTensorDataType type, SimdGridSampleInterpType interp, SimdGridSamplePaddingType padding, SimdBool align);
}
Expand Down
4 changes: 2 additions & 2 deletions src/Test/TestSynetGridSample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,9 @@ namespace Test
bool result = true;

SimdBool t = SimdTrue, f = SimdFalse;
for (int i = 0; i <= SimdGridSampleInterpBicubic; ++i)
for (int i = 0; i < 3; ++i)
{
for (int p = 0; p <= SimdGridSamplePaddingReflect; ++p)
for (int p = 0; p < 3; ++p)
{
result = result && SynetGridSample2dAutoTest<float>(srcShape, grdShape, SimdTensorData32f, (SimdGridSampleInterpType)i, (SimdGridSamplePaddingType)p, f, f1, f2);
result = result && SynetGridSample2dAutoTest<float>(srcShape, grdShape, SimdTensorData32f, (SimdGridSampleInterpType)i, (SimdGridSamplePaddingType)p, t, f1, f2);
Expand Down

0 comments on commit 52ddfac

Please sign in to comment.