Skip to content

Commit

Permalink
*fix bug: Error in Base implementation, SSE4.1, AVX2, AVX-512BW optim…
Browse files Browse the repository at this point in the history
…izations of function Float32ToBFloat16.
  • Loading branch information
ermig1979 committed Jul 16, 2024
1 parent 89783e2 commit 4d745b6
Show file tree
Hide file tree
Showing 15 changed files with 53 additions and 267 deletions.
3 changes: 1 addition & 2 deletions docs/2024.html
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,20 @@ <h5>New features</h5>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function SynetRelu16b.</li>
<li>API of SynetAdd16b framework.</li>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of class SynetAdd16bUniform.</li>
<li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function Float32ToBFloat16NearestEven.</li>
</ul>
<h5>Bug fixing</h5>
<ul>
<li>Error in Base implementation of class SynetMergedConvolution16bCdc.</li>
<li>Error in Base implementation of class SynetMergedConvolution16bDc.</li>
<li>Error in Base implementation of class SynetInnerProduct16bGemmNN.</li>
<li>Error in Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function Float32ToBFloat16.</li>
</ul>

<h4>Test framework</h4>
<h5>New features</h5>
<ul>
<li>Tests for verifying functionality of function SynetRelu16b.</li>
<li>Tests for verifying functionality of SynetAdd16b framework.</li>
<li>Tests for verifying functionality of function Float32ToBFloat16NearestEven.</li>
</ul>

<a href="#HOME">Home</a>
Expand Down
51 changes: 2 additions & 49 deletions docs/help/group__bfloat16.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/Simd/SimdAmxBf16.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ namespace Simd
#ifdef SIMD_AMXBF16_ENABLE
namespace AmxBf16
{
void Float32ToBFloat16NearestEven(const float* src, size_t size, uint16_t* dst);
void Float32ToBFloat16(const float* src, size_t size, uint16_t* dst);

void ChangeColors(const uint8_t* src, size_t srcStride, size_t width, size_t height, const uint8_t* colors, uint8_t* dst, size_t dstStride);

Expand Down
6 changes: 3 additions & 3 deletions src/Simd/SimdAmxBf16BFloat16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,20 @@ namespace Simd
#ifdef SIMD_AMXBF16_ENABLE
namespace AmxBf16
{
void Float32ToBFloat16NearestEven(const float* src, size_t size, uint16_t* dst)
void Float32ToBFloat16(const float* src, size_t size, uint16_t* dst)
{
size_t size32 = AlignLo(size, 32);
__mmask16 srcMask[2];
__mmask32 dstMask[1];
size_t i = 0;
for (; i < size32; i += 32)
Float32ToBFloat16NearestEven<false, false>(src + i, dst + i, srcMask, dstMask);
Float32ToBFloat16<false, false>(src + i, dst + i, srcMask, dstMask);
if (size32 < size)
{
srcMask[0] = TailMask16(size - size32 - F * 0);
srcMask[1] = TailMask16(size - size32 - F * 1);
dstMask[0] = TailMask32(size - size32);
Float32ToBFloat16NearestEven<false, true>(src + i, dst + i, srcMask, dstMask);
Float32ToBFloat16<false, true>(src + i, dst + i, srcMask, dstMask);
}
}
}
Expand Down
29 changes: 0 additions & 29 deletions src/Simd/SimdAvx2BFloat16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,35 +61,6 @@ namespace Simd

//---------------------------------------------------------------------------------------------

// Converts `size` 32-bit floats read from `src` into 16-bit values written to `dst`,
// applying the Float32ToBFloat16NearestEven conversion to every element.
// The input is consumed in progressively narrower steps (16, 8, 4 elements, then one
// element at a time) so that any `size` is handled. Unaligned loads/stores are used throughout.
void Float32ToBFloat16NearestEven(const float* src, size_t size, uint16_t* dst)
{
// Loop bounds for each step width; AlignLo presumably rounds `size` down to a multiple
// of its second argument (helper is defined outside this hunk - confirm).
size_t size16 = Simd::AlignLo(size, 16);
size_t size8 = Simd::AlignLo(size, 8);
size_t size4 = Simd::AlignLo(size, 4);
size_t i = 0;
// Main loop: 16 floats per iteration using two 256-bit vectors.
for (; i < size16; i += 16)
{
__m256i d0 = Float32ToBFloat16NearestEven(_mm256_loadu_ps(src + i + 0));
__m256i d1 = Float32ToBFloat16NearestEven(_mm256_loadu_ps(src + i + 8));
// _mm256_packus_epi32 packs within each 128-bit lane, which interleaves the halves of d0 and d1;
// the 0xD8 permute (64-bit blocks 0,2,1,3) puts the 16 results back in source order.
_mm256_storeu_si256((__m256i*)(dst + i), _mm256_permute4x64_epi64(_mm256_packus_epi32(d0, d1), 0xD8));
}
// At most one iteration: 8 floats via two 128-bit SSE4.1 conversions (no lane fix-up needed).
for (; i < size8; i += 8)
{
__m128i d0 = Sse41::Float32ToBFloat16NearestEven(_mm_loadu_ps(src + i + 0));
__m128i d1 = Sse41::Float32ToBFloat16NearestEven(_mm_loadu_ps(src + i + 4));
_mm_storeu_si128((__m128i*)(dst + i), _mm_packus_epi32(d0, d1));
}
// At most one iteration: 4 floats; only the low 64 bits (4 x uint16_t) of the packed result are stored,
// so the upper half packed from Sse41::K_ZERO never reaches memory.
for (; i < size4; i += 4)
{
__m128i d0 = Sse41::Float32ToBFloat16NearestEven(_mm_loadu_ps(src + i + 0));
_mm_storel_epi64((__m128i*)(dst + i), _mm_packus_epi32(d0, Sse41::K_ZERO));
}
// Scalar tail: the remaining (up to 3) elements go through the Base implementation.
for (; i < size; ++i)
dst[i] = Base::Float32ToBFloat16NearestEven(src[i]);
}

//---------------------------------------------------------------------------------------------

void BFloat16ToFloat32(const uint16_t* src, size_t size, float* dst)
{
size_t size16 = Simd::AlignLo(size, 16);
Expand Down
19 changes: 0 additions & 19 deletions src/Simd/SimdAvx512bwBFloat16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,25 +49,6 @@ namespace Simd

//---------------------------------------------------------------------------------------------

// Converts `size` 32-bit floats read from `src` into 16-bit values written to `dst`,
// 32 elements per step, with one masked step for a trailing partial group.
// The per-group work is delegated to the Float32ToBFloat16NearestEven<align, mask> template
// (template parameter names are assumed from the sibling BFloat16ToFloat32 template - confirm).
void Float32ToBFloat16NearestEven(const float* src, size_t size, uint16_t* dst)
{
// Bound of the full-group loop; AlignLo presumably rounds `size` down to a multiple of 32 (confirm).
size_t size32 = AlignLo(size, 32);
// Masks are left uninitialized here: they are only assigned before the masked tail call below.
// Presumably the <false, false> instantiation ignores them - TODO confirm in the template body.
__mmask16 srcMask[2];
__mmask32 dstMask[1];
size_t i = 0;
// Full groups of 32 elements, second template argument false.
for (; i < size32; i += 32)
Float32ToBFloat16NearestEven<false, false>(src + i, dst + i, srcMask, dstMask);
// Trailing 1..31 elements: one extra call with per-half source masks and a single destination mask.
if (size32 < size)
{
// F is presumably the number of floats in one 512-bit register (16) - confirm.
// NOTE(review): when fewer than F elements remain, `size - size32 - F * 1` wraps around as an
// unsigned value; confirm TailMask16 takes a signed argument / yields an empty mask in that case.
srcMask[0] = TailMask16(size - size32 - F * 0);
srcMask[1] = TailMask16(size - size32 - F * 1);
dstMask[0] = TailMask32(size - size32);
Float32ToBFloat16NearestEven<false, true>(src + i, dst + i, srcMask, dstMask);
}
}

//---------------------------------------------------------------------------------------------

template<bool align, bool mask> SIMD_INLINE void BFloat16ToFloat32(const uint16_t* src, float* dst, __mmask32 srcMask[1], __mmask16 dstMask[2])
{
__m512i _src = Load<align, mask>(src, srcMask[0]);
Expand Down
Loading

0 comments on commit 4d745b6

Please sign in to comment.